# CART works best for classification data with continuous features

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [80]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic two-class problem: 200 samples, two features, both informative
# (no redundant or repeated columns). Seeded so the data is reproducible.
x, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    random_state=2,
)

# Keep 40% of the samples aside for evaluation; the split is seeded as well.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=2
)

In [81]:
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_k ** 2)``, where ``p_k`` is the fraction of
    entries in ``y`` that carry class ``k``.

    Args:
        y: 1-D array-like of class labels.

    Returns:
        The impurity as a float. ``0.0`` when every label is the same and
        also when ``y`` is empty.
    """
    n_samples = len(y)
    if n_samples == 0:
        # Without this guard the sum over zero classes is 0 and the result
        # would be 1.0, i.e. an empty partition would look maximally impure.
        return 0.0
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / n_samples
    return 1 - np.sum(probabilities**2)
def ginigain(S,y1,y2,y):
    """Return the impurity reduction achieved by splitting ``y`` into two parts.

    Args:
        S: Impurity of the parent label set ``y`` (as returned by ``gini``).
        y1: Labels that fall into the first child.
        y2: Labels that fall into the second child.
        y: Labels of the parent; only its length is used, as the weight base.

    Returns:
        ``S`` minus the size-weighted sum of the two child impurities.
    """
    # Each child's impurity is weighted by its share of the parent's samples.
    weighted_first = gini(y1) * (len(y1) / len(y))
    weighted_second = gini(y2) * (len(y2) / len(y))
    return S - np.sum([weighted_first, weighted_second])
class TreeNode:
    """A single node of the binary decision tree.

    Leaves carry a class label in ``value``; internal nodes carry a split
    description (``fea``, ``thres``) and their two children.
    """

    def __init__(self, val=None, left=None, right=None, fea=None, thres=None) -> None:
        # Split description: feature index and the threshold compared against.
        self.fea, self.thres = fea, thres
        # Sub-trees followed on either side of the threshold.
        self.left, self.right = left, right
        # Predicted label; stays None on nodes that only route samples.
        self.value = val
class DTree:
    """Binary decision-tree classifier grown greedily with Gini gain.

    Every internal node tests a single feature against a threshold: samples
    with ``x[fea] <= thres`` go to the left child, the others to the right.
    Growth stops when a node is pure, when the depth limit is reached, or
    when no threshold separates the remaining samples.

    Call ``fit`` before ``predict``.
    """

    def __init__(self, maxDepth):
        """Create an unfitted tree.

        Args:
            maxDepth: Depth at which a node is turned into a leaf regardless
                of its purity (the root is at depth 0).
        """
        self.root = None
        self.max = maxDepth

    def fit(self, x, y):
        """Grow the tree from training data.

        Args:
            x: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like with one class label per row of ``x``.
        """
        self.root = self._growTree(np.asarray(x), np.asarray(y), self.max, 0)

    def _growTree(self, x, y, max, depth):
        """Recursively build the subtree for the samples ``(x, y)``.

        Args:
            x: 2-D array with the samples that reached this node.
            y: 1-D array with their labels.
            max: Depth limit (name kept for compatibility with existing
                positional/keyword callers, although it shadows the builtin).
            depth: Depth of the node being built.

        Returns:
            A ``TreeNode``, or ``None`` when ``y`` is empty.
        """
        if len(y) == 0:
            return None

        classes, counts = np.unique(y, return_counts=True)
        majority = classes[counts.argmax()]
        if depth == max or len(classes) == 1:
            return TreeNode(majority)

        fea, thres = self._bestSplit(x, y)
        if fea is None:
            # Mixed labels but identical feature rows: nothing can separate
            # them. Fall back to a leaf instead of creating an empty child,
            # which would make prediction fail on a missing node.
            return TreeNode(majority)

        x_left, y_left = self._leftchildValue(x, y, thres, fea)
        x_right, y_right = self._rightchildValue(x, y, thres, fea)
        leftchild = self._growTree(x_left, y_left, max, depth + 1)
        rightchild = self._growTree(x_right, y_right, max, depth + 1)
        return TreeNode(None, leftchild, rightchild, fea, thres)

    def _bestSplit(self, x, y):
        """Find the (feature, threshold) pair with the highest Gini gain.

        Candidate thresholds are the midpoints between consecutive *sorted
        distinct* values of each feature, so every candidate lies between two
        observed values and each possible partition is tried exactly once.
        Ties keep the first candidate found (lowest feature index, then
        lowest threshold).

        Returns:
            ``(fea, thres)``, or ``(None, None)`` when no threshold puts at
            least one sample on each side.
        """
        parent_impurity = gini(y)  # invariant across candidates; compute once
        best_gain, best_fea, best_thres = -np.inf, None, None
        for fea in range(x.shape[1]):
            column = x[:, fea]
            values = np.unique(column)  # sorted, duplicates removed
            for thres in (values[:-1] + values[1:]) / 2:
                goes_left = column <= thres
                goes_right = column > thres
                if not goes_left.any() or not goes_right.any():
                    # Floating-point rounding can push a midpoint onto one of
                    # its neighbours; such a cut separates nothing.
                    continue
                gain = ginigain(S=parent_impurity, y1=y[goes_left], y2=y[goes_right], y=y)
                if gain > best_gain:
                    best_gain, best_fea, best_thres = gain, fea, thres
        return best_fea, best_thres

    @staticmethod
    def _leftchildValue(x, y, thres, fea):
        """Return the rows of ``x`` (and matching ``y``) with ``x[:, fea] <= thres``."""
        mask = x[:, fea] <= thres
        return x[mask], y[mask]

    @staticmethod
    def _rightchildValue(x, y, thres, fea):
        """Return the rows of ``x`` (and matching ``y``) with ``x[:, fea] > thres``."""
        mask = x[:, fea] > thres
        return x[mask], y[mask]

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        while node.value is None:
            node = node.left if x[node.fea] <= node.thres else node.right
        return node.value

In [82]:
# Train the hand-written tree (nodes at depth 4 become leaves) on the training
# split, then predict labels for the held-out split. `y_pred` is consumed by
# the metrics cell that follows.
dt = DTree(maxDepth=4)
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)

In [83]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
)

# Score the hand-written tree's predictions against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report: headline numbers first, then the confusion matrix and the
# per-class breakdown.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{cm}")
print(classification_report(y_test, y_pred))

Accuracy: 93.75%
Precision: 0.92
Recall: 0.94
F1 Score: 0.93
Confusion Matrix:
[[42  3]
 [ 2 33]]
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        45
           1       0.92      0.94      0.93        35

    accuracy                           0.94        80
   macro avg       0.94      0.94      0.94        80
weighted avg       0.94      0.94      0.94        80



In [84]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Search space for the reference scikit-learn tree. Only depth and split
# criterion are tuned; the remaining knobs are kept here, disabled, for
# later experiments.
params = {
    'max_depth': [2, 4, 5, 7, 9, 11, 13],
    'criterion': ['entropy', 'gini'],
    # 'min_samples_split': [2, 5, 10, 20],
    # 'min_samples_leaf': [1, 2, 4, 10],
    # 'max_features': ['sqrt', 'log2', None],
    # 'splitter': ['best', 'random'],
    # 'max_leaf_nodes': [None, 10, 20, 30, 50],
    # 'class_weight': [None, 'balanced']
}

# 5-fold cross-validated grid search using all available cores.
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=2),
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
clf.fit(x_train, y_train)
y_pred = clf.best_estimator_.predict(x_test)
# NOTE: best_params_ holds every tuned parameter (criterion and max_depth),
# not just the depth the label mentions.
print("Best max depth: ", clf.best_params_)

# Same metrics as for the hand-written tree, so the two can be compared.
# precision_score, recall_score, f1_score, confusion_matrix and
# classification_report come from the earlier metrics cell's import.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{cm}")
print(classification_report(y_test, y_pred))


Best max depth:  {'criterion': 'gini', 'max_depth': 4}
Accuracy: 95.00%
Precision: 0.92
Recall: 0.97
F1 Score: 0.94
Confusion Matrix:
[[42  3]
 [ 1 34]]
              precision    recall  f1-score   support

           0       0.98      0.93      0.95        45
           1       0.92      0.97      0.94        35

    accuracy                           0.95        80
   macro avg       0.95      0.95      0.95        80
weighted avg       0.95      0.95      0.95        80

