# ID3 decision tree for categorical input features

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Synthetic two-class problem: 100 samples, 3 features, all of them informative.
x, y = make_classification(
    n_samples=100,
    n_features=3,
    n_informative=3,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,
    random_state=2,
)

# Turn every continuous feature into an integer bin index using the cut
# points -0.5 and 0.5, so the tree below sees discrete category codes.
x = np.digitize(x, bins=[-0.5, 0.5])

# Hold out 40% of the samples for evaluation; fixed seed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=2
)

In [54]:
def entropy(y):
    """Return the base-2 Shannon entropy of the labels in ``y``.

    The relative frequency of every distinct label is computed with
    ``np.unique`` and the usual ``-sum(p * log2(p))`` is returned as a
    NumPy float. A collection holding a single distinct label yields zero.
    """
    label_counts = np.unique(y, return_counts=True)[1]
    p = label_counts / len(y)
    return -(p * np.log2(p)).sum()
def infogain(S,y1,y2,y):
    """Return the information gain of splitting ``y`` into ``y1`` and ``y2``.

    ``S`` is the entropy of the unsplit labels ``y`` (supplied by the caller).
    The gain is ``S`` minus the entropies of the two parts, each weighted by
    the fraction of ``y`` that falls into that part.
    """
    total = len(y)
    weighted_parts = [entropy(part) * (len(part) / total) for part in (y1, y2)]
    return S - np.sum(weighted_parts)
class TreeNode:
    """One node of the decision tree.

    A node whose ``value`` is not ``None`` is treated as a leaf by
    ``DTree._traverse_tree`` and ``value`` is the predicted label. Otherwise
    the node describes a split: ``fea`` is the feature index, ``thres`` the
    value it is compared against, and ``left`` / ``right`` are the child
    nodes.
    """

    def __init__(self,val=None,left=None,right=None,fea=None,thres=None) -> None:
        # Split description (unused on leaves).
        self.fea, self.thres = fea, thres
        # Child nodes (both None on leaves).
        self.left, self.right = left, right
        # Predicted label; None marks an internal node.
        self.value = val
class DTree:
    """Binary decision tree over categorical (discrete-coded) features.

    Every internal node tests one feature for equality with one category
    value: samples whose feature equals that value go to the left child, all
    other samples go to the right child. The (feature, value) pair is chosen
    by maximising ``infogain``; growth stops when ``maxDepth`` is reached,
    when a node holds a single class, or when no feature has more than one
    distinct value left.

    ``x`` is indexed as ``x[:, feature]`` and ``y`` with boolean masks, so
    both must be NumPy arrays (2-D samples-by-features and 1-D labels).
    """

    def __init__(self,maxDepth):
        """Create an unfitted tree limited to ``maxDepth`` levels of splits."""
        self.root = None
        self.max = maxDepth

    def fit(self, x, y):
        """Grow the tree from training samples ``x`` and labels ``y``.

        The result is stored in ``self.root``. Fitting on empty ``y`` leaves
        ``self.root`` as ``None``.
        """
        self.root = self._growTree(x, y, self.max, 0)

    @staticmethod
    def _majorityLabel(y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        return labels[counts.argmax()]

    def _growTree(self, x, y, maxdp, depth):
        """Recursively build the subtree for samples ``x`` / labels ``y``.

        Returns ``None`` for an empty node, a leaf ``TreeNode`` carrying the
        majority label when a stopping rule fires, or an internal
        ``TreeNode`` otherwise.
        """
        if len(y) == 0:
            return None
        # Depth limit reached or node already pure: emit a leaf.
        if depth == maxdp or len(set(y)) == 1:
            return TreeNode(self._majorityLabel(y))

        # The parent entropy is the same for every candidate split.
        parent_entropy = entropy(y)
        best_gain = -np.inf
        fea = thres = None
        for col_idx, column in enumerate(x.T):
            categories = np.unique(column)
            # A constant column cannot separate the samples.
            if len(categories) == 1:
                continue
            for category in categories:
                matches = column == category
                gain = infogain(
                    S=parent_entropy, y1=y[matches], y2=y[~matches], y=y
                )
                # Strict '>' keeps the first best candidate, scanning
                # features in order and values in sorted order.
                if gain > best_gain:
                    best_gain, fea, thres = gain, col_idx, category

        # Every feature is constant here: nothing left to split on.
        if fea is None:
            return TreeNode(self._majorityLabel(y))

        left_x, left_y = self._leftchildValue(x, y, thres, fea)
        right_x, right_y = self._rightchildValue(x, y, thres, fea)
        leftchild = self._growTree(left_x, left_y, maxdp, depth + 1)
        rightchild = self._growTree(right_x, right_y, maxdp, depth + 1)
        return TreeNode(None, leftchild, rightchild, fea, thres)

    @staticmethod
    def _leftchildValue(x,y,thres,fea):
        """Return ``(x, y)`` restricted to rows where feature ``fea`` equals ``thres``."""
        mask = x[:, fea] == thres
        return x[mask], y[mask]

    @staticmethod
    def _rightchildValue(x,y,thres,fea):
        """Return ``(x, y)`` restricted to rows where feature ``fea`` differs from ``thres``."""
        mask = x[:, fea] != thres
        return x[mask], y[mask]

    def predict(self, X):
        """Return a NumPy array with one predicted label per sample in ``X``.

        Each sample must be indexable by feature index. The tree must have
        been fitted on non-empty data first; otherwise ``self.root`` is
        ``None`` and traversal fails.
        """
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        # A non-None value marks a leaf.
        if node.value is not None:
            return node.value
        # Same rule as during training: equal goes left, anything else right.
        if x[node.fea] == node.thres:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)
        

In [55]:
# Train the hand-written tree with a depth limit of 4 on the training split,
# then predict labels for the held-out samples. `y_pred` is read by the
# evaluation cell that follows.
dt = DTree(maxDepth=4)
dt.fit(x_train,y_train)
y_pred = dt.predict(x_test)


uniq- [[(0, 0), (0, 1), (0, 2)], [(1, 0), (1, 1), (1, 2)], [(2, 0), (2, 1), (2, 2)]]
uniq- [[(0, 0), (0, 1), (0, 2)], [(1, 0), (1, 1), (1, 2)], [(2, 0), (2, 1)]]
uniq- [[(0, 0), (0, 1), (0, 2)], [(1, 0), (1, 1), (1, 2)]]
uniq- [[(0, 0), (0, 1), (0, 2)]]
uniq- [[(0, 0), (0, 1)], [(1, 0), (1, 1), (1, 2)]]
uniq- [[(0, 0), (0, 1)]]
uniq- [[(1, 0), (1, 1)]]
[0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0
 1 0 1]
[0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 1 0
 1 0 1]


In [56]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Score the current `y_pred` against the held-out labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report: headline accuracy, per-metric scores, confusion matrix, full table.
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{cm}")
print(classification_report(y_test, y_pred))

Accuracy: 90.00%
Precision: 0.88
Recall: 0.88
F1 Score: 0.88
Confusion Matrix:
[[22  2]
 [ 2 14]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        24
           1       0.88      0.88      0.88        16

    accuracy                           0.90        40
   macro avg       0.90      0.90      0.90        40
weighted avg       0.90      0.90      0.90        40



In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Reference model: scikit-learn's decision tree with the entropy criterion,
# with the depth chosen by a cross-validated grid search (cv=5).
params = {
    'max_depth': [2, 4, 5, 7, 9, 11, 13],
    'criterion': ['entropy'],
    # Further hyper-parameters that could be searched (currently disabled):
    #'min_samples_split': [2, 5, 10, 20],
    #'min_samples_leaf': [1, 2, 4, 10],
    #'max_features': ['sqrt', 'log2', None],
    #'splitter': ['best', 'random'],
    #'max_leaf_nodes': [None, 10, 20, 30, 50],
    #'class_weight': [None, 'balanced']
}
clf = GridSearchCV(
    DecisionTreeClassifier(random_state=2),
    param_grid=params,
    cv=5,
    n_jobs=-1,
)
clf.fit(x_train, y_train)
print("Best max depth: ",clf.best_params_)

# Evaluate the best estimator found by the search on the held-out split.
y_pred = clf.best_estimator_.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Confusion Matrix:\n{cm}")
print(classification_report(y_test, y_pred))

Best max depth:  {'criterion': 'entropy', 'max_depth': 4}
Accuracy: 90.00%
Precision: 0.88
Recall: 0.88
F1 Score: 0.88
Confusion Matrix:
[[22  2]
 [ 2 14]]
              precision    recall  f1-score   support

           0       0.92      0.92      0.92        24
           1       0.88      0.88      0.88        16

    accuracy                           0.90        40
   macro avg       0.90      0.90      0.90        40
weighted avg       0.90      0.90      0.90        40

