In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Load the training table; 'train.csv' is resolved relative to the kernel's working directory.
train = pd.read_csv('train.csv')

# NODE CLASS

In [8]:
class Node:
    """A single node of a binary decision tree.

    A decision node carries the split description (``feature_index``,
    ``threshold``, ``info_gain``) together with its ``left`` and ``right``
    children. A leaf node carries only ``value``. Every field defaults to
    ``None``.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None,
                 info_gain=None, value=None):
        # Fields used by decision nodes.
        self.feature_index, self.threshold = feature_index, threshold
        self.left, self.right = left, right
        self.info_gain = info_gain

        # Field used by leaf nodes.
        self.value = value

# TREE CLASS

In [9]:
class Decision_tree():
    """Binary decision-tree classifier grown by greedy, impurity-based splitting.

    Every decision node tests ``x[feature_index] <= threshold``: rows that pass
    go to the left child, the others to the right child. A node becomes a leaf
    (holding the most frequent label of its rows) when it has fewer than
    ``min_samples_split`` rows, when ``max_depth`` is reached, or when no
    candidate split has a positive information gain.
    """

    def __init__(self,min_samples_split=2,max_depth=2):
        """Store the stopping conditions; the tree itself is built by ``fit``.

        Args:
            min_samples_split: minimum number of rows a node needs to be split.
            max_depth: maximum number of decision levels below the root.
        """
        # Root of the fitted tree; stays None until fit() is called.
        self.root=None

        # Stopping conditions.
        self.min_samples_split=min_samples_split
        self.max_depth=max_depth

    def build_tree(self,dataset,curr_depth=0):
        """Recursively grow a subtree and return its root ``Node``.

        Args:
            dataset: 2-D array whose last column is the label and whose other
                columns are the features.
            curr_depth: depth of the node being built (0 for the root).

        Returns:
            A decision ``Node`` when a split with positive gain exists and the
            stopping conditions allow it, otherwise a leaf ``Node``.
        """
        X,Y=dataset[:,:-1],dataset[:,-1]
        nb_samples,nb_features=np.shape(X)

        # Split only while the stopping conditions are not met.
        if nb_samples>=self.min_samples_split and curr_depth<self.max_depth:
            best_split=self.get_best_split(dataset,nb_samples,nb_features)
            # An empty dict means no threshold separated the rows at all.
            if best_split and best_split["info_gain"]>0:
                left_subtree=self.build_tree(best_split["dataset_left"],curr_depth+1)
                right_subtree=self.build_tree(best_split["dataset_right"],curr_depth+1)
                return Node(best_split["feature_index"],best_split["threshold"],
                            left_subtree,right_subtree,best_split["info_gain"])

        # Every other path ends in a leaf. This must sit outside the `if`
        # above, otherwise hitting a stopping condition would return None.
        leaf_value=self.calculate_leaf_value(Y)
        return Node(value=leaf_value)

    def get_best_split(self,dataset,nb_samples,nb_features):
        """Search every (feature, threshold) pair for the highest Gini gain.

        Args:
            dataset: 2-D array, label in the last column.
            nb_samples: number of rows (unused; kept for interface compatibility).
            nb_features: number of feature columns to scan.

        Returns:
            A dict with keys ``feature_index``, ``threshold``, ``dataset_left``,
            ``dataset_right`` and ``info_gain`` describing the best split, or an
            empty dict when no threshold leaves rows on both sides.
        """
        best_split={}
        max_info_gain=-float("inf")
        # Parent labels do not change across candidates.
        y=dataset[:,-1]

        for feature_index in range(nb_features):
            # Candidate thresholds are the distinct values present in the data.
            possible_thresholds=np.unique(dataset[:,feature_index])
            for threshold in possible_thresholds:
                dataset_left,dataset_right=self.split(dataset,feature_index,threshold)
                # A split that leaves one side empty separates nothing.
                if len(dataset_left)==0 or len(dataset_right)==0:
                    continue
                curr_info_gain=self.information_gain(
                    y,dataset_left[:,-1],dataset_right[:,-1],"gini")
                # Strict '>' keeps the first of several equally good splits.
                if curr_info_gain>max_info_gain:
                    best_split={
                        "feature_index":feature_index,
                        "threshold":threshold,
                        "dataset_left":dataset_left,
                        "dataset_right":dataset_right,
                        "info_gain":curr_info_gain,
                    }
                    max_info_gain=curr_info_gain
        return best_split

    def split(self,dataset,feature_index,threshold):
        """Partition rows by ``row[feature_index] <= threshold``.

        Returns:
            ``(dataset_left, dataset_right)``: rows whose feature value is
            ``<= threshold`` and rows whose value is ``> threshold``.
        """
        dataset=np.asarray(dataset)
        column=dataset[:,feature_index]
        # Two explicit comparisons (not `~mask`) so a value that satisfies
        # neither, such as NaN, lands on neither side, as before.
        dataset_left=dataset[column<=threshold]
        dataset_right=dataset[column>threshold]
        return dataset_left,dataset_right

    def information_gain(self,parent,l_split,r_split,mode="entropy"):
        """Return parent impurity minus the size-weighted impurity of the children.

        Args:
            parent: labels before the split.
            l_split: labels of the left child.
            r_split: labels of the right child.
            mode: ``"gini"`` selects the Gini index; any other value selects entropy.
        """
        impurity=self.gini_index if mode=="gini" else self.entropy
        weight_l=len(l_split)/len(parent)
        weight_r=len(r_split)/len(parent)
        return impurity(parent)-(weight_l*impurity(l_split)+weight_r*impurity(r_split))

    def entropy(self,y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        _,counts=np.unique(np.asarray(y),return_counts=True)
        probabilities=counts/len(y)
        return -np.sum(probabilities*np.log2(probabilities))

    def gini_index(self,y):
        """Return the Gini impurity ``1 - sum(p_cls**2)`` of the labels in ``y``."""
        _,counts=np.unique(np.asarray(y),return_counts=True)
        probabilities=counts/len(y)
        return 1-np.sum(probabilities**2)

    def calculate_leaf_value(self,Y):
        """Return the most frequent label in ``Y``.

        Ties go to the label that appears first in ``Y``.

        Raises:
            ValueError: if ``Y`` is empty.
        """
        # Single counting pass; dicts keep insertion order, so max() resolves
        # ties in favour of the first label seen.
        counts={}
        for label in Y:
            counts[label]=counts.get(label,0)+1
        return max(counts,key=counts.get)

    def print_tree(self,tree=None,indent=" "):
        """Print the subtree rooted at ``tree`` (the fitted root when omitted).

        Leaves print their value; decision nodes print
        ``X_<feature> <= <threshold> ? <info_gain>`` followed by their children.
        """
        if tree is None:
            tree=self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print("X_"+str(tree.feature_index),"<=", tree.threshold, "?",tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent+indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def fit(self, X, Y):
        """Build the tree from features ``X`` and labels ``Y``.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            Y: labels, either shape (n_samples, 1) or a flat (n_samples,) vector.

        Raises:
            ValueError: if ``X`` contains no rows.
        """
        X=np.asarray(X)
        Y=np.asarray(Y)
        if len(X)==0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        # Labels must be a column so they can be appended to X.
        if Y.ndim==1:
            Y=Y.reshape(-1,1)
        dataset=np.concatenate((X, Y), axis=1)
        self.root=self.build_tree(dataset)

    def predict(self,X):
        """Return a list with the predicted label for each row of ``X``."""
        return [self.make_prediction(x, self.root) for x in X]

    def make_prediction(self,x,tree):
        """Walk from ``tree`` down to a leaf following ``x`` and return the leaf value."""
        # Identity check: a leaf is any node whose value has been set.
        if tree.value is not None:
            return tree.value
        feature_val=x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x,tree.left)
        return self.make_prediction(x, tree.right)

In [10]:
# Features are every column except the last; the last column is the label.
X2 = train.iloc[:, :-1].values
# Labels are reshaped into a single column (n_samples, 1).
Y2 = train.iloc[:, -1].values.reshape(-1, 1)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X2_train, X2_test, Y2_train, Y2_test = train_test_split(
    X2, Y2, test_size=.2, random_state=41
)

In [11]:
# Fit a tree that stops splitting below 10 rows per node or at depth 12.
tree_classifier = Decision_tree(min_samples_split=10, max_depth=12)
tree_classifier.fit(X2_train, Y2_train)

# Optional inspection calls, left disabled:
# tree_classifier.print_tree()
# tree_classifier.predict(X2_test)
# print(tree_classifier.root)

<__main__.Node object at 0x0000025867B1A8E0>
