In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from binarytree import tree,Node
import copy

In [2]:
df=pd.read_csv("train.csv")

In [3]:
X=df.drop(['left'],axis=1)
Y=df['left']

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
X_train = pd.concat([X_train,Y_train],axis=1)
eps = np.finfo(float).eps
numAttr=['last_evaluation','satisfaction_level','average_montly_hours','time_spend_company','number_project']

In [5]:
def find_overall_gini(X_train):
    output=X_train.keys()[-1]
    target_value_list=X_train[output].unique()
    gini=2
    for value in target_value_list:
        frac = float(X_train[output].value_counts()[value])/(len(X_train[output])+eps)
        gini*=frac
    return abs(gini)

In [6]:
def find_attr_gini(X_train,attr):
    
    output=X_train.keys()[-1]
    target_value_list=X_train[output].unique()
    current_value_list=X_train[attr].unique()
    final_gini=0
    den=1
    for cur_value in current_value_list:
        gini=2
        for value in target_value_list:
            num = len(X_train[attr][X_train[attr] == cur_value][X_train[output] == value])
            den = len(X_train[attr][X_train[attr] == cur_value])
            frac=num/(den+eps)
            gini*=frac
        frac2=float(den)/len(X_train)
        final_gini+=frac2*gini
    return abs(final_gini)

In [7]:
def get_subdataframeNum1(X_train,node,value):
    return X_train[X_train[node] < value].reset_index(drop=True)

In [8]:
def get_subdataframeNum2(X_train,node,value):
    return X_train[X_train[node] >= value].reset_index(drop=True)

In [9]:
storemidpoint={}
storemidpoint.clear()
def split_calculation(X_train):
    gini1=find_overall_gini(X_train)
    for feature in numAttr:
        IGMax=0
        midmax=0
        uniqueVal = X_train[feature].unique()
        for mid in uniqueVal:
            
            subtable1=get_subdataframeNum1(X_train,feature,mid)
            subtable2=get_subdataframeNum2(X_train,feature,mid)
            firstgini=0
            secondgini=0
            if subtable1.empty:
                firstgini=0
            else:
                frac1=float(len(subtable1))/len(X_train)
                firstgini = frac1*find_overall_gini(subtable1)
            if subtable2.empty:
                secondgini=0
            else:
                frac2=float(len(subtable2))/len(X_train)
                secondgini = frac2*find_overall_gini(subtable2)

            gini2=firstgini+secondgini
            ig=abs(gini1)-abs(gini2)
            if ig > IGMax:
                IGMax=ig
                midmax=mid
        _tuple=(IGMax,midmax)
        storemidpoint[feature]=_tuple
    return storemidpoint

In [10]:
def find_best_node(X_train):
    information_gain={}
    gini_1=find_overall_gini(X_train)
    for attr in X_train.keys()[:-1]:
        if attr not in numAttr:
            information_gain[attr]=((gini_1-find_attr_gini(X_train,attr)),0)
    return information_gain

In [11]:
def get_subdataframe(X_train,node,value):
    return X_train[X_train[node] == value].reset_index(drop=True)

In [12]:
class nodestruct:
    def __init__(self, value):
        self.value=value
        self.positive=0
        self.negative=0
        self.child=[]
        self.midpoint=0
        self.values=[]

In [13]:
def build_tree(X_train):
    
    if len(X_train)==1:
        return None
    
    output=X_train.keys()[-1]
    outputValue,counts = np.unique(X_train[output],return_counts=True)                        
    
    
    if len(counts)<=1:
        root=nodestruct(outputValue[0])
        if outputValue[0]==0:
            root.negative = counts[0]
        else:
            root.positive = counts[0]
        return root
    
    else:
        
        numericsplit = split_calculation(X_train)
        categorysplit = find_best_node(X_train)
        numericsplit.update(categorysplit)
        wonAttr=""
        IGMAX=0
        Split_point=0
        
        for key,tuple_ in numericsplit.items():
            if IGMAX < tuple_[0]:
                IGMAX = tuple_[0]
                Split_point = tuple_[1]
                wonAttr = key
                
        categorysplit.clear()
        numericsplit.clear()
        if wonAttr=="":
            return
        root = nodestruct(wonAttr)
        root.midpoint = Split_point
        root.negative = counts[0]
        root.positive = counts[1]
        
        if wonAttr in numAttr:
            subtable0 = get_subdataframeNum1(X_train,wonAttr,Split_point)
            subtable1 = get_subdataframeNum2(X_train,wonAttr,Split_point)
            root.child.append(build_tree(subtable0))
            root.child.append(build_tree(subtable1))
            
        else:
            for j in X_train[wonAttr].unique():
                subtable0 = get_subdataframe(X_train,wonAttr,j)
                subtable0 = subtable0.drop([wonAttr],axis=1)
                root.values.append(j)
                root.child.append(build_tree(subtable0))
    
        return root

In [14]:
rootNode=build_tree(X_train)
# print rootNode.value

In [15]:
# print storemidpoint

In [16]:
def testing(X_test,root):
    
    if len(root.child)==0:
        y_pred.append(root.value)
        return
    i=root.value
    flag=0
    if i in numAttr:
        if X_test[i]<root.midpoint and root.child[0]!=None:
            flag=1
            testing(X_test,root.child[0])
        elif root.child[1]!=None:
            flag=1
            testing(X_test,root.child[1])
        if flag==0:
            if root.positive>root.negative:
                y_pred.append(1)
            else:
                y_pred.append(0)
            return
    else:
        k=0
        for j in root.values:
            if X_test[i]==j:
                if root.child[k]!=None:
                    flag=1
                    testing(X_test,root.child[k])
                    break
                else:
                    if root.positive>root.negative:
                        y_pred.append(1)
                    else:
                        y_pred.append(0)
                    return
            k=k+1
        if flag==0:
            if root.positive>root.negative:
                y_pred.append(1)
            else:
                y_pred.append(0)
            return

In [17]:
y_pred=[]

# X_test1 = pd.read_csv("sample_test.csv")

for index,row in X_test.iterrows():
    testing(row,rootNode)
    
# print y_pred

print confusion_matrix(Y_test,y_pred)
print classification_report(Y_test,y_pred)
print accuracy_score(Y_test, y_pred)

[[1690   14]
 [  41  503]]
              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1704
           1       0.97      0.92      0.95       544

   micro avg       0.98      0.98      0.98      2248
   macro avg       0.97      0.96      0.97      2248
weighted avg       0.98      0.98      0.98      2248

0.9755338078291815


In [18]:
df1 = pd.read_csv("train.csv")
X1 = df1.drop(['left'],axis=1)
Y1 = df1['left']
X1 = pd.concat([X1,pd.get_dummies(X1['sales'], prefix='sales')],axis=1)
X1 = pd.concat([X1,pd.get_dummies(X1['salary'], prefix='salary')],axis=1)
X1.drop(['sales','salary'],axis=1, inplace=True)
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X1,Y1,test_size = 0.2)
modelTrain = DecisionTreeClassifier()
modelTrain.fit(X_train1,Y_train1)
Y_predict=modelTrain.predict(X_test1)
print confusion_matrix(Y_test1,Y_predict)
print classification_report(Y_test1,Y_predict)
print accuracy_score(Y_test1, Y_predict)

[[1675   40]
 [  27  506]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1715
           1       0.93      0.95      0.94       533

   micro avg       0.97      0.97      0.97      2248
   macro avg       0.96      0.96      0.96      2248
weighted avg       0.97      0.97      0.97      2248

0.9701957295373665
