In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from binarytree import tree,Node
import copy

In [2]:
df=pd.read_csv("train.csv")

In [3]:
X=df.drop(['left'],axis=1)
Y=df['left']
X = pd.concat([X,pd.get_dummies(X['sales'], prefix='sales')],axis=1)
X = pd.concat([X,pd.get_dummies(X['salary'], prefix='salary')],axis=1)
X.drop(['sales','salary'],axis=1, inplace=True)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
X_Original_train=copy.deepcopy(X_train)
X_train = pd.concat([X_train,Y_train],axis=1)
X_Original_test=copy.deepcopy(X_test)
eps = np.finfo(float).eps
# X_train

In [5]:
def find_overall_misclassification(X_train):
    output=X_train.keys()[-1]
    target_value_list=X_train[output].unique()
    misclassification = 100
    for value in target_value_list:
        frac = float(X_train[output].value_counts()[value])/(len(X_train[output])+eps)
        misclassification=min(misclassification,frac)
    return misclassification

In [6]:
def find_attr_misclassification(X_train,attr):
    if X_train.empty:
        return
    output=X_train.keys()[-1]
    target_value_list=X_train[output].unique()
    current_value_list=X_train[attr].unique()
    final_misclassification=0
    den=1
    for cur_value in current_value_list:
        misclassification=100
        for value in target_value_list:
            num = len(X_train[attr][X_train[attr] == cur_value][X_train[output] == value])
            den = len(X_train[attr][X_train[attr] == cur_value])
            frac=num/(den+eps)
            misclassification=min(misclassification,frac)
        frac2=float(den)/len(X_train)
        final_misclassification+=frac2*misclassification
    return final_misclassification

In [7]:
def get_subdataframeNum1(X_train,node,value):
    if X_train.empty:
        return
    return X_train[X_train[node] < value].reset_index(drop=True)

In [8]:
def get_subdataframeNum2(X_train,node,value):
    if X_train.empty:
        return
    return X_train[X_train[node] >= value].reset_index(drop=True)

In [9]:
storemidpoint={}
storemidpoint.clear()
numeric_features = ['number_project','last_evaluation','satisfaction_level','average_montly_hours','time_spend_company']
for feature in numeric_features:
    IGMax=0
    midmax=0
    prev=None
    X_train.sort_values(feature,inplace=True)
    for index,row in X_train.iterrows():
        if prev is not None:
            if prev['left']!=row['left']:
                mid=(float(prev[feature])+float(row[feature]))/2
    #             print mid
                subtable1=get_subdataframeNum1(X_train,feature,mid)
                subtable2=get_subdataframeNum2(X_train,feature,mid)
                firstmisclassification=0
                secondmisclassification=0
                if subtable1.empty:
                    firstmisclassification=0
                else:
                    frac1=float(len(subtable1))/len(X_train)
                    firstmisclassification = frac1*find_overall_misclassification(subtable1)
                if subtable2.empty:
                    secondmisclassification=0
                else:
                    frac2=float(len(subtable2))/len(X_train)
                    secondmisclassification = frac2*find_overall_misclassification(subtable2)
                
                misclassification1=find_overall_misclassification(X_train)
                misclassification2=firstmisclassification+secondmisclassification
                ig=abs(misclassification1)-abs(misclassification2)
    #             print ig
                if ig > IGMax:
                    IGMax=ig
                    midmax=mid

        prev=row
    storemidpoint[feature]=midmax
    for index,row in X_train.iterrows():
        if float(row[feature])>=midmax:
            X_train.at[index,feature]=1
        else:
            X_train.at[index,feature]=0
# print IGMax,midmax

In [10]:
def find_best_node(X_train):
    information_gain=[]
    for attr in X_train.keys()[:-1]:
        information_gain.append(find_overall_misclassification(X_train)+find_attr_misclassification(X_train,attr))
    return X_train.keys()[:-1][np.argmax(information_gain)]

In [11]:
def get_subdataframe(X_train,node,value):
    return X_train[X_train[node] == value].reset_index(drop=True)

In [12]:
class nodestruct:
    def __init__(self, value, pos=0, neg=0, left=None, right=None):
        self.value=value
        self.positive=pos
        self.negative=neg
        self.left=left
        self.right=right

In [13]:
def build_tree(X_train):
    
    if len(X_train.columns)==1:
        return 
    output=X_train.keys()[-1]
    node = find_best_node(X_train)
    
    root=nodestruct(node)
#     print root.value

    outputValue,counts = np.unique(X_train[output],return_counts=True)                        
    
    if len(counts)<=1:
        if outputValue[0]==0:
            root.negative=counts[0]
        else:
            root.positive=counts[0]
    else:
        if outputValue[0]==0:
            root.negative=counts[0]
        else:
            root.positive=counts[0]
        if outputValue[1]==0:
            root.negative=counts[1]
        else:
            root.positive=counts[1]

        attValue=X_train[node].unique()
        for val in attValue:

            subdataframe = get_subdataframe(X_train,node,val)                     
            subdataframe=subdataframe.drop([node],axis=1)
            outputValue,counts = np.unique(subdataframe[output],return_counts=True)                        
            if len(counts)<=1:
                pass
            else:
                if val == 1:
                    root.right = build_tree(subdataframe)
                else:
                    root.left = build_tree(subdataframe)

    return root

In [14]:
rootNode=build_tree(X_train)
# print rootNode.value

In [15]:
print storemidpoint

{'satisfaction_level': 0.46499999999999997, 'last_evaluation': 0.995, 'average_montly_hours': 276.0, 'time_spend_company': 5.0, 'number_project': 2.5}


In [16]:
def testing(X_test,root):
    
    if root.left == None and root.right == None:
        if root.positive>root.negative:
            y_pred.append(1)
        else:
            y_pred.append(0)
        return
    
    j = root.value
    if j in X_test:
        if X_test[j] == 1:
            if root.right==None:
                if root.positive>root.negative:
                    y_pred.append(1)
                else:
                    y_pred.append(0)
            else:
                testing(X_test,root.right)
        else:
            if root.left==None:
                if root.positive>root.negative:
                    y_pred.append(1)
                else:
                    y_pred.append(0)
            else:
                testing(X_test,root.left)
    else:
        if root.left==None:
            if root.positive>root.negative:
                y_pred.append(1)
            else:
                y_pred.append(0)
        else:
            testing(X_test,root.left)

In [17]:
def convert_to_binary(X_test):
    numeric_features = ['number_project','last_evaluation','satisfaction_level','average_montly_hours','time_spend_company']
    for feature in numeric_features:
        for index,row in X_test.iterrows():
            if float(row[feature]) >= storemidpoint[feature]:
                X_test.at[index,feature]=1
            else:
                X_test.at[index,feature]=0

In [18]:
y_pred=[]

# X_test1 = pd.read_csv("sample_test.csv")
# X_test1 = pd.concat([X_test1,pd.get_dummies(X_test1['sales'], prefix='sales')],axis=1)
# X_test1 = pd.concat([X_test1,pd.get_dummies(X_test1['salary'], prefix='salary')],axis=1)
# X_test1.drop(['sales','salary'],axis=1, inplace=True)

convert_to_binary(X_test)

for index,row in X_test.iterrows():
    testing(row,rootNode)
    
# print y_pred

print confusion_matrix(Y_test,y_pred)
print classification_report(Y_test,y_pred)
print accuracy_score(Y_test, y_pred)

[[1588  116]
 [ 188  356]]
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1704
           1       0.75      0.65      0.70       544

   micro avg       0.86      0.86      0.86      2248
   macro avg       0.82      0.79      0.81      2248
weighted avg       0.86      0.86      0.86      2248

0.8647686832740213


In [19]:
modelTrain = DecisionTreeClassifier()
modelTrain.fit(X_Original_train,Y_train)
Y_predict=modelTrain.predict(X_Original_test)
print confusion_matrix(Y_test,Y_predict)
print classification_report(Y_test,Y_predict)
print accuracy_score(Y_test, Y_predict)

[[1675   29]
 [  20  524]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1704
           1       0.95      0.96      0.96       544

   micro avg       0.98      0.98      0.98      2248
   macro avg       0.97      0.97      0.97      2248
weighted avg       0.98      0.98      0.98      2248

0.978202846975089
