In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.tree import DecisionTreeClassifier
from binarytree import tree,Node
import copy

In [2]:
df=pd.read_csv("train.csv")

In [3]:
X=df.drop(['left','number_project','last_evaluation','satisfaction_level','average_montly_hours','time_spend_company'],axis=1)
Y=df['left']
X = pd.concat([X,pd.get_dummies(X['sales'], prefix='sales')],axis=1)
X = pd.concat([X,pd.get_dummies(X['salary'], prefix='salary')],axis=1)
X.drop(['sales','salary'],axis=1, inplace=True)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2)
X_Original_train=copy.deepcopy(X_train)
X_Original_test=copy.deepcopy(X_test)
X_train = pd.concat([X_train,Y_train],axis=1)
eps = np.finfo(float).eps
# X_train

In [5]:
def find_overall_entropy(X_train):
    if X_train.empty:
        return
    output=X_train.keys()[-1]
#     print len(X_train[output])
    target_value_list=X_train[output].unique()
    entropy=0
    for value in target_value_list:
        frac = float(X_train[output].value_counts()[value])/(len(X_train[output])+eps)
        entropy+=-frac*np.log2(frac+eps)
    return entropy

In [6]:
def find_attr_entropy(X_train,attr):
    if X_train.empty:
        return
    output=X_train.keys()[-1]
    target_value_list=X_train[output].unique()
    current_value_list=X_train[attr].unique()
    final_entropy=0
    den=1
    for cur_value in current_value_list:
        entropy=0
        for value in target_value_list:
            num = len(X_train[attr][X_train[attr] == cur_value][X_train[output] == value])
            den = len(X_train[attr][X_train[attr] == cur_value])
            frac=num/(den+eps)
            entropy+=-frac*np.log2(frac+eps)
        frac2=float(den)/len(X_train)
        final_entropy+=-frac2*entropy
    return final_entropy

In [7]:
def find_best_node(X_train):
    if X_train.empty:
        return
    information_gain=[]
    for attr in X_train.keys()[:-1]:
        information_gain.append(find_overall_entropy(X_train)+find_attr_entropy(X_train,attr))
    return X_train.keys()[:-1][np.argmax(information_gain)]

In [8]:
def get_subdataframe(X_train,node,value):
    if X_train.empty:
        return
    return X_train[X_train[node] == value].reset_index(drop=True)

In [9]:
class nodestruct:
    def __init__(self, value, pos=0, neg=0, left=None, right=None):
        self.value=value
        self.positive=pos
        self.negative=neg
        self.left=left
        self.right=right

In [10]:
def build_tree(X_train):
    
    if len(X_train.columns)==1:
        return 
    output=X_train.keys()[-1]
    node = find_best_node(X_train)
    
    root=nodestruct(node)
#     print root.value

    outputValue,counts = np.unique(X_train[output],return_counts=True)                        
    
    if len(counts)<=1:
        if outputValue[0]==0:
            root.negative=counts[0]
        else:
            root.positive=counts[0]
    else:
        if outputValue[0]==0:
            root.negative=counts[0]
        else:
            root.positive=counts[0]
        if outputValue[1]==0:
            root.negative=counts[1]
        else:
            root.positive=counts[1]

        attValue=X_train[node].unique()
        for val in attValue:

            subdataframe = get_subdataframe(X_train,node,val)                     
            subdataframe=subdataframe.drop([node],axis=1)
            outputValue,counts = np.unique(subdataframe[output],return_counts=True)                        
            if len(counts)<=1:
                pass
            else:
                if val == 1:
                    root.right = build_tree(subdataframe)
                else:
                    root.left = build_tree(subdataframe)

    return root

In [11]:
rootNode=build_tree(X_train)
# print rootNode.right.value

In [12]:
def testing(X_test,root):
    
    if root.left == None and root.right == None:
        if root.positive>root.negative:
            y_pred.append(1)
        else:
            y_pred.append(0)
        return
    
    j = root.value
    if j in X_test:
        if X_test[j] == 1:
            if root.right==None:
                if root.positive>root.negative:
                    y_pred.append(1)
                else:
                    y_pred.append(0)
            else:
                testing(X_test,root.right)
        else:
            if root.left==None:
                if root.positive>root.negative:
                    y_pred.append(1)
                else:
                    y_pred.append(0)
            else:
                testing(X_test,root.left)
    else:
        if root.left==None:
            if root.positive>root.negative:
                y_pred.append(1)
            else:
                y_pred.append(0)
        else:
            testing(X_test,root.left)

In [17]:
y_pred=[]

# X_test1 = pd.read_csv("sample_test.csv")
# X_test1 = pd.concat([X_test1,pd.get_dummies(X_test1['sales'], prefix='sales')],axis=1)
# X_test1 = pd.concat([X_test1,pd.get_dummies(X_test1['salary'], prefix='salary')],axis=1)
# X_test1.drop(['sales','salary'],axis=1, inplace=True)

for index,row in X_test.iterrows():
    testing(row,rootNode)

# print y_pred

print confusion_matrix(Y_test,y_pred)
print classification_report(Y_test,y_pred)
print accuracy_score(Y_test, y_pred)

[[1699    0]
 [ 549    0]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1699
           1       0.00      0.00      0.00       549

   micro avg       0.76      0.76      0.76      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.57      0.76      0.65      2248

0.7557829181494662


In [14]:
modelTrain = DecisionTreeClassifier()
modelTrain.fit(X_Original_train,Y_train)
Y_predict=modelTrain.predict(X_Original_test)
print confusion_matrix(Y_test,Y_predict)
print classification_report(Y_test,Y_predict)
print accuracy_score(Y_test, Y_predict)

[[1699    0]
 [ 549    0]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      1699
           1       0.00      0.00      0.00       549

   micro avg       0.76      0.76      0.76      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.57      0.76      0.65      2248

0.7557829181494662


  'precision', 'predicted', average, warn_for)
