In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from binarytree import tree,Node

In [16]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df=pd.read_csv("train.csv")

In [17]:
# Columns excluded from the feature matrix: the target plus the numeric columns.
excluded_columns = [
    'left',
    'number_project',
    'last_evaluation',
    'satisfaction_level',
    'average_montly_hours',
    'time_spend_company',
]
# Categorical columns that are expanded into one indicator column per category.
categorical_columns = ['sales', 'salary']

Y = df['left']
# get_dummies(columns=...) removes the encoded columns and appends their
# indicator columns (prefixed with the source column name) in the given order.
X = pd.get_dummies(
    df.drop(excluded_columns, axis=1),
    columns=categorical_columns,
    prefix=categorical_columns,
)

In [18]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split, and
# therefore every metric printed further down, reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# The tree helpers read the target from the LAST column of the frame they are
# given, so the labels are appended to the training features here.
X_train = pd.concat([X_train, Y_train], axis=1)
# Machine epsilon, used by the entropy helpers to avoid log2(0) and x/0.
eps = np.finfo(float).eps

In [19]:
def find_overall_entropy(X_train):
    """Return the base-2 Shannon entropy of the target column.

    The target is taken to be the LAST column of ``X_train``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose final column holds the class labels.

    Returns
    -------
    float
        Entropy in bits; ``0`` when the frame has no rows.
    """
    # Local epsilon: keeps the function independent of notebook-level state
    # while still guarding against log2(0) and division by zero.
    tiny = np.finfo(float).eps
    labels = X_train[X_train.keys()[-1]]
    total = len(labels) + tiny

    entropy = 0
    # The class histogram is computed once instead of once per class.
    for count in labels.value_counts():
        frac = float(count) / total
        entropy += -frac * np.log2(frac + tiny)
    return entropy

In [20]:
def find_attr_entropy(X_train,attr):
    """Return the NEGATED weighted entropy of the target after splitting on ``attr``.

    For every distinct value of ``attr`` the entropy of the target (the LAST
    column of ``X_train``) within that subset is computed and weighted by the
    subset's share of the rows. The sum is returned with a negative sign, so a
    caller obtains the information gain by ADDING this value to the overall
    entropy.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame whose final column holds the class labels.
    attr : hashable
        Name of the column to split on.

    Returns
    -------
    float
        ``-sum_v (|S_v| / |S|) * H(S_v)``; ``0`` when the frame has no rows.
    """
    # Local epsilon: guards log2(0) and x/0 without relying on a notebook global.
    tiny = np.finfo(float).eps
    output = X_train.keys()[-1]
    target_values = X_train[output].unique()
    total_rows = len(X_train)

    final_entropy = 0
    for cur_value in X_train[attr].unique():
        # Select the target labels of this subset with a single aligned mask.
        subset_labels = X_train.loc[X_train[attr] == cur_value, output]
        den = len(subset_labels)  # subset size, invariant over the class loop

        entropy = 0
        for value in target_values:
            num = (subset_labels == value).sum()
            frac = num / (den + tiny)
            entropy += -frac * np.log2(frac + tiny)

        final_entropy += -(float(den) / total_rows) * entropy
    return final_entropy

In [21]:
def find_best_node(X_train):
    """Return the name of the attribute with the highest information gain.

    Every column except the last (the target) is a candidate. The gain is
    ``find_overall_entropy(X_train) + find_attr_entropy(X_train, attr)``; the
    two terms are added because ``find_attr_entropy`` already returns the
    weighted entropy with a negative sign.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame with at least one attribute column followed by the target column.

    Returns
    -------
    hashable
        Column label of the best attribute (first one wins on ties).
    """
    candidates = X_train.keys()[:-1]
    # The overall entropy does not depend on the attribute: compute it once.
    base_entropy = find_overall_entropy(X_train)
    information_gain = [
        base_entropy + find_attr_entropy(X_train, attr) for attr in candidates
    ]
    return candidates[np.argmax(information_gain)]

In [22]:
def get_subdataframe(X_train,node,value):
    """Return the rows of ``X_train`` where column ``node`` equals ``value``.

    The result is re-indexed from 0; the previous index is discarded.
    """
    matching_rows = X_train[node] == value
    subset = X_train[matching_rows]
    return subset.reset_index(drop=True)

In [23]:
class nodestruct:
    """Binary tree node: a payload ``value`` plus optional ``left``/``right`` children."""

    def __init__(self, value, left=None, right=None):
        # value: payload stored in the node; left/right: child nodes or None.
        self.value, self.left, self.right = value, left, right

In [24]:
def build_tree(X_train):
    """Recursively build a binary decision tree from ``X_train``.

    The target is the LAST column of ``X_train``; all other columns are
    attributes. Internal nodes store the attribute name in ``value``; leaves
    store a class label. Rows whose attribute value equals 0 go to ``left``,
    every other value goes to ``right`` (so attributes are expected to be
    binary 0/1 indicators).

    Compared with a naive "return on the first pure branch" scheme, a pure
    subset here becomes a leaf CHILD, so the sibling branch is still built.
    When no attributes remain, or an attribute value was never observed in
    this subset, the branch falls back to a majority-class leaf instead of
    ``None``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Attribute columns followed by the target column.

    Returns
    -------
    nodestruct or None
        Root of the (sub)tree; ``None`` only when ``X_train`` has no rows.
    """
    if len(X_train) == 0:
        # Nothing to learn from: there is no label to put in a leaf.
        return None

    output = X_train.keys()[-1]
    classes, counts = np.unique(X_train[output], return_counts=True)
    majority = classes[np.argmax(counts)]

    # Leaf: the subset is pure, or only the target column is left.
    if len(classes) <= 1 or len(X_train.columns) == 1:
        return nodestruct(majority)

    node = find_best_node(X_train)
    # Both children default to the majority class so that a branch whose
    # attribute value does not occur in this subset still yields a prediction.
    root = nodestruct(node, nodestruct(majority), nodestruct(majority))

    for val in X_train[node].unique():
        # Drop the attribute just used so it cannot be selected again below.
        subdataframe = get_subdataframe(X_train, node, val).drop([node], axis=1)
        child = build_tree(subdataframe)
        if val == 0:
            root.left = child
        else:
            root.right = child

    return root

In [25]:
# Fit the hand-written decision tree on the training frame (target in the last column).
rootNode=build_tree(X_train)

In [26]:
def testing(X_test,root):
    if root.value == 0 or root.value == 1:
        y_pred.append(root.value)
#         print root.value
        return
    features = X_test.keys()
#     print features
    for j in features:
        if j == root.value:
#             X_test=X_test.drop([j],axis=1)
            if X_test[j] == '0':
                testing(X_test,root.left)
            else:
                testing(X_test,root.right)

In [27]:
# testing() appends one label per classified row to this module-level list,
# so it is reset here to keep the cell safe to re-run.
y_pred = []
for _, row in X_test.iterrows():
    testing(row, rootNode)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(confusion_matrix(Y_test, y_pred))
print(classification_report(Y_test, y_pred))

[[1720    0]
 [ 528    0]]
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      1720
           1       0.00      0.00      0.00       528

   micro avg       0.77      0.77      0.77      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.59      0.77      0.66      2248



In [28]:
# Baseline: scikit-learn's decision tree on the same features. The fixed seed
# makes its tie-breaking, and therefore the report below, reproducible.
modelTrain = DecisionTreeClassifier(random_state=42)
# X_train carries the target as an extra column; remove it before fitting.
X_tmp = X_train.drop(['left'], axis=1)
modelTrain.fit(X_tmp, Y_train)
Y_predict = modelTrain.predict(X_test)
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print(confusion_matrix(Y_test, Y_predict))
print(classification_report(Y_test, Y_predict))

[[1720    0]
 [ 528    0]]
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      1720
           1       0.00      0.00      0.00       528

   micro avg       0.77      0.77      0.77      2248
   macro avg       0.38      0.50      0.43      2248
weighted avg       0.59      0.77      0.66      2248

