# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder                             #to convert alphanumeric data to numeric
from sklearn.model_selection import train_test_split                       #to split data for training and testing
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import confusion_matrix                               #to judge accuracy (fmeasure, precision and recall)
from sklearn.externals.six import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Importing Dataset

In [2]:
train = pd.read_excel('train.xlsx')
test = pd.read_excel('test.xlsx')

# Data set summary

In [3]:
print(train.info())
print(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 20 columns):
EmployeeNumber             999 non-null int64
Age                        999 non-null int64
BusinessTravel             999 non-null object
Department                 999 non-null object
Education                  999 non-null int64
EducationField             999 non-null object
Gender                     999 non-null object
JobLevel                   999 non-null int64
JobRole                    999 non-null object
JobSatisfaction            999 non-null int64
MaritalStatus              999 non-null object
MonthlyIncome              999 non-null int64
NumCompaniesWorked         999 non-null int64
OverTime                   999 non-null object
PercentSalaryHike          999 non-null int64
PerformanceRating          999 non-null int64
TotalWorkingYears          999 non-null int64
YearsAtCompany             999 non-null int64
YearsSinceLastPromotion    999 non-null int64
Attrition     

# Separation of Features, Class Label

In [4]:
X = train.iloc[:, [1, 9, 11]].values
y = train.iloc[:, -1].values
print(X[:5])
print(y[:5])

labelencoder_y =LabelEncoder()
y = labelencoder_y.fit_transform(y)
print("Sample y:",y[:5])
print("0 :",labelencoder_y.classes_[0])
print("1 :",labelencoder_y.classes_[1])

[[  41    4 5993]
 [  49    2 5130]
 [  37    3 2090]
 [  33    3 2909]
 [  27    2 3468]]
['Yes' 'No' 'Yes' 'No' 'No']
Sample y: [1 0 1 0 0]
0 : No
1 : Yes


# y has Categorical data hence needs Encoding

In [5]:
Xt = test.iloc[:, [1, 9, 11]].values
yt = test.iloc[:, -1].values
print(Xt[:5])
print(yt[:5])

labelencoder_yt =LabelEncoder()
yt = labelencoder_yt.fit_transform(yt)
print("Sample yt:",yt[:5])
print("0 :",labelencoder_yt.classes_[0])
print("1 :",labelencoder_yt.classes_[1])

[[   42     1 16799]
 [   52     1  2950]
 [   37     3  3629]
 [   35     4  9362]
 [   25     4  3229]]
['No' 'No' 'No' 'No' 'No']
Sample yt: [0 0 0 0 0]
0 : No
1 : Yes


# Splitting of Data into Training & Testing

In [6]:
X_train= X
y_train= y
X_test= Xt
y_test= yt

# Machine: Classifier | Classifier: Decision Tree

In [7]:
classifier = dtc(criterion='entropy', min_samples_leaf=4, random_state=0)
classifier.fit(X_train, y_train)                                              #train(features,associated class label)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

# Predicting the Test set results

In [8]:
y_pred = classifier.predict(X_test)                                           #testing classifier on testing data

In [9]:
X_test

array([[   42,     1, 16799],
       [   52,     1,  2950],
       [   37,     3,  3629],
       ...,
       [   27,     2,  6142],
       [   49,     2,  5390],
       [   34,     3,  4404]], dtype=int64)

In [10]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

# Making the Confusion Matrix

In [11]:
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[366  35]
 [ 63   7]]


classification accuracy = correct predictions / total predictions

In [12]:
(366+7)/(63+35+366+7)

0.7919320594479831

# Finding Accuracy of Model

In [13]:
from sklearn.metrics import accuracy_score
acc=accuracy_score(y_test, y_pred)
acc

0.7919320594479831

In [14]:
err=1-acc                                                                             #error
err

0.20806794055201694

# Drawing the Decision Tree

In [15]:
dot_data=StringIO()
export_graphviz(classifier,out_file=dot_data, filled=True,rounded=True,special_characters=True)
graph=pydotplus.graph_from_dot_data(dot_data.getvalue())   
graph.write_png('tree1.png')

True

ref https://www.geeksforgeeks.org/confusion-matrix-machine-learning/