In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn import preprocessing

In [2]:
df = pd.read_csv("general_data.csv")
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
df["Age"].mean()

36.923809523809524

In [4]:
df['Attrition'].replace('Yes',1, inplace=True)
df['Attrition'].replace('No',0, inplace=True)
df['EducationField'].replace('Life Sciences',1, inplace=True)
df['EducationField'].replace('Medical',2, inplace=True)
df['EducationField'].replace('Marketing', 3, inplace=True)
df['EducationField'].replace('Other',4, inplace=True)
df['EducationField'].replace('Technical Degree',5, inplace=True)
df['EducationField'].replace('Human Resources', 6, inplace=True)
df['Department'].replace('Research & Development',1, inplace=True)
df['Department'].replace('Sales',2, inplace=True)
df['Department'].replace('Human Resources', 3, inplace=True)
df['BusinessTravel'].replace('Travel_Rarely',1, inplace=True)
df['BusinessTravel'].replace('Travel_Frequently',2, inplace=True)
df['BusinessTravel'].replace('Non-Travel',3, inplace=True)

In [5]:
features = ["Age","Gender","BusinessTravel","Department","EducationField","DistanceFromHome","EmployeeCount","Education","EmployeeID","PercentSalaryHike","StockOptionLevel","TrainingTimesLastYear","YearsAtCompany","YearsSinceLastPromotion","YearsWithCurrManager"]

In [6]:
for features in df.select_dtypes(include='object').columns:
    le = preprocessing.LabelEncoder()
    df[features] = le.fit_transform(df[features])
    print("Origin Classes:", list(le.classes_))

Origin Classes: ['Female', 'Male']
Origin Classes: ['Healthcare Representative', 'Human Resources', 'Laboratory Technician', 'Manager', 'Manufacturing Director', 'Research Director', 'Research Scientist', 'Sales Executive', 'Sales Representative']
Origin Classes: ['Divorced', 'Married', 'Single']
Origin Classes: ['Y']


In [7]:
df = df.drop(columns=['Over18','EmployeeCount','StandardHours','EmployeeID'])

In [8]:
new_age_var = np.where(df["Age"].isnull(),36,df["Age"])
df["Age"] = new_age_var

In [9]:
tree_model = tree.DecisionTreeClassifier()
pred = pd.DataFrame([df["Age"],df["Gender"],df["BusinessTravel"],df["DistanceFromHome"],df["Education"],df["PercentSalaryHike"],df["StockOptionLevel"],df["TrainingTimesLastYear"],df["YearsAtCompany"],df["YearsSinceLastPromotion"],df["YearsWithCurrManager"]]).T
tree_model.fit(X=pred,y=df["Attrition"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [10]:
with open("Dtree3.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=["Age","Gender","BusinessTravel","DistanceFromHome","Education","PercentSalaryHike","StockOptionLevel","TrainingTimesLastYear","YearsAtCompany","YearsSinceLastPromotion","YearsWithCurrManager"],out_file=f);

In [11]:
from sklearn.ensemble import RandomForestClassifier
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'Gender', 'JobLevel', 'JobRole',
       'MaritalStatus', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [12]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=0.2,oob_score=True)
features = ["Age","Gender","BusinessTravel","DistanceFromHome","Education","PercentSalaryHike","StockOptionLevel","TrainingTimesLastYear","YearsAtCompany","YearsSinceLastPromotion","YearsWithCurrManager"]
rf_model.fit(X=df[features],y=df["Attrition"])
print("OOB Accuracy")
print(rf_model.oob_score_)

OOB Accuracy
1.0


In [13]:
for feature,imp in zip(features,rf_model.feature_importances_):
    print(feature,imp);

Age 0.18621369878276547
Gender 0.03100257660456093
BusinessTravel 0.04576723888946632
DistanceFromHome 0.1377351231834176
Education 0.07210719155067832
PercentSalaryHike 0.12257947111404757
StockOptionLevel 0.06026422491823805
TrainingTimesLastYear 0.07688841751453115
YearsAtCompany 0.11546223510042376
YearsSinceLastPromotion 0.06630512099684588
YearsWithCurrManager 0.08567470134502504


# From "feature,imp" We got 'Age' ,'DistanceFromHome', 'PercentSalaryHike', 'YearsAtCompany' are important variable for decision tree.

In [14]:
Final_pred = pd.DataFrame([df["Age"],df["DistanceFromHome"],df["PercentSalaryHike"],df["YearsAtCompany"]]).T
tree_model.fit(X=Final_pred,y=df["Attrition"])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [15]:
with open("Dtree_Att.dot",'w') as f:
    f = tree.export_graphviz(tree_model,feature_names=['Age','DistanceFromHome','PercentSalaryHike','YearsAtCompany'],out_file=f);

In [16]:
rf_model = RandomForestClassifier(n_estimators=1000,max_features=0.2,oob_score=True)
features = ['Age','DistanceFromHome','PercentSalaryHike','YearsAtCompany']
rf_model1 = rf_model.fit(X=df[features],y=df["Attrition"])
print("OOB Accuracy")
print(rf_model.oob_score_)

OOB Accuracy
0.998639455782313


# This model is 99% Accurate.