In [2]:
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from IPython.display import Image
import matplotlib.gridspec as gridspec
import seaborn as sns
import warnings
%matplotlib inline 
%config InlineBackend.figure_format = 'retina' 
pd.set_option('display.max_columns', None) 
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv("attrition_data.csv")
display(df.head())

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [5]:
def predict_ready(dframe):
    dframe = dframe.drop(columns=['Over18', 'EmployeeCount', 'StandardHours', 'EmployeeNumber'])
    for cate_features in dframe.select_dtypes(include='object').columns:
        le = preprocessing.LabelEncoder()
        dframe[cate_features] = le.fit_transform(dframe[cate_features])
        #print("Origin Classes:", list(le.classes_))

    dummies = ['Department', 'EducationField', 'JobRole', 'MaritalStatus']
    dframe = pd.get_dummies(data=dframe, columns=dummies)
    #display(df.head())
    numerical_list = ['Age', 'DailyRate', 'DistanceFromHome', 'HourlyRate', 'MonthlyIncome', 'MonthlyRate',
                  'NumCompaniesWorked', 'PercentSalaryHike', 'TotalWorkingYears', 'TrainingTimesLastYear',
                  'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

    std = preprocessing.StandardScaler()
    scaled = std.fit_transform(dframe[numerical_list])
    scaled = pd.DataFrame(scaled, columns=numerical_list)
    for i in numerical_list:
        dframe[i] = scaled[i]
    std = preprocessing.StandardScaler()
    scaled = std.fit_transform(dframe[numerical_list])
    scaled = pd.DataFrame(scaled, columns=numerical_list)
    for i in numerical_list:
        dframe[i] = scaled[i]
    #display(df.head())
    dframe = dframe.drop(columns=['Attrition'])
    return dframe

In [6]:
preprocessedData = predict_ready(df)
display(preprocessedData.head())

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Department_0,Department_1,Department_2,EducationField_0,EducationField_1,EducationField_2,EducationField_3,EducationField_4,EducationField_5,JobRole_0,JobRole_1,JobRole_2,JobRole_3,JobRole_4,JobRole_5,JobRole_6,JobRole_7,JobRole_8,MaritalStatus_0,MaritalStatus_1,MaritalStatus_2
0,0.44635,2,0.742527,-1.010909,2,2,0,1.383138,3,2,4,-0.10835,0.72602,2.125136,1,-1.150554,3,1,0,-0.421642,-2.171982,1,-0.164613,-0.063296,-0.679146,0.245834,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
1,1.322365,1,-1.297775,-0.14715,1,3,1,-0.240677,2,2,2,-0.291719,1.488876,-0.678049,0,2.129306,4,4,1,-0.164511,0.155707,3,0.488508,0.764998,-0.368715,0.806541,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
2,0.008343,2,1.414363,-0.887515,2,4,1,1.284725,2,1,3,-0.937654,-1.674841,1.324226,1,-0.057267,3,2,0,-0.550208,0.155707,3,-1.144294,-1.167687,-0.679146,-1.155935,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1
3,-0.429664,1,1.461466,-0.764121,4,4,0,-0.486709,3,1,3,-0.763634,1.243211,-0.678049,1,-1.150554,3,3,0,-0.421642,0.155707,3,0.161947,0.764998,0.252146,-1.155935,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0
4,-1.086676,2,-0.524295,-0.887515,1,1,1,-1.274014,3,1,2,-0.644858,0.3259,2.525591,0,-0.877232,3,4,1,-0.678774,0.155707,3,-0.817734,-0.615492,-0.058285,-0.595227,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0


In [7]:
file = open('decisiontree1.pkl', "rb")
dt = pickle.load(file)

In [8]:
results = dt.predict(preprocessedData)
#my_confusion_matrix(y_test, y_test_pred_tree1) # Defined before
#tree1_auc = roc_auc_score(y_test, y_test_pred_tree1)
#print("AUC:", tree1_auc)

In [15]:
res = list(results)
exit = res.count(1)
stay = res.count(0)
print('Number of employees predicted to be leaving the company:',exit)
print('Number of employees predicted to stay in the company:',stay)
print('Attrition rate = ', (exit / stay) * 100)

Number of employees predicted to be leaving the company: 153
Number of employees predicted to stay in the company: 1317
Attrition rate =  11.617312072892938
