In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing

In [2]:
df = pd.read_csv("general_data.csv")

In [3]:

df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

### Check for NULL values

In [7]:
df.isna().sum()

Age                         0
Attrition                   0
BusinessTravel              0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeID                  0
Gender                      0
JobLevel                    0
JobRole                     0
MaritalStatus               0
MonthlyIncome               0
NumCompaniesWorked         19
Over18                      0
PercentSalaryHike           0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           9
TrainingTimesLastYear       0
YearsAtCompany              0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

In [8]:
df['TotalWorkingYears'].mean()

11.279936378095888

In [9]:
new_TotalWorkingYears = np.where(df['TotalWorkingYears'].isna(),11.0,df['TotalWorkingYears'])

In [10]:
df['TotalWorkingYears'] = new_TotalWorkingYears

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
label_encoder = preprocessing.LabelEncoder()

In [13]:
df['Attrition'] = label_encoder.fit_transform(df['Attrition'])

In [14]:
df['Gender'] = label_encoder.fit_transform(df['Gender'])

### Clean the data and finding importatnt variable(features) using Random forest

In [15]:
df['NumCompaniesWorked'].fillna(method='pad',inplace=True)

In [17]:
rf_model = RandomForestClassifier(n_estimators=1000,max_depth=2,oob_score=True)

In [21]:

features = ['Age','DistanceFromHome','Education','EmployeeCount','Gender','NumCompaniesWorked','PercentSalaryHike','StandardHours','StockOptionLevel','TotalWorkingYears','TrainingTimesLastYear',
            'YearsAtCompany','YearsSinceLastPromotion','YearsWithCurrManager']

In [22]:
rf_model.fit(X=df[features],y=df['Attrition'])

RandomForestClassifier(max_depth=2, n_estimators=1000, oob_score=True)

In [23]:
print('Accuracy:')
print(rf_model.oob_score_)

Accuracy:
0.8387755102040816


In [24]:
for features,imp in zip(features,rf_model.feature_importances_):
    print(features,imp)

Age 0.19103757859696102
DistanceFromHome 0.01615846884952093
Education 0.00668121205475806
EmployeeCount 0.0
Gender 0.0035673278343711875
NumCompaniesWorked 0.05730325907140456
PercentSalaryHike 0.015417378966784929
StandardHours 0.0
StockOptionLevel 0.0030069289386720575
TotalWorkingYears 0.2854597634126803
TrainingTimesLastYear 0.027758073659779864
YearsAtCompany 0.20490151709473933
YearsSinceLastPromotion 0.021300449029310945
YearsWithCurrManager 0.1674080424910168


In [26]:
predictors = pd.DataFrame(df['TotalWorkingYears'],df['YearsAtCompany'])
predictors

Unnamed: 0_level_0,TotalWorkingYears
YearsAtCompany,Unnamed: 1_level_1
1,6.0
5,28.0
5,28.0
8,10.0
6,5.0
...,...
3,13.0
3,13.0
4,9.0
9,6.0


### Training and Predicting the values

In [27]:
tree_model = tree.DecisionTreeClassifier(max_depth=4)

In [29]:
tree_model.fit(X=predictors,y=df['Attrition'])

DecisionTreeClassifier(max_depth=4)

In [30]:
tree_model.score(X=predictors,y=df['Attrition'])

0.8387755102040816

In [31]:
with open("Tree_attrition.dot","w") as f:
    f = tree.export_graphviz(tree_model,feature_names=['YearsAtCompany'],out_file=f);

### Inference

<img src="tree_attrition.png" width="800" />

__1.	If the yearsatcompany<6.5 , the people who are removed or the cases where attrition happens is 3699 and people who are remained is 711.__

__2.	Above class is further subdivided into if the yearsatcompany<=1.5 then the people who are removed from the office are 1332 and 375 are kept.__

__3.	Then further if the yearsatcompany=1 then ppl who are removed are 84 and 48 are kept further subset for yearsatcompany<=5.5 then ppl who are removed are 1248 and 327 are kept in the company this further is subdivided into two more category of for yrs<=3.5 and in that category 612 are removed and 123 are kept in the company.__

__4.	For yrsatcompany<=32.5 then it can further divided into for yrs>=32.5 42 ppl are removed and 0 are kept and  years <=22.5 2325 are kept and 336 are removed.__

__5.	For yrsatcompany<=8.5 the ppl who are removed are characterized in two samples 1758 and 270 . out of 1758 87 are removed for less than 8.5 yrs of work at company and 1671 are removed for work greater than 8.5 yrs.__

__6.	 For yrsatcompany>8.5 but <=25 yrs  567 ppl are removed from the company and 66 are kept out of these for workatcompany<25.5, 564 ppl are removed and for yrs at company<25.5, 3 ppl are removed.__
