In [1]:
import numpy as np
import pandas as pd

In [2]:
from scipy.stats import zscore
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
# models------------------------------------------------------------
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from warnings import filterwarnings
filterwarnings('ignore')

In [8]:
# Read the cleaned training set from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer="clean_train_data.csv")
df.head(n=5)

Unnamed: 0,EmployeeID,Attrition,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,...,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,5110001,0,35,1,0,5,0,1,69.0,1,...,1,1,1,1,20.0,7,2,3,1,18932.0
1,5110002,1,32,2,2,5,5,0,62.0,4,...,2,0,8,0,20.0,4,1,3,2,18785.0
2,5110003,0,31,1,0,5,5,0,45.0,5,...,2,1,3,0,26.0,12,1,3,2,22091.0
3,5110004,0,34,2,2,10,5,0,32.0,3,...,4,1,1,0,23.0,5,1,3,0,20302.0
4,5110005,0,37,0,0,27,5,0,49.0,3,...,4,1,8,0,21.0,12,1,9,0,21674.0


In [9]:
# Summary statistics for every column after the first two, transposed so
# that each column of the frame becomes one row of the summary.
df.iloc[:, 2:].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,5180.0,37.039575,8.96244,18.0,31.0,36.0,42.0,60.0
TravelProfile,5180.0,1.107915,0.535057,0.0,1.0,1.0,1.0,2.0
Department,5180.0,0.666409,0.919795,0.0,0.0,0.0,2.0,2.0
HomeToWork,5180.0,10.987259,8.122103,1.0,5.0,9.0,15.0,36.0
EducationField,5180.0,2.762741,2.180032,0.0,0.0,3.0,5.0,5.0
Gender,5180.0,0.606178,0.488643,0.0,0.0,1.0,1.0,1.0
HourlnWeek,5180.0,58.040251,12.613182,24.5,50.0,59.0,67.0,92.5
Involvement,5180.0,3.226641,0.872431,1.0,3.0,3.0,4.0,5.0
WorkLifeBalance,5180.0,3.012741,1.410602,1.0,2.0,3.0,4.0,5.0
Designation,5180.0,1.699421,0.943366,0.0,1.0,2.0,2.0,4.0


In [10]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(5180, 22)

### Model Prerequisites

In [11]:
# Target vector and predictor matrix: "Attrition" is the label, and both the
# label and the employee identifier are removed from the predictors.

y = df["Attrition"]
x = df.drop(columns=["EmployeeID", "Attrition"])

In [12]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the
# partition identical from run to run.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

In [15]:
# Column labels of the training predictors.
x_train.columns

Index(['Age', 'TravelProfile', 'Department', 'HomeToWork', 'EducationField',
       'Gender', 'HourlnWeek', 'Involvement', 'WorkLifeBalance', 'Designation',
       'JobSatisfaction', 'ESOPs', 'NumCompaniesWorked', 'OverTime',
       'SalaryHikelastYear', 'WorkExperience', 'LastPromotion',
       'CurrentProfile', 'MaritalStatus', 'MonthlyIncome'],
      dtype='object')

In [14]:
# Report how many rows landed in each split, followed by the relative
# frequency of each target class within the training split.
train_rows = len(x_train)
test_rows = len(x_test)

print("Train Data Size: {}".format(train_rows))
print("Test Data Size: {}".format(test_rows))
print("\n")
print("Train Target Class Proportion")
print("")
class_share = y_train.value_counts(normalize=True)
print(class_share)

Train Data Size: 3626
Test Data Size: 1554


Train Target Class Proportion

0    0.718423
1    0.281577
Name: Attrition, dtype: float64


In [28]:
# fitting and training the model-------------------------------------
# random_state pins the forest's bootstrap samples and per-split feature
# draws; without it every run produces a different model, different scores
# and a different pickled artifact. The seed matches the one used for the
# train/test split.
model = RandomForestClassifier(
    max_features=12, n_estimators=200, random_state=0
).fit(x_train, y_train)

# calculations-------------------------------------------------------
# Predict on both splits so the train and test scores can be compared.
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

# All scores are rounded to four decimals for display.
train_accuracy = round(accuracy_score(y_train, train_predictions), 4)
test_accuracy = round(accuracy_score(y_test, test_predictions), 4)

train_f1_score = round(f1_score(y_train, train_predictions), 4)
test_f1_score = round(f1_score(y_test, test_predictions), 4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 1.0
  Test Data : 0.9839


#----- f1-score -----#

  Train Data: 1.0
  Test Data : 0.9708


In [29]:
import pickle

In [30]:
# Save the fitted model to local disk.
# The context manager closes (and therefore flushes) the file even if
# pickling raises, so no handle is leaked and the file on disk is complete
# before anything tries to read it back.

with open("attrition.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

In [31]:
# Load the saved model back and check that it still scores on the test split.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that come from a trusted source.

with open("attrition.pkl", "rb") as pickle_in:
    loaded_model = pickle.load(pickle_in)
loaded_model.score(x_test, y_test)

0.9839124839124839