In [1]:
import numpy as np
import pandas as pd

In [2]:
from scipy.stats import zscore
from sklearn.impute import SimpleImputer

In [3]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [4]:
# models------------------------------------------------------------
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import GaussianNB

In [5]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
from warnings import filterwarnings
filterwarnings('ignore')

In [8]:
df = pd.read_csv("clean_train_data.csv")
df.head()

Unnamed: 0,EmployeeID,Attrition,Age,TravelProfile,Department,HomeToWork,EducationField,Gender,HourlnWeek,Involvement,...,JobSatisfaction,ESOPs,NumCompaniesWorked,OverTime,SalaryHikelastYear,WorkExperience,LastPromotion,CurrentProfile,MaritalStatus,MonthlyIncome
0,5110001,0,35,1,0,5,0,1,69.0,1,...,1,1,1,1,20.0,7,2,3,1,18932.0
1,5110002,1,32,2,2,5,5,0,62.0,4,...,2,0,8,0,20.0,4,1,3,2,18785.0
2,5110003,0,31,1,0,5,5,0,45.0,5,...,2,1,3,0,26.0,12,1,3,2,22091.0
3,5110004,0,34,2,2,10,5,0,32.0,3,...,4,1,1,0,23.0,5,1,3,0,20302.0
4,5110005,0,37,0,0,27,5,0,49.0,3,...,4,1,8,0,21.0,12,1,9,0,21674.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5180 entries, 0 to 5179
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   EmployeeID          5180 non-null   int64  
 1   Attrition           5180 non-null   int64  
 2   Age                 5180 non-null   int64  
 3   TravelProfile       5180 non-null   int64  
 4   Department          5180 non-null   int64  
 5   HomeToWork          5180 non-null   int64  
 6   EducationField      5180 non-null   int64  
 7   Gender              5180 non-null   int64  
 8   HourlnWeek          5180 non-null   float64
 9   Involvement         5180 non-null   int64  
 10  WorkLifeBalance     5180 non-null   int64  
 11  Designation         5180 non-null   int64  
 12  JobSatisfaction     5180 non-null   int64  
 13  ESOPs               5180 non-null   int64  
 14  NumCompaniesWorked  5180 non-null   int64  
 15  OverTime            5180 non-null   int64  
 16  Salary

### Model Prerequisites

In [10]:
# dependent and independent variables

x = df.drop(["EmployeeID","Attrition"], axis=1)
y = df["Attrition"]

In [11]:
# train and test data splits

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, stratify=y)

In [12]:
print("Train Data Size: {}".format(x_train.shape[0]))
print("Test Data Size: {}".format(x_test.shape[0]))
print("\n")
print("Train Target Class Proportion")
print("")
print(y_train.value_counts(normalize=True))

Train Data Size: 3626
Test Data Size: 1554


Train Target Class Proportion

0    0.72118
1    0.27882
Name: Attrition, dtype: float64


In [13]:
# scaling the data for ann, log models

x_train_scaled = x_train.apply(zscore)
x_test_scaled = x_test.apply(zscore)

### Model Testing

### Decision Tree Classifier

In [14]:
# performing grid search

model = DecisionTreeClassifier(random_state=123)

parameters = {"criterion":["gini"],
              "max_depth":[5,6],
              "min_samples_leaf":[50,60,70],
              "min_samples_split":[240,270,300]}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train,y_train)
print(grid.best_params_)

{'criterion': 'gini', 'max_depth': 5, 'min_samples_leaf': 70, 'min_samples_split': 240}


In [15]:
# fitting and training the model-------------------------------------
model = grid.best_estimator_.fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.7821
  Test Data : 0.7761


#----- f1-score -----#

  Train Data: 0.5655
  Test Data : 0.5661


### Decision Tree Classifier (Bagging Basis)

In [16]:
# fitting and training the model-------------------------------------
model = BaggingClassifier(n_estimators=25, random_state=123).fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 1.0
  Test Data : 0.9794


#----- f1-score -----#

  Train Data: 1.0
  Test Data : 0.9636


### Decision Tree Classifier (Adaptive Boosting Basis)

In [17]:
# fitting and training the model-------------------------------------

estmr = DecisionTreeClassifier(max_depth=14)
model = AdaBoostClassifier(base_estimator=estmr, n_estimators=75, random_state=123).fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 1.0
  Test Data : 0.9801


#----- f1-score -----#

  Train Data: 1.0
  Test Data : 0.9636


### Decision Tree Classifier (Gradient Boosting Basis)

In [18]:
# fitting and training the model-------------------------------------

model = GradientBoostingClassifier(n_estimators=75, random_state=123).fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.8886
  Test Data : 0.8662


#----- f1-score -----#

  Train Data: 0.7681
  Test Data : 0.7158


### Random Forest Classifier

In [19]:
# performing grid search

model = RandomForestClassifier(random_state=123)

parameters = {"n_estimators":[100,200],                 # [150,200,250]
              "max_features":[10,12]}                   # [7,8,9]

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train,y_train)
print(grid.best_params_)

{'max_features': 10, 'n_estimators': 200}


In [20]:
# fitting and training the model-------------------------------------
model = grid.best_estimator_.fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 1.0
  Test Data : 0.9858


#----- f1-score -----#

  Train Data: 1.0
  Test Data : 0.9747


### ANN Classifier

In [21]:
# performing grid search

model = MLPClassifier(random_state=123)

parameters = {"hidden_layer_sizes":[(500)],                            # [(500),(100,100)]
              "activation":["tanh"],                                   # ["relu","logistic","tanh","identity"]
              "solver":["adam"],                                       # ["sgd","adam","lbfgs"]
              "max_iter":[400],                                        # [300,400]
              "tol":[0.0001]}                                          # [0.001, 0.0001]

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train_scaled,y_train)
print(grid.best_params_)

{'activation': 'tanh', 'hidden_layer_sizes': 500, 'max_iter': 400, 'solver': 'adam', 'tol': 0.0001}


In [22]:
# fitting and training the model-------------------------------------
model = grid.best_estimator_.fit(x_train_scaled,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train_scaled)
test_predictions = model.predict(x_test_scaled)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 1.0
  Test Data : 0.9807


#----- f1-score -----#

  Train Data: 1.0
  Test Data : 0.9658


### Logistic Regression Model

In [23]:
# performing grid search

model = LogisticRegression(random_state=123)

parameters = {"penalty":["l1","l2","elasticnet"],
              "solver":['newton-cg','lbfgs','liblinear','sag','saga'],
              "tol":[0.1,0.01,0.001,0.0001]}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train,y_train)
print(grid.best_params_)

{'penalty': 'l1', 'solver': 'liblinear', 'tol': 0.0001}


In [24]:
# fitting and training the model-------------------------------------
model = grid.best_estimator_.fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.8075
  Test Data : 0.8102


#----- f1-score -----#

  Train Data: 0.5965
  Test Data : 0.5897


### LDA Classifier Model

In [25]:
# performing grid search

model = LinearDiscriminantAnalysis()

parameters = {"solver":['svd', 'lsqr', 'eigen'],
              "tol":[0.1,0.01,0.001,0.0001]}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train,y_train)
print(grid.best_params_)

{'solver': 'svd', 'tol': 0.1}


In [26]:
# fitting and training the model-------------------------------------
model = grid.best_estimator_.fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.8058
  Test Data : 0.7992


#----- f1-score -----#

  Train Data: 0.5949
  Test Data : 0.5702


### Naive Bayes Model

In [27]:
# fitting and training the model-------------------------------------
model = GaussianNB().fit(x_train,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train)
test_predictions = model.predict(x_test)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.754
  Test Data : 0.7716


#----- f1-score -----#

  Train Data: 0.6039
  Test Data : 0.6306


### KNN Classifier Model

In [28]:
# performing grid search

model = KNeighborsClassifier()

parameters = {"n_neighbors":[1,2,3,4,5],"p":[1,2],"weights":["uniform","distance"],
              "algorithm":['auto','ball_tree','kd_tree','brute']}

grid = GridSearchCV(estimator=model, param_grid=parameters, cv=5).fit(x_train_scaled,y_train)
print(grid.best_params_)

{'algorithm': 'auto', 'n_neighbors': 1, 'p': 1, 'weights': 'uniform'}


In [29]:
# fitting and training the model-------------------------------------
model = KNeighborsClassifier(n_neighbors=2,weights='uniform',algorithm="auto",p=1).fit(x_train_scaled,y_train)

# calculations-------------------------------------------------------
train_predictions = model.predict(x_train_scaled)
test_predictions = model.predict(x_test_scaled)

train_accuracy = round(accuracy_score(y_train,train_predictions),4)
test_accuracy = round(accuracy_score(y_test,test_predictions),4)

train_f1_score = round(f1_score(y_train,train_predictions),4)
test_f1_score = round(f1_score(y_test,test_predictions),4)

# printing-----------------------------------------------------------
print("#----- Accuracy -----#")
print("")
print("  Train Data: {}".format(train_accuracy))
print("  Test Data : {}".format(test_accuracy))
print("\n")
print("#----- f1-score -----#")
print("")
print("  Train Data: {}".format(train_f1_score))
print("  Test Data : {}".format(test_f1_score))

#----- Accuracy -----#

  Train Data: 0.9964
  Test Data : 0.9852


#----- f1-score -----#

  Train Data: 0.9935
  Test Data : 0.9733


---------------