# Classification Tutorial - Employee Attrition

#### A notebook about binary classification using a modified dataset from Kaggle.
https://www.kaggle.com/c/sm/data

#### Import data and the necessary libraries

In [27]:
!pip install xgboost
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer,StandardScaler



In [28]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option("display.precision", 3)

In [29]:
df = pd.read_csv('Employee_Attrition.csv')
df.shape

(14999, 8)

In [30]:
df.head(10)

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_at_company,work_accident,promotion_last_5years,left
0,0.38,0.53,2,157,3,0,0,1
1,0.8,0.86,5,262,6,0,0,1
2,0.11,0.88,7,272,4,0,0,1
3,0.72,0.87,5,223,5,0,0,1
4,0.37,0.52,2,159,3,0,0,1
5,0.41,0.5,2,153,3,0,0,1
6,0.1,0.77,6,247,4,0,0,1
7,0.92,0.85,5,259,5,0,0,1
8,0.89,1.0,5,224,5,0,0,1
9,0.42,0.53,2,142,3,0,0,1


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_of_projects     14999 non-null  int64  
 3   average_monthly_hours  14999 non-null  int64  
 4   years_at_company       14999 non-null  int64  
 5   work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   left                   14999 non-null  int64  
dtypes: float64(2), int64(6)
memory usage: 937.6 KB


In [32]:
df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_at_company,work_accident,promotion_last_5years,left
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.613,0.716,3.803,201.05,3.498,0.145,0.021,0.238
std,0.249,0.171,1.233,49.943,1.46,0.352,0.144,0.426
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [33]:
df.corr()

Unnamed: 0,satisfaction_level,last_evaluation,number_of_projects,average_monthly_hours,years_at_company,work_accident,promotion_last_5years,left
satisfaction_level,1.0,0.105,-0.143,-0.02,-0.101,0.059,0.026,-0.388
last_evaluation,0.105,1.0,0.349,0.34,0.132,-0.007,-0.009,0.007
number_of_projects,-0.143,0.349,1.0,0.417,0.197,-0.005,-0.006,0.024
average_monthly_hours,-0.02,0.34,0.417,1.0,0.128,-0.01,-0.004,0.071
years_at_company,-0.101,0.132,0.197,0.128,1.0,0.002,0.067,0.145
work_accident,0.059,-0.007,-0.005,-0.01,0.002,1.0,0.039,-0.155
promotion_last_5years,0.026,-0.009,-0.006,-0.004,0.067,0.039,1.0,-0.062
left,-0.388,0.007,0.024,0.071,0.145,-0.155,-0.062,1.0


In [34]:
df.isna().sum()

satisfaction_level       0
last_evaluation          0
number_of_projects       0
average_monthly_hours    0
years_at_company         0
work_accident            0
promotion_last_5years    0
left                     0
dtype: int64

In [35]:
df.left.value_counts()

0    11428
1     3571
Name: left, dtype: int64

#### We choose the dependent and the independent variables and define a repeated Stratified K-Fold cross validator.

In [36]:
X = df.iloc[:,:-1]
y = df.left
cv = model_selection.RepeatedStratifiedKFold(n_splits=6, n_repeats=3, random_state=1)

#### First, we try some basic alogrithms:

In [37]:
models = [
    ('CART', DecisionTreeClassifier(max_depth=2)),
    ('LR', LogisticRegression()),
    ('SVM', SVC()),
    ('kNN', KNeighborsClassifier(n_neighbors = 7))
]

for name, clf in models:
    n_scores = model_selection.cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('{}: Mean Accuracy: {:.3f} ({:.3f})'.format(name,np.mean(n_scores), np.std(n_scores)))


CART: Mean Accuracy: 0.850 (0.004)
LR: Mean Accuracy: 0.766 (0.006)
SVM: Mean Accuracy: 0.784 (0.002)
kNN: Mean Accuracy: 0.951 (0.004)


#### Then, we move on to ensemble methods:

In [38]:
models = [
    ('RT', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('Ada', AdaBoostClassifier(n_estimators=100)),
    ('Extra', ExtraTreesClassifier(n_estimators=100))
]

for name, clf in models:
    n_scores = model_selection.cross_val_score(clf, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('{}: Mean Accuracy: {:.3f} ({:.3f})'.format(name,np.mean(n_scores), np.std(n_scores)))    

RT: Mean Accuracy: 0.992 (0.002)
GB: Mean Accuracy: 0.975 (0.003)
XGB: Mean Accuracy: 0.987 (0.002)
Ada: Mean Accuracy: 0.961 (0.004)
Extra: Mean Accuracy: 0.990 (0.002)


#### We try some data preprocessing to check if we have any improvement in the results.  At first, we use data normalization:

In [39]:
X_norm = Normalizer().fit_transform(X)

In [40]:
models = [
    ('RT', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('Ada', AdaBoostClassifier(n_estimators=100)),
    ('Extra', ExtraTreesClassifier(n_estimators=100))
]

for name, clf in models:
    n_scores = model_selection.cross_val_score(clf, X_norm, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('{}: Mean Accuracy: {:.3f} ({:.3f})'.format(name,np.mean(n_scores), np.std(n_scores)))   

RT: Mean Accuracy: 0.984 (0.002)
GB: Mean Accuracy: 0.956 (0.005)
XGB: Mean Accuracy: 0.980 (0.003)
Ada: Mean Accuracy: 0.937 (0.004)
Extra: Mean Accuracy: 0.984 (0.003)


#### And then data standardization:

In [41]:
X_scaled = StandardScaler().fit_transform(X)

In [42]:
models = [
    ('RT', RandomForestClassifier()),
    ('GB', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('Ada', AdaBoostClassifier(n_estimators=100)),
    ('Extra', ExtraTreesClassifier(n_estimators=100))
]

for name, clf in models:
    n_scores = model_selection.cross_val_score(clf, X_scaled, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('{}: Mean Accuracy: {:.3f} ({:.3f})'.format(name,np.mean(n_scores), np.std(n_scores))) 

RT: Mean Accuracy: 0.992 (0.002)
GB: Mean Accuracy: 0.975 (0.003)
XGB: Mean Accuracy: 0.987 (0.002)
Ada: Mean Accuracy: 0.961 (0.004)
Extra: Mean Accuracy: 0.990 (0.002)


#### Afterwards, we try feature extraction using PCA method. For more information:
https://en.wikipedia.org/wiki/Principal_component_analysis

In [43]:
from sklearn.decomposition import PCA
pca = PCA(n_components=5)
X_pca = pd.DataFrame(pca.fit_transform(X))
X_pca

Unnamed: 0,0,1,2,3,4
0,-44.068,-0.686,1.203,-0.171,0.282
1,60.968,2.339,0.077,-0.123,-0.255
2,70.980,0.911,-2.321,-0.172,0.397
3,21.966,1.629,-0.546,-0.131,-0.182
4,-42.068,-0.698,1.221,-0.172,0.295
...,...,...,...,...,...
14994,-50.068,-0.647,1.149,-0.170,0.253
14995,-41.069,-0.705,1.231,-0.172,0.302
14996,-58.067,-0.595,1.078,-0.173,0.286
14997,78.970,0.588,-1.291,-0.174,0.412


In [44]:
print(pca.explained_variance_ratio_)

[9.98565340e-01 8.69246970e-04 4.73865973e-04 4.96932182e-05
 2.43172315e-05]


In [45]:
models = [
    ('RT', RandomForestClassifier(n_estimators = 100)),
    ('GB', GradientBoostingClassifier()),
    ('XGB', XGBClassifier()),
    ('Ada', AdaBoostClassifier(n_estimators=100)),
    ('Extra', ExtraTreesClassifier(n_estimators=100))
]

for name, clf in models:
    n_scores = model_selection.cross_val_score(clf, X_pca, y, scoring='accuracy', cv=cv, n_jobs=-1)
    print('{}: Mean Accuracy: {:.3f} ({:.3f})'.format(name,np.mean(n_scores), np.std(n_scores)))    

RT: Mean Accuracy: 0.988 (0.002)
GB: Mean Accuracy: 0.969 (0.004)
XGB: Mean Accuracy: 0.985 (0.003)
Ada: Mean Accuracy: 0.940 (0.003)
Extra: Mean Accuracy: 0.987 (0.003)


#### As we conclude from the above results, none of the aforementioned methods improves the results, therefore we choose to proceed with the initial dataset, and the algorithm that performed better (in our case Random Forest). We could also employ Grid/Random search for fine-tuning.

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size = 0.3, random_state=7)

In [47]:
model = RandomForestClassifier(random_state=7,max_features='auto', n_estimators= 200, 
                            max_depth=6, criterion='entropy') 
model.fit(X_train, y_train) 
y_pred=model.predict(X_test) 
print("Accuracy: ",round(accuracy_score(y_test,y_pred),3))

Accuracy:  0.974


In [48]:
print(confusion_matrix(y_test,y_pred))

[[3436   13]
 [ 103  948]]


In [49]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      3449
           1       0.99      0.90      0.94      1051

    accuracy                           0.97      4500
   macro avg       0.98      0.95      0.96      4500
weighted avg       0.97      0.97      0.97      4500

