In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io

In [2]:
df=pd.read_csv('hr-employee-attrition.csv')


In [3]:
df.drop(['EmployeeCount','Over18','StandardHours','EmployeeNumber'],axis=1,inplace=True)

In [4]:
from sklearn.preprocessing import LabelEncoder
lb=LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object':
        df[col]=lb.fit_transform(df[col])

In [5]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,2,1102,2,1,2,1,2,0,...,3,1,0,8,0,1,6,4,0,5
1,49,0,1,279,1,8,1,1,3,1,...,4,4,1,10,3,3,10,7,1,7
2,37,1,2,1373,1,2,2,4,4,1,...,3,2,0,7,3,3,0,0,0,0
3,33,0,1,1392,1,3,4,1,4,0,...,3,3,0,8,3,3,8,7,3,0
4,27,0,2,591,1,2,1,3,1,1,...,3,4,1,6,3,3,2,2,2,2


In [6]:
target=df['Attrition']
X=df.drop(['Attrition'],axis=1)

In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,target,test_size=0.2,random_state=7,stratify=target)

In [8]:
print(X_train.shape)

(1176, 30)


In [9]:
from sklearn.tree import DecisionTreeClassifier

tree_1=DecisionTreeClassifier(random_state=1)
tree_1.fit(X_train,y_train)

In [10]:
from sklearn.metrics import classification_report

y_pred_train=tree_1.predict(X_train)
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       986
           1       1.00      1.00      1.00       190

    accuracy                           1.00      1176
   macro avg       1.00      1.00      1.00      1176
weighted avg       1.00      1.00      1.00      1176



In [11]:
y_pred=tree_1.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88       247
           1       0.41      0.47      0.44        47

    accuracy                           0.81       294
   macro avg       0.65      0.67      0.66       294
weighted avg       0.82      0.81      0.81       294



In [12]:
from sklearn.metrics import classification_report,accuracy_score

def classification_performance(model,features,target,dataset_name_string):
    print(f"{dataset_name_string} Data Performance")
    print()
    
    predicted_target=model.predict(features)
    report=pd.DataFrame(classification_report(target,predicted_target,output_dict=True))
    
    print(report)
    print()
    print("Accuracy Score : ",accuracy_score(target,predicted_target)*100)

In [13]:
from sklearn.model_selection import KFold,cross_val_score

def kfold_cross_validation_score(model,feature,target):
    kfold=KFold(n_splits=10)
    result=cross_val_score(model,feature,target,cv=kfold,scoring='accuracy')
    
    print("K-Fold Accuracy Mean : ",round(result.mean()*100,2) )
    print("K-Fold Accuracy Standard Deviation : ",round(result.std()*100,2) )

In [14]:
classification_performance(tree_1,X_train,y_train,'Training')

Training Data Performance

               0      1  accuracy  macro avg  weighted avg
precision    1.0    1.0       1.0        1.0           1.0
recall       1.0    1.0       1.0        1.0           1.0
f1-score     1.0    1.0       1.0        1.0           1.0
support    986.0  190.0       1.0     1176.0        1176.0

Accuracy Score :  100.0


In [15]:
classification_performance(tree_1,X_test,y_test,'Testing')

Testing Data Performance

                    0          1  accuracy   macro avg  weighted avg
precision    0.895833   0.407407  0.806122    0.651620      0.817752
recall       0.870445   0.468085  0.806122    0.669265      0.806122
f1-score     0.882957   0.435644  0.806122    0.659300      0.811448
support    247.000000  47.000000  0.806122  294.000000    294.000000

Accuracy Score :  80.61224489795919


In [16]:
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier

bag_classifier=BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=10,
    random_state=7,
    oob_score=True
)

In [17]:
bag_model = bag_classifier.fit(X_train,y_train)

  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


In [18]:
bag_model.oob_score_

0.8239795918367347

In [19]:
classification_performance(bag_model,X_train,y_train,'Training')

Training Data Performance

                    0           1  accuracy    macro avg  weighted avg
precision    0.980119    1.000000  0.982993     0.990060      0.983331
recall       1.000000    0.894737  0.982993     0.947368      0.982993
f1-score     0.989960    0.944444  0.982993     0.967202      0.982606
support    986.000000  190.000000  0.982993  1176.000000   1176.000000

Accuracy Score :  98.29931972789116


In [20]:
classification_performance(bag_model,X_test,y_test,'Testing')

Testing Data Performance

                    0          1  accuracy   macro avg  weighted avg
precision    0.874539   0.565217   0.85034    0.719878      0.825089
recall       0.959514   0.276596   0.85034    0.618055      0.850340
f1-score     0.915058   0.371429   0.85034    0.643243      0.828151
support    247.000000  47.000000   0.85034  294.000000    294.000000

Accuracy Score :  85.03401360544217


In [21]:
kfold_cross_validation_score(bag_classifier,X,target)

  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


K-Fold Accuracy Mean :  84.63
K-Fold Accuracy Standard Deviation :  1.43


  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


In [22]:
rf_classifier=RandomForestClassifier(n_estimators=25,
                                     criterion='gini',
                                     max_depth=12,
                                     max_features=6,
                                     min_impurity_decrease=0.01,
                                     max_samples=1.0,
                                     oob_score=True,
                                     class_weight='balanced',
                                     random_state=7
                                    )

In [23]:
rf_model=rf_classifier.fit(X_train,y_train)

In [24]:
classification_performance(rf_model,X_train,y_train,'Training')

Training Data Performance

                    0           1  accuracy    macro avg  weighted avg
precision    0.937642    0.459184  0.818027     0.698413      0.860340
recall       0.838742    0.710526  0.818027     0.774634      0.818027
f1-score     0.885439    0.557851  0.818027     0.721645      0.832512
support    986.000000  190.000000  0.818027  1176.000000   1176.000000

Accuracy Score :  81.80272108843538


In [25]:
classification_performance(rf_model,X_test,y_test,'Testing')

Testing Data Performance

                    0          1  accuracy   macro avg  weighted avg
precision    0.910314   0.380282  0.782313    0.645298      0.825581
recall       0.821862   0.574468  0.782313    0.698165      0.782313
f1-score     0.863830   0.457627  0.782313    0.660728      0.798893
support    247.000000  47.000000  0.782313  294.000000    294.000000

Accuracy Score :  78.2312925170068


In [26]:
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


In [27]:
params={
    'n_estimators': [25,50,75,100],
    'criterion': ['entropy','gini'],
    'max_depth': [5,10,12,15],
    'max_features': [5,6,7,8]
}

In [30]:
grid_model=GridSearchCV(estimator=RandomForestClassifier(),
                      param_grid=params,
                      cv=10,
                      scoring='f1',  
                      verbose=3,
                      n_jobs=-1)

In [31]:
grid_model.fit(X_train,y_train)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


In [32]:
grid_model.best_estimator_

In [34]:
print(grid_model.best_params_)
print(grid_model.best_score_)
print(grid_model.cv_results_['mean_test_score'])

{'criterion': 'gini', 'max_depth': 15, 'max_features': 8, 'n_estimators': 25}
0.35935910250003206
[0.18399134 0.17741107 0.12841897 0.17128082 0.1397802  0.18671033
 0.16564841 0.1518323  0.23429908 0.232099   0.18594579 0.21042841
 0.21824675 0.224764   0.17416468 0.22141069 0.23847396 0.26847683
 0.24879823 0.28492828 0.28774999 0.26763965 0.27878981 0.29168039
 0.32710692 0.27910174 0.27911139 0.2844734  0.33109041 0.27921592
 0.29751177 0.30140203 0.24443648 0.25384521 0.24577504 0.26641634
 0.29062704 0.26090685 0.29663448 0.30378266 0.27853121 0.29315346
 0.2979677  0.30890032 0.31766284 0.28601677 0.25649274 0.30375656
 0.26800772 0.28082572 0.28118852 0.2746206  0.31779815 0.25936943
 0.30974263 0.25917175 0.30353165 0.31623457 0.30898925 0.26599874
 0.28326671 0.31760413 0.28665479 0.28804714 0.18919386 0.16002635
 0.1426294  0.15530303 0.1879687  0.18286823 0.17867212 0.16547563
 0.16969697 0.18547356 0.21837173 0.19969433 0.21477684 0.22138999
 0.2346454  0.21327384 0.298781

In [39]:
rf_classifier_withgridsearch_cv=RandomForestClassifier(n_estimators=25,
                                     criterion='gini',
                                     max_depth=15,
                                     max_features=8,
                                     oob_score=True,
                                     class_weight='balanced',
                                     random_state=7
                                    )
rf_model_withgridsearch=rf_classifier_withgridsearch_cv.fit(X_train,y_train)

In [40]:
classification_performance(rf_classifier_withgridsearch_cv,X_train,y_train,'Training')

Training Data Performance

                    0           1  accuracy    macro avg  weighted avg
precision    0.996967    1.000000  0.997449     0.998483      0.997457
recall       1.000000    0.984211  0.997449     0.992105      0.997449
f1-score     0.998481    0.992042  0.997449     0.995262      0.997441
support    986.000000  190.000000  0.997449  1176.000000   1176.000000

Accuracy Score :  99.74489795918367


In [42]:
classification_performance(rf_classifier_withgridsearch_cv,X_test,y_test,'Training')

Training Data Performance

                    0          1  accuracy   macro avg  weighted avg
precision    0.870968   0.733333  0.863946    0.802151      0.848965
recall       0.983806   0.234043  0.863946    0.608924      0.863946
f1-score     0.923954   0.354839  0.863946    0.639397      0.832973
support    247.000000  47.000000  0.863946  294.000000    294.000000

Accuracy Score :  86.39455782312925


In [44]:
random_model=RandomizedSearchCV(estimator=RandomForestClassifier(),
                               scoring='f1',
                               param_distributions=params,
                               cv=5,
                               verbose=1,
                               n_jobs=-1,
                               )

In [45]:
random_model.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [46]:
random_model.best_estimator_