# Step 1: Importing Libraries

In [None]:
# Data analysis tools
import pandas as pd
import numpy as np

# Data Visualization Tools
import seaborn as sns
import matplotlib.pyplot as plt

# Data Pre-Processing Libraries
from sklearn.preprocessing import StandardScaler, LabelEncoder
import category_encoders as ce

# For Train-Test Split
from sklearn.model_selection import train_test_split

# Libraries for various Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# Metrics Tools
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score

#For Receiver Operating Characteristic (ROC)
from sklearn.metrics import roc_curve ,roc_auc_score, auc

import os
# Print the full path of every file found under the Kaggle input directory
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Step 2: Loading the Dataset

In [None]:
# Read the attrition CSV from the Kaggle input directory and preview the first rows
DATASET_PATH = '../input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv'
df = pd.read_csv(DATASET_PATH)
df.head()

# Step 3: Understanding the Structure of the Dataset

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts
df.info()

# Step 4: Data Pre-Processing

   # a) Treating Missing Values

In [None]:
# Number of missing entries in each column (isna is the same check as isnull)
df.isna().sum()

<b> No Missing Values in the dataset. Hence no treatment for missing values required</b>


# b) Finding and removing all the duplicated values

In [None]:
# Number of rows that are exact duplicates of an earlier row
int(df.duplicated().sum())

<b> The dataset does not have any duplicate values.</b>

# c) Checking for Imbalance

<b>From the graph below, it is clear that the class distribution is imbalanced. The dataset has 80% samples of class 0 (the employee is not leaving the job) and 20% samples of class 1 (the employee has decided to leave the job).</b>

In [None]:
# Bar chart of how many rows fall in each Attrition class.
# The column is passed as `x=`: seaborn's first positional parameter is `data`,
# so passing the series positionally does not plot per-class counts on current versions.
sns.countplot(x=df["Attrition"])
plt.xlabel("Class")
plt.ylabel("frequency")
plt.title("Checking imbalance")
plt.show()

# Step 6: Defining the Target and Predictor Variables and Standard Scaling

<b> If a feature’s variance is more than the variance of other features, that particular feature might dominate other features in the dataset. This could affect the accuracy of predictions. Hence, we need to scale all the features to a standard centred scale. For this purpose, we use StandardScaler() method.</b>

In [None]:
X=df.drop('Attrition',axis=1)

In [None]:
em = pd.get_dummies(X)

In [None]:
em.head()

In [None]:
df['Attrition']=LabelEncoder().fit_transform(df['Attrition'])
y=df['Attrition']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(em,y, test_size=0.30, random_state=0)

# Standardise the predictors, as the Step 6 text describes. The scaler is fitted on the
# training rows only and then applied to the test rows, so no test-set statistics leak
# into training. The results are wrapped back into DataFrames to keep the column names.
scaler = StandardScaler()
x_train = pd.DataFrame(
    scaler.fit_transform(x_train.astype(float)),
    columns=x_train.columns,
    index=x_train.index,
)
x_test = pd.DataFrame(
    scaler.transform(x_test.astype(float)),
    columns=x_test.columns,
    index=x_test.index,
)

# Step 7: Fitting the dataset to various models

<b>We will fit the dataset to various models and find out the best fit model among these.

Various models used in this notebook are:
    

1)  Logistic Regression

2)  KNN                

3)  Naive-Bayes       

4)  SVM                   

5)  Decision Tree         

6)  Gradient Boosting     

7)  Random Forest         

8)  AdaBoost             

9)  XGBoost    </b>           

# 1) Logistic Regression

In [None]:
#Fitting the model

logistic_Regression = LogisticRegression(max_iter=3000,random_state=0,class_weight="balanced",solver = "saga")
logistic_Regression.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

y_pred = logistic_Regression.predict(x_test)

In [None]:
# Finding Accuracy

log = accuracy_score(y_pred,y_test)*100

In [None]:
# Confusion Matrix

cmlr=confusion_matrix(y_pred,y_test)
print(cmlr)

In [None]:
# Classification Report that computes various
# metrics like Precision, Recall and F1 Score

print(classification_report(y_pred,y_test))

In [None]:
# Plotting the ROC Curve


prob_lr=logistic_Regression.predict_proba(x_test)
auc_lr = roc_auc_score(y_test,prob_lr[:,1])
fprlr,tprlr,_ = roc_curve(y_test,prob_lr[:,1])
roc_auc=auc(fprlr,tprlr)
plt.plot(fprlr,tprlr,label = "AUC = %.2f" % auc_lr)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Logistic Regression")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 2) KNN

In [None]:
#Fitting the model

knn = KNeighborsClassifier(n_neighbors=35)
knn.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

pred_knn = knn.predict(x_test)

In [None]:
# Finding Accuracy

KNN = accuracy_score(pred_knn,y_test)*100

In [None]:
# Confusion Matrix

cm_knn=confusion_matrix(pred_knn,y_test)
print(cm_knn)

In [None]:
# Classification Report that computes various
# metrics like Precision, Recall and F1 Score

print(classification_report(pred_knn,y_test))

In [None]:
# Plotting the ROC Curve

prob_knn= knn.predict_proba(x_test)
auc_knn = roc_auc_score(y_test,prob_knn[:,1])
fprknn,tprknn,_= roc_curve(y_test,prob_knn[:,1])
roc_auc_knn=auc(fprknn,tprknn)
plt.plot(fprknn,tprknn,label = "AUC = %.2f" % auc_knn)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for KNN")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 3) Naive-Bayes

In [None]:
#Fitting the model

gnb=GaussianNB()
gnb.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

pred_gnb = gnb.predict(x_test)

In [None]:
# Finding Accuracy

GNB = accuracy_score(pred_gnb,y_test)*100

In [None]:
# Confusion Matrix

cm_gnb=confusion_matrix(pred_gnb,y_test)
print(cm_gnb)

In [None]:
# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(pred_gnb,y_test))

In [None]:
# Plotting the ROC Curve

prob_gnb= gnb.predict_proba(x_test)
auc_gnb = roc_auc_score(y_test,prob_gnb[:,1])
fprgnb,tprgnb,_= roc_curve(y_test,prob_gnb[:,1])
roc_auc_gnb=auc(fprgnb,tprgnb)
plt.plot(fprgnb,tprgnb,label = "AUC = %.2f" % auc_gnb)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Naive-Bayes")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 4) SVM

In [None]:
#Fitting the model

svc = SVC(probability=True)
svc.fit(x_train,y_train)

# Applying the model to the x_test
pred_svc = svc.predict(x_test)

In [None]:
# Finding Accuracy

SVC = accuracy_score(pred_svc,y_test)*100

In [None]:
# Confusion Matrix

cm_svc=confusion_matrix(pred_svc,y_test)
print(cm_svc)

In [None]:
# Classification Report that computes various 
#metrics like Precision, Recall and F1 Score

print(classification_report(pred_svc,y_test))

In [None]:
# Plotting the ROC Curve

prob_svc= svc.predict_proba(x_test)
auc_svc = roc_auc_score(y_test,prob_svc[:,1])
fprsvc,tprsvc,_= roc_curve(y_test,prob_svc[:,1])
roc_auc_svc=auc(fprsvc,tprsvc)
plt.plot(fprsvc,tprsvc,label = "AUC = %.2f" % auc_svc)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for SVM")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 5) Decision Tree

In [None]:
#Fitting the model

dtree_en = DecisionTreeClassifier()
clf = dtree_en.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

pred_dt = clf.predict(x_test)

In [None]:
# Finding Accuracy

DTREE = accuracy_score(pred_dt,y_test)*100

In [None]:
# Confusion Matrix

cm_dt=confusion_matrix(y_test,pred_dt)
print(cm_dt)

# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(y_test,pred_dt))

In [None]:
# Plotting the ROC Curve

prob_dt= dtree_en.predict_proba(x_test)
auc_dt = roc_auc_score(y_test,prob_dt[:,1])
fprdt,tprdt,_= roc_curve(y_test,prob_dt[:,1])
roc_auc_dt=auc(fprdt,tprdt)
plt.plot(fprdt,tprdt,label = "AUC = %.2f" % auc_dt)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Decision Tree")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 6) Gradient Boosting

In [None]:
#Fitting the model

GBC=GradientBoostingClassifier(n_estimators=150)
GBC.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

Y_predict=GBC.predict(x_test)

In [None]:
# Finding Accuracy

gbc = accuracy_score(y_test,Y_predict)*100

In [None]:
# Confusion Matrix

cm_gbc=confusion_matrix(y_test,Y_predict)
print(cm_gbc)

# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(y_test,Y_predict))

In [None]:
# Plotting the ROC Curve

prob_GBC= GBC.predict_proba(x_test)
auc_GBC = roc_auc_score(y_test,prob_GBC[:,1])
fprGBC,tprGBC,_= roc_curve(y_test,prob_GBC[:,1])
roc_auc_GBC=auc(fprGBC,tprGBC)
plt.plot(fprGBC,tprGBC,label = "AUC = %.2f" % auc_GBC)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Gradient Boosting")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 7) Random Forest

In [None]:
#Fitting the model

rfc = RandomForestClassifier(n_estimators=30,criterion='gini',random_state=1,max_depth=10)
rfc.fit(x_train, y_train)

In [None]:
# Applying the model to the x_test

pred_rf= rfc.predict(x_test)

In [None]:
# Finding Accuracy

RFC = accuracy_score(y_test,pred_rf)*100

In [None]:
# Confusion Matrix

cm_rf=confusion_matrix(pred_rf,y_test)
print(cm_rf)

In [None]:
# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(pred_rf,y_test))

In [None]:
# Plotting the ROC Curve

prob_rfc= rfc.predict_proba(x_test)
auc_rfc = roc_auc_score(y_test,prob_rfc[:,1])
fprrfc,tprrfc,_= roc_curve(y_test,prob_rfc[:,1])
roc_auc_rfc=auc(fprrfc,tprrfc)
plt.plot(fprrfc,tprrfc,label = "AUC = %.2f" % auc_rfc)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for Random Forest")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 8) AdaBoost (Entropy-Decision Tree)

In [None]:
#Fitting the model. Base model is chosen to be Decision Tree

model = DecisionTreeClassifier(criterion='entropy',max_depth=1,random_state=0)
adaboost = AdaBoostClassifier(n_estimators=80, base_estimator=model,random_state=0)
adaboost.fit(x_train,y_train)

In [None]:
# Applying the model to the x_test

pred = adaboost.predict(x_test)

In [None]:
# Finding Accuracy

ada = accuracy_score(y_test,pred)*100

In [None]:
# Confusion Matrix

cm_ada=confusion_matrix(pred,y_test)
print(cm_ada)

# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(pred,y_test))

In [None]:
# Plotting the ROC Curve

prob_adaboost= adaboost.predict_proba(x_test)
auc_adaboost = roc_auc_score(y_test,prob_adaboost[:,1])
fpradaboost,tpradaboost,_= roc_curve(y_test,prob_adaboost[:,1])
roc_auc_adaboost=auc(fpradaboost,tpradaboost)
plt.plot(fpradaboost,tpradaboost,label = "AUC = %.2f" % auc_adaboost)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for AdaBoost (Entropy-Decision Tree)")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# 9) XGBoost 

In [None]:
#Fitting the model

xgb =  XGBClassifier(learning_rate =0.000001,n_estimators=1000,max_depth=5,min_child_weight=1,
                     subsample=0.8,colsample_bytree=0.8,nthread=4,scale_pos_weight=1,seed=27)
xgb.fit(x_train, y_train)

In [None]:
# Applying the model to the x_test


predxg = xgb.predict(x_test)

# Finding Accuracy
xg = accuracy_score(y_test,predxg)*100


In [None]:
# Confusion Matrix

cm_xg=confusion_matrix(predxg,y_test)
print(cm_xg)

# Classification Report that computes various 
# metrics like Precision, Recall and F1 Score

print(classification_report(predxg,y_test))

In [None]:
# Plotting the ROC Curve

prob_xgb= xgb.predict_proba(x_test)
auc_xgb = roc_auc_score(y_test,prob_xgb[:,1])
fprxgb,tprxgb,_= roc_curve(y_test,prob_xgb[:,1])
roc_auc_xgb=auc(fprxgb,tprxgb)
plt.plot(fprxgb,tprxgb,label = "AUC = %.2f" % auc_xgb)
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for XGBoost")
plt.plot([0,1],[0,1],"--")
plt.legend()
plt.show()

# Step 8: Choosing the Best model

There are various ways to evaluate a classification model. Some of them are:
 
1) Accuracy
    
2) AUC
    
3) ROC
    
4) f1 Score


I am evaluating with all these metrics in order to find the best fit model

# Confusion Matrix

A confusion matrix is an N X N matrix, where N is the number of classes being predicted. Confusion Matrix gives us a matrix as output and describes the complete performance of the model.

The correct predictions fall on the diagonal of the matrix.

4 important terms in Confusion Matrix:

<b>True Positives</b>  : We predict YES and the actual output is also YES.

<b>True Negatives</b>  : We predict NO and the actual output is NO.

<b>False Positives(Type I Error)</b> : We predict YES but the actual output is NO.

<b>False Negatives(Type II error)</b> : We predict NO but the actual output is YES.

<b>The Confusion Matrix in itself is not a performance measure, but almost all of the performance metrics are based on the Confusion Matrix.</b>


# 1) Accuracy

In [None]:
# Accuracy values for all the models
# Each entry pairs the printed label with the accuracy (in percent) stored by that model's section
accuracy_rows = [
    ("1)  Logistic Regression    :", log),
    ("2)  KNN                    :", KNN),
    ("3)  Naive-Bayes            :", GNB),
    ("4)  SVM                    :", SVC),
    ("5)  Decision Tree          :", DTREE),
    ("6)  Gradient Boosting      :", gbc),
    ("7)  Random Forest          :", RFC),
    ("8)  AdaBoost               :", ada),
    ("9)  XGBoost                :", xg),
]
for row_label, accuracy_pct in accuracy_rows:
    print(row_label, round(accuracy_pct, 2))


Here, <b>Gradient Boosting has the highest accuracy rate.</b> But during Data visualization step, we observed that the <b>class distribution is Imbalanced</b>. The dataset has 80% samples of class 0 (the employee is not leaving their job) and 20% samples of class 1(The employee has decided to leave their job). This is the reason why most of the models are getting accuracy above 90% by simply predicting every training sample belonging to class 0. But, when we apply this model to a new test-set, then the <b>test accuracy would drop to less than 60%.</b>

<b>In this case, Accuracy metric proves to be a poor indicator of model performance. Therefore, we need to consider other metrics before deciding the best model.</b>

# 2) Area Under Curve (AUC)

In [None]:
# Area Under the Curve(AUC) of all the models
# Each entry pairs the printed label with the AUC stored by that model's ROC cell
auc_rows = [
    ('Area under the curve for Logistic Regression :', roc_auc),
    ('Area under the curve for KNN                 :', roc_auc_knn),
    ('Area under the curve for Naive-Bayes         :', roc_auc_gnb),
    ('Area under the curve for SVM                 :', roc_auc_svc),
    ('Area under the curve for Decision Tree       :', roc_auc_dt),
    ('Area under the curve for Gradient Boosting   :', roc_auc_GBC),
    ('Area under the curve for Random Forest       :', roc_auc_rfc),
    ('Area under the curve for AdaBoost            :', roc_auc_adaboost),
    ('Area under the curve for XGBoost             :', roc_auc_xgb),
]
for row_label, area in auc_rows:
    print(row_label, round(area, 2))

The area under the curve (AUC) is an aggregated measure of the performance of a binary classifier over all possible threshold values. AUC calculates the area under the ROC curve, and therefore it is between 0 and 1. <b>For any classifier, the higher the AUC of a model, the better it is.</b>


Here, <b>Gradient Boosting</b> has the highest AUC value. Hence, based on the AUC values, Gradient Boosting is the best fit model.

# 3) ROC Curve

In [None]:
#ROC Curve for all models
plt.figure(figsize = (20,10))

# (false-positive rates, true-positive rates, extra plot keywords) for each model,
# in drawing order; only two curves are given an explicit colour
roc_curves = [
    (fprlr, tprlr, {"label": "Logistic Regression"}),
    (fprknn, tprknn, {"label": "KNN"}),
    (fprgnb, tprgnb, {"label": "Naive-Bayes"}),
    (fprsvc, tprsvc, {"label": "SVM"}),
    (fprdt, tprdt, {"label": "Decision Tree"}),
    (fprGBC, tprGBC, {"label": "Gradient Boosting", "color": 'black'}),
    (fprrfc, tprrfc, {"label": "Random Forest", "color": 'yellow'}),
    (fpradaboost, tpradaboost, {"label": " AdaBoost"}),
    (fprxgb, tprxgb, {"label": "XGBoost"}),
]
for fpr_values, tpr_values, line_kwargs in roc_curves:
    plt.plot(fpr_values, tpr_values, **line_kwargs)

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.rcParams['font.size'] = 12
plt.title('ROC curve')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.legend(loc="lower right", fontsize=10)
plt.grid(True)

The <b>Receiver Operating Characteristic (ROC)</b> curve is a plot which shows the performance of a binary classifier as a function of its cut-off threshold. The ROC curve is one of the most effective evaluation metrics because it visualizes the accuracy of predictions for a whole range of cutoff values. It essentially shows the true positive rate (TPR) against the false positive rate (FPR) for all possible threshold values. <b>A model is said to be the best model when the ROC is close to the upper left corner.</b>

Looking at the ROC curve plot above, <b>Naive-Bayes has the curve that is closest to the upper left corner. Hence, based on the ROC plot, Naive-Bayes is the best fit model.</b>

# 4) F1-Score

In [None]:
# f1_score of all models
# Arguments are given in scikit-learn's (y_true, y_pred) order. With the default binary
# settings the F1 value is the same either way, but the documented order keeps the
# call correct if `pos_label` or `average` is ever changed.
f1_rows = [
    ("1)  Logistic Regression    :", y_pred),
    ("2)  KNN                    :", pred_knn),
    ("3)  Naive-Bayes            :", pred_gnb),
    ("4)  SVM                    :", pred_svc),
    ("5)  Decision Tree          :", pred_dt),
    ("6)  Gradient Boosting      :", Y_predict),
    ("7)  Random Forest          :", pred_rf),
    ("8)  AdaBoost               :", pred),
    ("9)  XGBoost                :", predxg),
]
for row_label, predictions in f1_rows:
    print(row_label, round(f1_score(y_test, predictions), 2))

<b>Precision</b>           - It is the number of True Positive divided by the number of positive results predicted by the classifier.

<b>Recall/ Sensitivity</b> - It is the number of True Positives divided by the number of all relevant samples

<b>F1 Score</b>            - F1 Score is the Harmonic Mean between precision and recall.

F1 Score tells how precise the classifier is (how many values it classifies correctly).

<b>The greater the F1 Score, the better is the performance of our model.</b>


Here, <b>Naive-Bayes</b> has the highest f1_score. Hence, based on the f1_score, Naive-Bayes is the best fit model.

# 5)Type I Error

In [None]:
#Accessing the False Positives of all models from their confusion Matrix
# Every matrix is recomputed here as confusion_matrix(y_true, y_pred), so entry [0][1]
# is always (actual class 0, predicted class 1), i.e. a false positive. Reading [0][1]
# from the per-model matrices is not comparable when those were built with different
# argument orders, because a transposed matrix holds false negatives in that position.
type1_rows = [
    ("1)  Logistic Regression    :", y_pred),
    ("2)  KNN                    :", pred_knn),
    ("3)  Naive-Bayes            :", pred_gnb),
    ("4)  SVM                    :", pred_svc),
    ("5)  Decision Tree          :", pred_dt),
    ("6)  Gradient Boosting      :", Y_predict),
    ("7)  Random Forest          :", pred_rf),
    ("8)  AdaBoost               :", pred),
    ("9)  XGBoost                :", predxg),
]
for row_label, predictions in type1_rows:
    print(row_label, confusion_matrix(y_test, predictions)[0][1])

False Positives (Type I Error) occur when we incorrectly reject a true null hypothesis; here, that means predicting that an employee will leave the job when he/she actually stays. <b>The lower the number of False Positives, the better the model.</b> Note that the opposite mistake, <b>predicting that an employee is not going to leave the job when he/she later actually leaves, is a False Negative (Type II Error); this kind of wrong prediction could further increase the Attrition Rate to an alarming range</b>, so it should be examined alongside the False Positives.

The False Positives(Type I Error) for all the models can be accessed from the confusion matrix.


<b>Gradient Boosting algorithm has the least number of False Positives(Type I Error). Hence, based on the False Positives(Type I Error), Gradient Boosting is the best fit model.</b>

# Step 9: Finalizing the Best Model

After comparing the models using 5 different metrics:

When considering the metrics Accuracy, AUC and Type I error, Gradient Boosting is found to be the best model.

When considering the ROC curve and the F1 Score, Naive-Bayes is found to be the best model.

# Finally, <u><b>Gradient Boosting</b></u> algorithm proves to be the best model for the Employee Attrition.
