In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the employee attrition dataset from the working directory and preview it.
df = pd.read_csv('employee_attrition.csv')
df.head()

In [None]:
# Quick structural overview of the raw data.
# Only the last expression of a cell is rendered automatically, so the shape
# is printed explicitly; df.info() prints its own report, and df.describe()
# is the cell's displayed result.
print(df.shape)
df.info()
df.describe()

In [None]:
# Remove the columns considered irrelevant for the prediction task.
irrelevant_columns = ['EmployeeCount', 'EmployeeNumber', 'StandardHours', 'Over18']
df = df.drop(irrelevant_columns, axis=1)
df.head()

In [None]:
# Count missing values per column (isna is the canonical name of isnull).
df.isna().sum()

In [None]:
# Show the categories of one of the text columns before encoding. A bare
# expression in the middle of a cell is never rendered, so print it.
print(df['EducationField'].unique())

# One-hot encode every categorical column, dropping the first level of each
# to avoid redundant (perfectly collinear) indicator columns.
# dtype=int keeps the indicators as 0/1 integers on every pandas version
# (pandas 2.x would otherwise produce booleans), which matches the 0/1
# comparisons made on 'Attrition_Yes' later in the notebook.
df = pd.get_dummies(df, drop_first=True, dtype=int)
df.head()

In [None]:
# List the column labels of the encoded frame (the target is the 'Attrition_Yes' column).
df.columns

In [None]:
# Feature-only view of the data: everything except the target indicator.
df_2 = df.drop('Attrition_Yes', axis=1)
df_2.head()

In [None]:
# Correlation of each feature with the target, drawn as a bar chart.
attrition_corr = df_2.corrwith(df['Attrition_Yes'])
attrition_corr.plot.bar(figsize=(20, 10), title='Correlation', rot=45, grid=True)

In [None]:
# Pairwise correlation matrix over all columns (features and target).
corr = df.corr()
corr

In [None]:
# Class balance of the target: number of rows per value of 'Attrition_Yes'
# (0 = stayed, 1 = left). A single value_counts() call displays both counts;
# two separate bare expressions would only render the last one.
df['Attrition_Yes'].value_counts()

In [None]:
# Annotated heatmap of the full correlation matrix on a wide canvas.
fig, ax = plt.subplots(figsize=(30, 10))
sns.heatmap(corr, annot=True, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the features from the target, then hold out 20% of the rows for
# testing. The fixed random_state makes the split reproducible.
features = df.drop(columns=['Attrition_Yes'])
target = df['Attrition_Yes']
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

In [None]:
# Report the shape of every split. Bare expressions other than the last one
# are not rendered by the notebook, so each shape is printed explicitly.
for split_name, split in {
    'X_train': X_train,
    'X_test': X_test,
    'Y_train': Y_train,
    'Y_test': Y_test,
}.items():
    print(split_name, split.shape)

In [None]:
# Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and variance from the training split only, then
# apply that same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Building the model
# Method 1. Logistic Regression
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training chain.
clf_1 = LogisticRegression().fit(X_train_scaled, Y_train)
y_pred1 = clf_1.predict(X_test_scaled)

In [None]:
# Test-set metrics for the logistic-regression predictions.
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

acc = accuracy_score(Y_test, y_pred1)
f1 = f1_score(Y_test, y_pred1)
# Precision has to come from precision_score; calling accuracy_score here
# would silently report the accuracy a second time under the wrong name.
precision = precision_score(Y_test, y_pred1)
recall = recall_score(Y_test, y_pred1)
# Confusion matrix (rows: true class, columns: predicted class) is displayed.
cm = confusion_matrix(Y_test, y_pred1)
cm

In [None]:
# One-row summary table of the logistic-regression test-set metrics
# (column header spelled 'Precision').
result_1 = pd.DataFrame(
    [['Logistic Regression', acc, f1, precision, recall]],
    columns=['Model', 'Accuracy', 'f1', 'Precision', 'Recall'],
)
result_1

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the logistic regression on the training split;
# yields one score per fold.
cv_1 = cross_val_score(estimator=clf_1, X=X_train_scaled, y=Y_train, cv=10)
cv_1

In [None]:
# Mean and spread of the fold scores, expressed as percentages.
cv_1_mean_pct = cv_1.mean() * 100
cv_1_std_pct = cv_1.std() * 100
print("Accuracy is", cv_1_mean_pct, "%")
print("Standard deviation is", cv_1_std_pct, "%")

In [None]:
# Method 2. Random Forest Classification
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature subsets).
# Fixing random_state makes the fitted model, its test metrics and its
# cross-validation scores reproducible across kernel restarts, consistent
# with the seeded train/test split.
clf_2 = RandomForestClassifier(random_state=0)
clf_2.fit(X_train_scaled, Y_train)
y_pred2 = clf_2.predict(X_test_scaled)

In [None]:
# Test-set metrics for the random-forest predictions.
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

acc = accuracy_score(Y_test, y_pred2)
f1 = f1_score(Y_test, y_pred2)
# Precision has to come from precision_score; calling accuracy_score here
# would silently report the accuracy a second time under the wrong name.
precision = precision_score(Y_test, y_pred2)
recall = recall_score(Y_test, y_pred2)
# Confusion matrix (rows: true class, columns: predicted class) is displayed.
cm = confusion_matrix(Y_test, y_pred2)
cm

In [None]:
# One-row summary table of the random-forest test-set metrics
# (column header spelled 'Precision').
result_2 = pd.DataFrame(
    [['Random Forest Classification', acc, f1, precision, recall]],
    columns=['Model', 'Accuracy', 'f1', 'Precision', 'Recall'],
)
result_2

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the random forest on the training split;
# yields one score per fold.
cv_2 = cross_val_score(estimator=clf_2, X=X_train_scaled, y=Y_train, cv=10)
cv_2

In [None]:
# Mean and spread of the fold scores, expressed as percentages.
cv_2_mean_pct = cv_2.mean() * 100
cv_2_std_pct = cv_2.std() * 100
print("Accuracy is", cv_2_mean_pct, "%")
print("Standard deviation is", cv_2_std_pct, "%")

In [None]:
# Method 3. XGBoost Classifier
# NOTE(review): XGBRFClassifier is XGBoost's random-forest estimator; the
# gradient-boosted classifier is XGBClassifier. Confirm which one was meant.
from xgboost import XGBRFClassifier

# fit() returns the estimator itself, so construction and training chain.
clf_3 = XGBRFClassifier().fit(X_train_scaled, Y_train)
y_pred3 = clf_3.predict(X_test_scaled)

In [None]:
# Test-set metrics for the XGBoost predictions.
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

acc = accuracy_score(Y_test, y_pred3)
f1 = f1_score(Y_test, y_pred3)
# Precision has to come from precision_score; calling accuracy_score here
# would silently report the accuracy a second time under the wrong name.
precision = precision_score(Y_test, y_pred3)
recall = recall_score(Y_test, y_pred3)
# Confusion matrix (rows: true class, columns: predicted class) is displayed.
cm = confusion_matrix(Y_test, y_pred3)
cm

In [None]:
# One-row summary table of the XGBoost test-set metrics
# (column header spelled 'Precision').
result_3 = pd.DataFrame(
    [['XGBoost Classification', acc, f1, precision, recall]],
    columns=['Model', 'Accuracy', 'f1', 'Precision', 'Recall'],
)
result_3

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the XGBoost model on the training split;
# yields one score per fold.
cv_3 = cross_val_score(estimator=clf_3, X=X_train_scaled, y=Y_train, cv=10)
cv_3

In [None]:
# Mean and spread of the fold scores, expressed as percentages.
cv_3_mean_pct = cv_3.mean() * 100
cv_3_std_pct = cv_3.std() * 100
print("Accuracy is", cv_3_mean_pct, "%")
print("Standard deviation is", cv_3_std_pct, "%")

In [None]:
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search over LogisticRegression.
# NOTE(review): not every penalty/solver pair is supported by
# LogisticRegression (e.g. 'lbfgs' accepts only 'l2' or no penalty), and
# recent scikit-learn releases spell "no penalty" as None rather than
# 'none'. Unsupported samples fail to fit during the search, so confirm
# this grid against the installed scikit-learn version.
parameters = dict(
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    C=[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)

In [None]:
# Randomized search: 10 sampled parameter settings, each scored by ROC AUC
# with 10-fold cross-validation, using all available cores.
# random_state pins which settings are sampled, so best_params_ is
# reproducible across kernel restarts (consistent with the seeded split).
random_search = RandomizedSearchCV(
    estimator=clf_1,
    param_distributions=parameters,
    n_iter=10,
    scoring='roc_auc',
    n_jobs=-1,
    cv=10,
    verbose=3,
    random_state=0,
)
random_search.fit(X_train_scaled, Y_train)
random_search.best_params_

In [None]:
# Building the final model
from sklearn.linear_model import LogisticRegression

# NOTE(review): these hyperparameters are hard-coded, presumably copied from
# an earlier random_search.best_params_ result. Verify they still match.
final_params = {'solver': 'liblinear', 'penalty': 'l2', 'C': 0.25}
clf_final = LogisticRegression(**final_params).fit(X_train_scaled, Y_train)
y_pred_final = clf_final.predict(X_test_scaled)

In [None]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the final model on the training split;
# yields one score per fold.
cv_final = cross_val_score(estimator=clf_final, X=X_train_scaled, y=Y_train, cv=10)
cv_final

In [None]:
# Mean and spread of the fold scores, expressed as percentages.
cv_final_mean_pct = cv_final.mean() * 100
cv_final_std_pct = cv_final.std() * 100
print("Accuracy is", cv_final_mean_pct, "%")
print("Standard deviation is", cv_final_std_pct, "%")

In [None]:
# Predicting a single observation
# The values must be listed in the same order as the training feature
# columns (X_train.columns).
single_obs = [[41, 1102, 1, 2, 2, 94, 3, 2, 4, 5993, 19479, 8, 11, 3, 1, 0, 8, 0, 1, 6, 4, 0, 5,
               0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]]

# Label the row with the training feature names rather than with the
# observation's own values. The scaler was fitted on the X_train DataFrame,
# so it then receives matching column names, and a wrong number of values
# fails here with a clear error.
single_obs_df = pd.DataFrame(single_obs, columns=X_train.columns)

# The classifiers were fitted on the scaled NumPy array, so the observation
# is kept as the scaled array for prediction.
obs = scaler.transform(single_obs_df)

In [None]:
# Predicted attrition class of the single scaled observation, per the final model.
final_prediction = clf_final.predict(X=obs)
final_prediction