In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
import numpy as np
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import classification_report
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the HR dataset from a CSV in the working directory and preview the first rows.
df=pd.read_csv('HRAnalytics.csv')
df.head()

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Per-column dtype and non-null count.
df.info()

There are no null values, so imputation is not required.

There are a few object-dtype features, so encoding is required.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Single encoder instance; the cells below refit it for every column they encode.
lab_enc=LabelEncoder()

In [None]:
# Replace the target column with its integer label encoding.
# (The former bare `pd.Series(data)` line built a Series and discarded it.)
df['Attrition']=lab_enc.fit_transform(df['Attrition'])

In [None]:
# Display the target column after encoding.
df['Attrition']

In [None]:
# Label-encode the remaining object-dtype columns in one pass instead of one
# copy-pasted cell per column.
categorical_columns=['BusinessTravel','Department','EducationField','Gender',
                     'JobRole','MaritalStatus','Over18','OverTime']

# lab_enc is refit for every column, so its classes_ only ever describe the last
# column encoded. Keep each column's class order here so the integer codes can
# still be mapped back to the original labels later.
label_classes={}
for column in categorical_columns:
    df[column]=lab_enc.fit_transform(df[column])
    label_classes[column]=list(lab_enc.classes_)

# Preview the encoded columns.
df[categorical_columns].head()

In [None]:
# Preview the frame after the encoding cells above.
df.head()

In [None]:
# Distribution of every column, drawn on a 9x4 grid (at most 36 panels).
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm
# the installed version before relying on this cell.
plt.figure(figsize=(25,25),facecolor='yellow')
for position,column in enumerate(df.columns,start=1):
    if position>36:
        # The grid only has 36 slots; any further columns are not drawn.
        continue
    plt.subplot(9,4,position)
    sns.distplot(df[column])
    plt.xlabel(column,fontsize=20)
plt.tight_layout()

In [None]:
# Target vector and feature matrix (every column except the target).
y=df['Attrition']
x=df.drop(columns=['Attrition'])

In [None]:
# Scatter of each feature against the encoded target on a 9x4 grid.
plt.figure(figsize=(25,25),facecolor='white')
# zip() stops after 36 panels, which is all the grid can hold.
for slot,feature in zip(range(1,37),x.columns):
    plt.subplot(9,4,slot)
    plt.scatter(x[feature],y)
    plt.xlabel(feature,fontsize=20)
    plt.ylabel('Attrition',fontsize=20)
plt.tight_layout()

In [None]:
# Absolute pairwise correlations between all columns, annotated on a heatmap.
df_corr=df.corr().abs()
plt.figure(figsize=(35,35))
sns.heatmap(df_corr,annot=True,annot_kws={'size':10})
# plt.show must be called; the bare name only echoed the function object.
plt.show()

In [None]:
from sklearn.feature_selection import SelectKBest,f_classif

In [None]:
# Score every feature against the target with the ANOVA F-test (k='all' keeps
# them all) and print the 26 highest-scoring ones.
best_features=SelectKBest(score_func=f_classif,k='all')
fit=best_features.fit(x,y)

# One row per feature: its name next to its score.
feature_scores=pd.DataFrame({'Feature_Name':x.columns,'Score':fit.scores_})

print(feature_scores.nlargest(26,'Score'))

In [None]:
# One line plot per feature, stacked vertically: Attrition on the x-axis and the
# feature on the y-axis.
plt.figure(figsize=(25,25),facecolor='white')
plotnumber=1
for column in x:
    if plotnumber<=36:
        ax=plt.subplot(36,1,plotnumber)
        sns.lineplot(x='Attrition',y=column,data=df)
        # Label each axis with what is actually plotted on it; the labels were
        # previously swapped (feature name on x, 'Attrition' on y).
        plt.xlabel('Attrition',fontsize=2)
        plt.ylabel(column,fontsize=2)
    plotnumber+=1
plt.tight_layout()

In [None]:
# Strip plot of each feature (y-axis) grouped by the encoded target (x-axis).
plt.figure(figsize=(25,15))
plotno=1
for column in x:
    if plotno<=35:
        ax=plt.subplot(9,4,plotno)
        # Pass x/y by keyword: newer seaborn releases no longer accept them
        # positionally, while keywords work on old and new versions alike.
        sns.stripplot(x=y,y=x[column])
    plotno+=1
plt.show()

Based on the heatmap and the SelectKBest feature scores, we can determine which features have the strongest relationship with Attrition.

In [None]:
# Target plus the 26 features kept for modelling.
selected_features=[
    'OverTime','TotalWorkingYears','JobLevel','MaritalStatus','YearsInCurrentRole',
    'MonthlyIncome','Age','YearsWithCurrManager','StockOptionLevel','YearsAtCompany',
    'JobInvolvement','JobSatisfaction','EnvironmentSatisfaction','DistanceFromHome',
    'JobRole','Department','WorkLifeBalance','TrainingTimesLastYear','DailyRate',
    'RelationshipSatisfaction','NumCompaniesWorked','YearsSinceLastPromotion',
    'Education','Gender','EducationField','MonthlyRate',
]
y=df['Attrition']
x=df[selected_features]

In [None]:
# Box plot of every selected feature on a 5x6 grid, used to spot outliers.
plt.figure(figsize=(20,20),facecolor='red')
# zip() caps the loop at 26 panels.
for slot,feature in zip(range(1,27),x.columns):
    plt.subplot(5,6,slot)
    sns.boxplot(data=x[feature])
    plt.xlabel(feature,fontsize=12)
plt.show()

Based on the box plots, we can see that TotalWorkingYears, YearsInCurrentRole, MonthlyIncome, YearsWithCurrManager, StockOptionLevel, YearsAtCompany, TrainingTimesLastYear and YearsSinceLastPromotion have outliers that we need to remove.

In [None]:
# Summary statistics for every column.
df.describe()

In [None]:
# Per-column first and third quartiles and their spread (IQR); the outlier
# cells below derive their fences from these.
q1,q3=(df.quantile(level) for level in (0.25,0.75))
iqr=q3-q1

In [None]:
# Drop rows above the upper fence (Q3 + 1.5*IQR) for each column flagged in the
# box plots. This replaces eight copy-pasted cells; the fences still come from
# the q3/iqr computed once on the unfiltered frame, and the filters are applied
# in the same order as before.
upper_fence_columns=['TotalWorkingYears','YearsInCurrentRole','MonthlyIncome',
                     'YearsWithCurrManager','StockOptionLevel','YearsAtCompany',
                     'TrainingTimesLastYear','YearsSinceLastPromotion']
for column in upper_fence_columns:
    upper_fence=q3[column]+1.5*iqr[column]
    # A boolean mask removes the same rows as df.drop(df.index[np.where(...)])
    # without indexing the Index with the tuple that np.where returns.
    df=df[~(df[column]>upper_fence)]
    print(column,df.shape)
df.shape

In [None]:
# Lower fence for TrainingTimesLastYear. It must be measured from the FIRST
# quartile (Q1 - 1.5*IQR); measuring it from q3 put the fence too high and
# discarded valid rows.
TTLY=(q1.TrainingTimesLastYear - (1.5*iqr.TrainingTimesLastYear))
df=df[~(df['TrainingTimesLastYear']<TTLY)]
# reset_index returns a new frame, so assign it; drop=True avoids adding the old
# labels as an extra 'index' column.
df=df.reset_index(drop=True)
df.shape

In [None]:
# Age (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='Age')
plt.show()

When age increases Attrition tends towards NO

In [None]:
# Encoded BusinessTravel (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='BusinessTravel')
plt.show()

We can see that BusinessTravel has a slight inverse effect on Attrition.

In [None]:
# DailyRate (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='DailyRate')
plt.show()

We can see that the attrition rate is higher among employees with a low DailyRate; DailyRate is inversely related to Attrition.

In [None]:
# Encoded Department (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='Department')
plt.show()

HR is least likely to affect Attrition; Sales is more likely to cause Attrition.

In [None]:
# DistanceFromHome (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='DistanceFromHome')
plt.show()

As Office to home distance increases, attrition tends towards NO

In [None]:
# Encoded OverTime (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='OverTime')
plt.show()

In [None]:
# TotalWorkingYears (y) against the encoded Attrition class (x).
sns.lineplot(data=df,x='Attrition',y='TotalWorkingYears')
plt.show()

We can see that the attrition rate is higher among employees with fewer total working years; TotalWorkingYears is inversely related to Attrition.

In [None]:
# One figure per remaining feature, each plotted against the encoded Attrition
# class. This replaces ten copy-pasted cells; the features are drawn in the same
# order as before.
remaining_features=['JobLevel','MaritalStatus','YearsInCurrentRole','MonthlyIncome',
                    'YearsWithCurrManager','StockOptionLevel','YearsAtCompany',
                    'JobInvolvement','JobSatisfaction','EnvironmentSatisfaction']
for feature in remaining_features:
    sns.lineplot(x='Attrition',y=feature,data=df)
    # A title keeps the figures distinguishable now that they share one cell.
    plt.title(f'{feature} vs Attrition')
    plt.show()

Age, business travel, daily rate, total working years, years in current role, monthly income, years with current manager, stock option level, years at company, job involvement, job satisfaction and environment satisfaction are inversely related to Attrition. When these features increase, Attrition tends towards NO.

In [None]:
# x and y were selected from df BEFORE the outlier rows were dropped, so they
# still contain those rows. Rebuild both from the cleaned frame (same feature
# columns) so the outlier removal actually reaches the models.
x=df[x.columns]
y=df['Attrition']

# Split first, then fit the scaler on the training rows only, so the test rows
# do not influence the mean/variance used for scaling.
x_train_raw,x_test_raw,y_train,y_test=train_test_split(x,y,test_size=0.17,random_state=333)
scaler=StandardScaler()
x_train=scaler.fit_transform(x_train_raw)
x_test=scaler.transform(x_test_raw)

# The cross-validation cells further down still expect the full scaled matrix.
x_scaled=scaler.transform(x)

In [None]:
def metric_score(clf,x_train,x_test,y_train,y_test,train=True):
    """Print the accuracy of ``clf`` on either the train or the test split.

    Parameters
    ----------
    clf : object exposing ``predict``.
    x_train, x_test : feature sets passed to ``clf.predict``.
    y_train, y_test : true labels compared against the predictions.
    train : if truthy, report on the train split; if equal to ``False``,
        report on the test split and also print a classification report.

    Returns
    -------
    None. Everything is written to stdout.
    """
    if train:
        train_predictions=clf.predict(x_train)
        print('\n----Train result----')
        train_accuracy=accuracy_score(y_train,train_predictions)*100
        print(f'Accuracy Score:{train_accuracy:.2f}%')
        return

    # Compared with `== False` rather than a plain `else`: a falsy value that
    # does not equal False (e.g. None) prints nothing at all.
    if train==False:
        test_predictions=clf.predict(x_test)
        print('\n----Test result----')
        test_accuracy=accuracy_score(y_test,test_predictions)*100
        print(f'Accuracy Score:{test_accuracy:.2f}%')

        report=classification_report(y_test,test_predictions,digits=2)
        print('\n\n Test Classification report \n',report)

In [None]:
# Baseline: k-nearest-neighbours with default hyper-parameters, fit on the training split.
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)

In [None]:
# Report train accuracy first, then test accuracy with the classification report.
for on_train in (True,False):
    metric_score(knn,x_train,x_test,y_train,y_test,train=on_train)

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with default hyper-parameters, fit on the training split.
log_reg=LogisticRegression()
log_reg.fit(x_train,y_train)

In [None]:
# Report train accuracy first, then test accuracy with the classification report.
for on_train in (True,False):
    metric_score(log_reg,x_train,x_test,y_train,y_test,train=on_train)

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble of 6 KNN (k=9) estimators, each trained on a bootstrap sample
# of 75% of the training rows; oob_score=True requests an out-of-bag estimate.
bag_knn=BaggingClassifier(KNeighborsClassifier(n_neighbors=9),
                          n_estimators=6,max_samples=0.75,
                          bootstrap=True,random_state=366,oob_score=True)

In [None]:
# Fit the bagging ensemble on the training split.
bag_knn.fit(x_train,y_train)

In [None]:
# Report train accuracy first, then test accuracy with the classification report.
for on_train in (True,False):
    metric_score(bag_knn,x_train,x_test,y_train,y_test,train=on_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostRegressor

In [None]:
# rnd_clf was scored without ever being created or fitted, so this cell raised
# NameError on a fresh kernel. Build and fit the random forest before scoring;
# a fixed random_state keeps the reported numbers reproducible.
rnd_clf=RandomForestClassifier(random_state=366)
rnd_clf.fit(x_train,y_train)

metric_score(rnd_clf,x_train,x_test,y_train,y_test,train=True)
metric_score(rnd_clf,x_train,x_test,y_train,y_test,train=False)

Based on the accuracy scores of the models above, the Bagging-KNN model has the smallest gap between its train and test results.

So we will move forward with Bagging - KNN model

In [None]:
from sklearn.model_selection import KFold,cross_val_score

In [None]:
# Per-fold scores from 5-fold cross-validation of the bagging model on the full scaled data.
cross_val_score(bag_knn,x_scaled,y,cv=5)

In [None]:
# Mean 5-fold score. cross_val_score clones the estimator for every fold, so
# passing bag_knn evaluates exactly the configuration defined above without
# repeating its hyper-parameters here (and risking the two copies drifting apart).
cross_val_score(bag_knn,x_scaled,y,cv=5).mean()

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# List the tunable parameter names, including the nested KNN ones, for the grid below.
bag_knn.get_params().keys()

In [None]:
# Bagging-KNN to tune; random_state makes the search reproducible across runs.
bagging = BaggingClassifier(KNeighborsClassifier(),n_estimators = 9, max_features = 12,
                            random_state = 366)

# scikit-learn 1.2 renamed BaggingClassifier's `base_estimator` parameter to
# `estimator`. Pick whichever name the installed version exposes, so the nested
# n_neighbors key is valid on both old and new releases.
inner_name = 'estimator' if 'estimator' in bagging.get_params() else 'base_estimator'

param_grid = {
    f'{inner_name}__n_neighbors' : [1, 2, 3, 4, 5],
    'max_samples' : [0.05, 0.1, 0.2, 0.5]}

grid_search = GridSearchCV(bagging,param_grid)
grid_search.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score in the search.
best_parameters=grid_search.best_params_
print(best_parameters)

In [None]:
# The previous line passed the whole best-params dict as KNeighborsClassifier's
# first argument (n_neighbors) and never fitted the model, so scoring it failed.
# GridSearchCV (refit=True by default) already refit the best configuration on
# x_train; use that fitted estimator directly.
bag_knn=grid_search.best_estimator_

In [None]:
# Report train accuracy first, then test accuracy with the classification report.
for on_train in (True,False):
    metric_score(bag_knn,x_train,x_test,y_train,y_test,train=on_train)

The score did not improve, so we'll use the original Bagging KNeighbors model.

In [None]:
# Rebuild the original (pre-grid-search) bagging configuration and fit it on the
# training split; this is the model that gets saved below.
bag_knn=BaggingClassifier(KNeighborsClassifier(n_neighbors=9),
                          n_estimators=6,max_samples=0.75,
                          bootstrap=True,random_state=366,oob_score=True)

bag_knn.fit(x_train,y_train)

In [None]:
# Report train accuracy first, then test accuracy with the classification report.
for on_train in (True,False):
    metric_score(bag_knn,x_train,x_test,y_train,y_test,train=on_train)

In [None]:
# Persist the fitted model. The context manager closes (and flushes) the file
# even if pickling fails; the bare open() call left the handle dangling.
with open('HR Analytics','wb') as model_file:
    pickle.dump(bag_knn,model_file)