Problem Statement: A large company named XYZ employs, at any given point in time, around 4000 employees. However, every year, around 15% of its employees leave the company and need to be replaced with the talent pool available in the job market. The management believes that this level of attrition (employees leaving, either on their own or because they were fired) is bad for the company, for the following reasons:

- The former employees’ projects get delayed, which makes it difficult to meet timelines, resulting in a reputation loss among consumers and partners.
- A sizeable department has to be maintained for the purpose of recruiting new talent.
- More often than not, the new employees have to be trained for the job and/or given time to acclimatise themselves to the company.
Hence, the management has contracted an HR analytics firm to understand what factors they should focus on, in order to curb attrition. In other words, they want to know what changes they should make to their workplace, in order to get most of their employees to stay. Also, they want to know which of these variables is most important and needs to be addressed right away.

Since you are one of the star analysts at the firm, this project has been given to you.

Goal of the case study: You are required to model the probability of attrition using logistic regression. The results thus obtained will be used by the management to understand what changes they should make to their workplace, in order to get most of their employees to stay.

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the HR general-data CSV into a DataFrame and display it.
df=pd.read_csv("../input/hr-analytics-case-study/general_data.csv")
df

In [None]:
# List the column names of the loaded dataset.
df.columns

In [None]:
# Peek at the first rows.
df.head()

In [None]:
# Peek at the last rows.
df.tail()

In [None]:
# Summary statistics of the dataset.
df.describe()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# text columns at this point (they are label-encoded further down), and
# pandas >= 2.0 raises on those instead of silently skipping them.
df.select_dtypes(include="number").corr()

In [None]:
# Per-column flag: does the column contain any missing value?
print(df.isnull().any())
# Single flag: is there any missing value anywhere in the frame?
print(df.isnull().any().any())

In [None]:
# Visualise where values are missing: one heatmap cell per DataFrame entry,
# coloured by whether that entry is null. Row labels and the colour bar are
# switched off.
missing_mask = df.isnull()
plt.figure(figsize=(50, 50))  # start a new, very large figure for the plot
sns.heatmap(missing_mask, cmap='viridis', yticklabels=False, cbar=False)
plt.show()

In [None]:
# Replace every missing value with 0, modifying df in place.
# NOTE(review): 0 is applied to all columns regardless of their meaning or
# dtype — confirm this is the intended imputation.
df.fillna(0,inplace=True)

In [None]:
# Drop the columns that are not used anywhere in the rest of the analysis.
columns_to_drop = [
    'EmployeeCount',
    'EmployeeID',
    'StandardHours',
    'Over18',
    'NumCompaniesWorked',
    'TotalWorkingYears',
]
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Column names remaining after the drop.
df.columns

In [None]:
#df['NumCompaniesWorked'].value_counts()

In [None]:
# Re-display the current column names.
df.columns

In [None]:
# First rows of the reduced frame.
df.head()

In [None]:
# Last rows of the reduced frame.
df.tail()

In [None]:
#df.drop(["EmployeeCount","EmployeeID","StandardHours"],1,inplace= True)

In [None]:
# Columns examined in the correlation analysis that follows.
corr_columns = [
    'Age', 'Attrition', 'BusinessTravel', 'DistanceFromHome', 'Education',
    'EducationField', 'Gender', 'JobLevel', 'JobRole',
    'MaritalStatus', 'MonthlyIncome',
    'PercentSalaryHike', 'StockOptionLevel',
    'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
df_corr = df[corr_columns]

In [None]:
# Correlate only the numeric columns of the selection. df_corr still contains
# text columns (e.g. Attrition, Gender) that are label-encoded later; pandas
# >= 2.0 raises on them, whereas older versions silently left them out.
corr = df_corr.select_dtypes(include="number").corr()
corr

In [None]:
# Draw the correlation matrix as an annotated heatmap on a wide figure.
f, ax = plt.subplots(figsize=(16, 7))
sns.heatmap(corr, annot=True, ax=ax)
plt.show()

In [None]:
# Share of each Attrition value (rounded to two decimals), then the raw counts
# as a bar chart.
attrition_share = df['Attrition'].value_counts(normalize=True)
print(attrition_share.round(2))
sns.countplot(data=df, x='Attrition')


In [None]:
# Pairwise plots of the selected columns, coloured by Attrition.
pairplot_columns = [
    'Age', 'Attrition', 'BusinessTravel', 'DistanceFromHome',
    'EducationField', 'Gender', 'JobLevel', 'JobRole',
    'MaritalStatus', 'MonthlyIncome',
    'PercentSalaryHike', 'StockOptionLevel',
    'TrainingTimesLastYear', 'YearsAtCompany', 'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]
sns.pairplot(df[pairplot_columns], hue='Attrition')

In [None]:
# Attrition counts split by Gender.
sns.countplot(x = "Attrition",data=df,hue="Gender")
#plt.scatter(df['Attrition'], data=df,hue="Gender")
#sns.catplot(x="Attrition", y=df, data=df)
#sns.boxplot(x="Attrition", y="Gender", data=df)

In [None]:
# Attrition counts split by JobLevel.
sns.countplot(x = "Attrition",data=df,hue="JobLevel")

In [None]:
# Attrition counts split by MaritalStatus.
sns.countplot(x = "Attrition",data=df,hue="MaritalStatus")

In [None]:
# Overall Attrition counts, rendered explicitly with plt.show().
sns.countplot(x = "Attrition",data=df)
plt.show()

In [None]:
# Pairwise plots of age, income and commute distance, coloured by Gender.
# A low alpha keeps overlapping points readable.
gender_palette = {'Male': 'black', 'Female': 'yellow'}
sns.pairplot(
    df[['Age', 'MonthlyIncome', 'DistanceFromHome', 'Gender']],
    hue='Gender',
    hue_order=['Male', 'Female'],
    palette=gender_palette,
    plot_kws={'alpha': 0.1},
    height=4,
)

In [None]:
# Per-column check for any remaining missing values.
df.isnull().any()

In [None]:
# Show the distinct values of each column that is label-encoded next.
for column_name in (
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
):
    print(df[column_name].unique())

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each listed text column with integer codes. One encoder object is
# reused and refitted per column, so after the loop it only holds the classes
# of the last column processed.
labelEncoder_X = LabelEncoder()
for feature_name in (
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
):
    df[feature_name] = labelEncoder_X.fit_transform(df[feature_name])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the Attrition target labels as integers with a dedicated encoder.
label_encoder_y=LabelEncoder()
df['Attrition']=label_encoder_y.fit_transform(df['Attrition'])

In [None]:
# Annotated heatmap of the correlations across all columns of the encoded frame.
encoded_corr = df.corr()
f, ax = plt.subplots(figsize=(16, 7))
sns.heatmap(encoded_corr, annot=True, ax=ax)
plt.show()

In [None]:
# Correlation matrix of the encoded frame, shown as a table.
df.corr()

In [None]:
# Final check that no missing values remain before modelling.
df.isnull().any().any()

In [None]:
# Separate the target (Attrition) from the feature matrix.
target_column = 'Attrition'
x = df.drop(columns=target_column)
y = df[target_column]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train,X_test, y_train, y_test = train_test_split(x,y, test_size = 0.20, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
Scaler_X = StandardScaler()
X_train = Scaler_X.fit_transform(X_train)
X_test = Scaler_X.transform(X_test)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the scaled training data
# and predict labels for the scaled test data.
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)


In [None]:
# Accuracy of the logistic regression on the test split.
print(accuracy_score(y_test,y_pred))

In [None]:
# Confusion matrix of the logistic regression on the test split.
print(confusion_matrix(y_test,y_pred))

In [None]:
# 5-fold cross-validation of a fresh logistic regression on the training
# split. cross_val_score does its own fold splitting, so the held-out test
# split is deliberately not passed in and stays reserved for the final check.
log_score = cross_val_score(estimator=LogisticRegression(), X=X_train, y=y_train, cv=5)

In [None]:
# Plot the per-fold cross-validation scores.
plt.plot(log_score)

In [None]:
# Display the per-fold cross-validation scores.
log_score

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
# Fresh 70/30 split of the unscaled features for the tree-based model. This
# rebinds X_train/X_test/y_train/y_test, replacing the scaled arrays used for
# the logistic regression. random_state is pinned, as in the first split, so
# the results are repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
# Bag ten decision trees. The base tree is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` in scikit-learn 1.2
# and the old name was removed in 1.4; the positional form works with both.
clf = DecisionTreeClassifier(random_state=42)
bag_clf = BaggingClassifier(clf, n_estimators=10)
bag_clf.fit(X_train, y_train)
bag_clf.predict(X_test)

In [None]:
def kpi_metrics(clf, X_train, y_train, X_test, y_test, train=True):
    """Print accuracy, classification report and confusion matrix for ``clf``.

    The already-fitted ``clf`` is scored on one split (training or test),
    then a 10-fold cross-validation is run on that same split and its mean
    and standard deviation are printed.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``; it is also handed to
        ``cross_val_score``.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Test features and labels.
    train
        Truthy: report on the training split. Any falsy value: report on the
        test split.

    Returns
    -------
    pandas.DataFrame
        The per-fold accuracy scores from the cross-validation.
    """
    # Pick the split once so both modes share a single reporting path.
    if train:
        heading, features, labels = "Train Result:\n", X_train, y_train
    else:
        heading, features, labels = "Test Result:\n", X_test, y_test

    # Predict a single time and reuse the result for all three metrics.
    predictions = clf.predict(features)

    print(heading)
    print("accuracy score: {0:.4f}\n".format(accuracy_score(labels, predictions)))
    print("Classification Report: \n {}\n".format(classification_report(labels, predictions)))
    print("Confusion Matrix: \n {}\n".format(confusion_matrix(labels, predictions)))

    # NOTE: cross_val_score fits fresh clones of clf on folds of the split it
    # is given, so with train=False the fold models are trained on test data
    # rather than being the fitted clf evaluated above.
    res = cross_val_score(clf, features, labels, cv=10, scoring='accuracy')
    print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
    print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
    return pd.DataFrame(res)
        

In [None]:
import numpy as np
# Report the bagging classifier's metrics on the training split.
kpi_metrics(bag_clf, X_train, y_train, X_test, y_test, train=True)

In [None]:
# Report the bagging classifier's metrics on the test split.
kpi_metrics(bag_clf, X_train, y_train, X_test, y_test, train=False)