<a href="https://colab.research.google.com/github/Sourav61/Projects/blob/main/Employee%20Retention%20Analysis/Retention_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (uses the Colab-specific google.colab package).
# NOTE(review): the CSV below is read from a relative path, not from the mount — confirm this is needed.
from google.colab import drive
drive.mount('/content/drive')

Author: <a href = "https://github.com/Sourav61">Sourav Pahwa</a>
<br>ID: GO_STP_13420

<b>Q) Predict employee retention within an organization, i.e. whether an employee will leave the company or stay with it. An organization is only as good as its employees, and these people are the true source of its competitive advantage. The dataset is downloaded from Kaggle. Link: <a href="https://www.kaggle.com/giripujar/hr-analytics">https://www.kaggle.com/giripujar/hr-analytics</a></b>

First, explore and visualize the data; then build a logistic regression model to predict employee attrition using machine learning and Python.

In [None]:
# Numerical and tabular data handling
import numpy as np
import pandas as pd
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Missing-value visualisation
import missingno as msno
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
# Load the HR dataset from the working directory into the main frame.
df = pd.read_csv(filepath_or_buffer="HR_comma_sep.csv")

In [None]:
# Preview the first ten rows.
df.head(n=10)

In [None]:
# Preview the last ten rows.
df.tail(n=10)

In [None]:
# Print the frame summary: column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics for every column, not only the numeric ones (include="all").
df.describe(include="all")

In [None]:
# Kurtosis of each numeric column.
# numeric_only=True is needed because the frame also holds non-numeric columns
# (e.g. salary, Department); pandas >= 2.0 raises TypeError on them otherwise.
df.kurt(numeric_only=True)

In [None]:
# Skewness of each numeric column.
# numeric_only=True is needed because the frame also holds non-numeric columns
# (e.g. salary, Department); pandas >= 2.0 raises TypeError on them otherwise.
df.skew(numeric_only=True)

In [None]:
# Column labels of the frame (for a DataFrame, keys() returns the column index).
df.keys()

In [None]:
# Column index of the frame.
df.columns

In [None]:
# Both axis indexes as a list: [row index, column index].
df.axes

In [None]:
# NOTE(review): items() is a lazy iterator over (column name, Series) pairs,
# so this cell displays only the iterator object, not any data.
df.items()

In [None]:
# Box plot with one box per plotted column; tick labels rotated 45 degrees for readability.
df.boxplot(rot=45)
plt.show()

In [None]:
# Grid of histograms on a 15x20 inch figure, with x and y tick labels rotated 45 degrees.
df.hist(figsize=(15,20),xrot=45,yrot=45)
plt.show()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# True if at least one row is flagged as a duplicate of an earlier row.
df.duplicated().any()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
df.duplicated().sum() 

In [None]:
# Boolean mask with the same shape as df: True where a value is missing.
df.isna()

In [None]:
# Per column: does it contain at least one missing value?
df.isna().any()

In [None]:
# Per column: number of missing values.
df.isna().sum()

In [None]:
# Nullity bar chart of the frame.
# frac=1 draws every row in shuffled order without hard-coding the dataset's
# row count; sample(14999) raises ValueError on any frame with fewer rows.
msno.bar(df.sample(frac=1), color="cyan")
plt.show()

In [None]:
# Nullity matrix of the frame.
# frac=1 draws every row in shuffled order without hard-coding the dataset's
# row count; sample(14999) raises ValueError on any frame with fewer rows.
msno.matrix(df.sample(frac=1), color=(1, 0, 1))
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.
# The frame also holds non-numeric columns (e.g. salary, Department), on which
# pandas >= 2.0 raises; select_dtypes works on both old and new pandas.
df.select_dtypes(include="number").corr()

In [None]:
# Annotated correlation heatmap.
# Only numeric columns are correlated: pandas >= 2.0 raises on the frame's
# non-numeric columns (e.g. salary, Department) instead of dropping them.
fig = plt.figure(figsize=(12, 10))
sns.heatmap(df.select_dtypes(include="number").corr(), cmap='inferno', annot=True)
plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn as an un-annotated heatmap
# and then displayed as a table (last expression of the cell).
# Only numeric columns are used: pandas >= 2.0 raises on the frame's
# non-numeric columns (e.g. salary, Department) instead of dropping them.
corr = df.select_dtypes(include="number").corr()
sns.heatmap(
    corr,
    xticklabels=corr.columns.values,
    yticklabels=corr.columns.values,
    cmap='cubehelix_r',
    annot=False,
    fmt=".2g",
)
plt.title('Heatmap of Correlation Matrix', fontsize=20)
corr

In [None]:
# Annotated covariance heatmap.
# Only numeric columns are used: pandas >= 2.0 raises on the frame's
# non-numeric columns (e.g. salary, Department) instead of dropping them.
plt.figure(figsize=(14, 14))
sns.heatmap(
    df.select_dtypes(include="number").cov(),
    annot=True,
    fmt=".2f",
    square=True,
    cmap='rainbow',
)
plt.title("Covariation", fontsize=15)
plt.show()

In [None]:
# Compare mean satisfaction across all employees with that of the leavers (left == 1).
satisfaction_mean = df.loc[:, 'satisfaction_level'].mean()
left_mean = df.loc[df['left'] == 1, 'satisfaction_level'].mean()
print(f'The mean for the employee population is: {satisfaction_mean}')
print(f'The mean for the employees that had left is: {left_mean}')

In [None]:
# Count histograms of the three continuous features, one panel each.
# sns.histplot replaces sns.distplot, which is deprecated and scheduled for
# removal from seaborn; kde=False keeps plain count bars as before.
f, axes = plt.subplots(ncols=3, figsize=(15, 6))

sns.histplot(x=df.satisfaction_level, kde=False, color="m", ax=axes[0])
axes[0].set_title('Employee Satisfaction Measure', fontsize=14)
axes[0].set_ylabel('Employee Count', fontsize=10)

sns.histplot(x=df.last_evaluation, kde=False, color="g", ax=axes[1])
axes[1].set_title('Employee Evaluation Measure', fontsize=14)
axes[1].set_ylabel('Employee Count', fontsize=10)

# 'average_montly_hours' is spelled this way in the dataset's column name.
sns.histplot(x=df.average_montly_hours, kde=False, color="b", ax=axes[2])
axes[2].set_title('Employee Average Monthly Hours Measure', fontsize=14)
axes[2].set_ylabel('Employee Count', fontsize=10)

plt.show()

In [None]:
# Horizontal count plot of salary bands, split by whether the employee left.
f, ax = plt.subplots(figsize=(15, 4))
sns.countplot(data=df, y="salary", hue='left', ax=ax)
ax.set_title('Employee Salary Turnover Distribution');

In [None]:
# Palette used to give each department bar its own colour.
color_types = [
    '#78C850', '#F08030', '#6890F0', '#A8B820', '#A8A878',
    '#A040A0', '#F8D030', '#E0C068', '#EE99AC', '#C03028',
    '#F85888', '#B8A038', '#705898', '#98D8D8', '#7038F8',
]
# Head count per department.
sns.countplot(data=df, x='Department', palette=color_types).set_title('Employee Department Distribution');

In [None]:
# Keep only the columns used as model inputs.
data = df.loc[:, ['satisfaction_level', 'average_montly_hours', 'promotion_last_5years', 'salary']]
data.head(n=10)

In [None]:
# One-hot encode the salary band: one indicator column per distinct value.
df1 = pd.get_dummies(data.loc[:, 'salary'])
df1.head(n=10)

In [None]:
# Last ten rows of the salary indicator columns.
df1.tail(n=10)

In [None]:
# Place the salary indicator columns next to the selected features (column-wise concat).
merge = pd.concat(objs=[data, df1], axis=1)
merge

In [None]:
# Drop the raw 'salary' text column and the 'high' indicator, leaving the other
# salary indicators as the encoded feature.
# The frame is rebuilt from `data` and `df1` rather than mutated in place, so
# re-running this cell cannot fail with KeyError on already-dropped columns.
merge = pd.concat([data, df1], axis='columns').drop(columns=['salary', 'high'])
merge

In [None]:
# Feature matrix: an independent (deep) copy of the prepared frame.
x = merge.copy(deep=True)
x

In [None]:
# Target vector: the 'left' column of the original frame.
y = df.loc[:, 'left']
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=5)
# Shapes of the three printed splits, one per line; the fourth is shown as the cell output.
print(xtrain.shape, xtest.shape, ytrain.shape, sep="\n")
ytest.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier using the newton-cg solver, fitted on the training split.
lm = LogisticRegression(solver='newton-cg')
lm.fit(X=xtrain, y=ytrain)

In [None]:
# Predicted class labels for the held-out rows.
y_pred = lm.predict(X=xtest)
y_pred

In [None]:
# Score of the fitted classifier on the held-out split.
lm.score(X=xtest, y=ytest)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix,plot_confusion_matrix,mean_squared_error, mean_absolute_error ,r2_score,  classification_report

In [None]:
accuracy_score(ytest,y_pred)

In [None]:
confusion_matrix(ytest,y_pred)

In [None]:
plot_confusion_matrix(lm, xtest, ytest,cmap=plt.cm.cubehelix_r)
plt.show()

In [None]:
# Error metrics between the true and predicted labels.
# Arguments follow the scikit-learn signature (y_true, y_pred); the original
# passed them swapped, which gives the same value only because both metrics
# are symmetric.
# NOTE(review): these are regression metrics applied to class labels — confirm
# they are wanted alongside the classification report.
mse = mean_squared_error(ytest, y_pred)
print("The Mean Squared Error is: ", mse)
mae = mean_absolute_error(ytest, y_pred)
print("The Mean Absolute Error is: ", mae)

In [None]:
# R2 score of the predictions, printed with two decimals.
print(f'The R2 Score is: {r2_score(ytest, y_pred):.2f} ')

In [None]:
# Text report of the per-class classification metrics.
print(classification_report(y_true=ytest, y_pred=y_pred))

In [None]:
# True labels (x axis) against predicted labels (y axis): magenta triangles plus a connecting line.
plt.scatter(x=ytest, y=y_pred, c='m', marker="^")
plt.plot(ytest, lm.predict(X=xtest))
plt.show()