In [None]:
#!/usr/bin/env python

coding: utf-8

**Capstone project: Providing data-driven suggestions for HR**

Import packages

For data manipulation

In [None]:
import numpy as np
import pandas as pd


For data visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


For displaying all of the columns in dataframes

In [None]:
# Remove the column cap so wide dataframes are rendered in full.
pd.options.display.max_columns = None


For data modeling

In [None]:
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


For metrics and helpful functions

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.tree import plot_tree


For saving models

In [None]:
import pickle



Load dataset

Load dataset into a dataframe

In [None]:

# Read the raw HR dataset (path is relative to the notebook's working directory).
df0 = pd.read_csv(filepath_or_buffer="HR_capstone_dataset.csv")


Display first few rows of the dataframe

In [None]:

df0.head()



Gather basic information about the data

In [None]:

# info() prints its summary as a side effect; describe() is the last
# expression in the cell, so its table is what the notebook renders as output.
df0.info()
df0.describe()


Display all column names

In [None]:
df0.columns


Rename columns as needed

In [None]:

# Old name -> new name: lower-case everything, fix the "montly" typo,
# and shorten `time_spend_company` to `tenure`.
column_renames = {
    'Work_accident': 'work_accident',
    'average_montly_hours': 'average_monthly_hours',
    'time_spend_company': 'tenure',
    'Department': 'department',
}
df0 = df0.rename(columns=column_renames)

# Show the resulting column index to confirm the renames.
df0.columns



Check missing values

In [None]:

df0.isna().sum()


Check for duplicates

In [None]:
df0.duplicated().sum()


3,008 rows contain duplicates. That is 20% of the data.

Inspect some rows containing duplicates as needed

In [None]:

df0[df0.duplicated()].head()



Drop duplicates and save resulting dataframe in a new variable as needed

In [None]:

# Keep the first occurrence of every row and discard later exact repeats.
df1 = df0.loc[~df0.duplicated(keep='first')]


Display first few rows of new dataframe as needed

In [None]:
df1.head()



Check outliers

Create a boxplot to visualize distribution of `tenure` and detect any outliers

In [None]:
# Open a new 6x6 figure, then set the title and tick-label font sizes on the
# current axes before the boxplot is drawn.
plt.figure(figsize=(6,6))
plt.title('Boxplot to detect outliers for tenure', fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
# Only `x` is supplied, so a single box summarising `tenure` is drawn.
sns.boxplot(x=df1['tenure'])
 


The boxplot above shows that there are outliers in the `tenure` variable.

Determine the number of rows containing outliers

Compute the 25th percentile value in `tenure`

In [None]:
percentile25 = df1['tenure'].quantile(0.25)


Compute the 75th percentile value in `tenure`

In [None]:
percentile75 = df1['tenure'].quantile(0.75)


Compute the interquartile range in `tenure`

In [None]:
iqr = percentile75 - percentile25


Define the upper limit and lower limit for non-outlier values in `tenure`

In [None]:
# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers.
fence_width = 1.5 * iqr
lower_limit = percentile25 - fence_width
upper_limit = percentile75 + fence_width

print("Lower limit:", lower_limit)
print("Upper limit:", upper_limit)


Identify subset of data containing outliers in `tenure`

In [None]:
# Rows whose tenure falls strictly outside the [lower_limit, upper_limit] fences.
outliers = df1.loc[df1['tenure'].gt(upper_limit) | df1['tenure'].lt(lower_limit)]


Count how many rows in the data contain outliers in `tenure`

In [None]:
print("Number of rows in the data containing outliers in `tenure`:", len(outliers))


Get numbers of people who left vs. stayed

In [None]:

print(df1['left'].value_counts())
print()


Get percentages of people who left vs. stayed

In [None]:

print(df1['left'].value_counts(normalize=True))



Create a plot as needed

Set figure and axes

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (22,8))


Create boxplot showing `average_monthly_hours` distributions for `number_project`, comparing employees who stayed versus those who left

In [None]:
# Left panel: one horizontal box per project count, split by `left`.
sns.boxplot(
    data=df1,
    x='average_monthly_hours',
    y='number_project',
    hue='left',
    orient="h",
    ax=ax[0],
)
# Flip the y axis after plotting so the category order is reversed on screen.
ax[0].invert_yaxis()
ax[0].set_title('Monthly hours by number of projects', fontsize='14')


Create histogram showing distribution of `number_project`, comparing employees who stayed versus those who left

In [None]:
# Right panel: side-by-side (dodged) bars of project counts for employees who
# stayed (left == 0) versus left (left == 1).
# No intermediate per-group series are needed: histplot splits on `hue` itself.
sns.histplot(data=df1, x='number_project', hue='left', multiple='dodge', shrink=2, ax=ax[1])
ax[1].set_title('Number of projects histogram', fontsize='14')


Display the plots

In [None]:
plt.show()


Get value counts of stayed/left for employees with 7 projects

In [None]:
df1[df1['number_project']==7]['left'].value_counts()



Create a plot as needed

Create scatterplot of `average_monthly_hours` versus `satisfaction_level`, comparing employees who stayed versus those who left

Create a plot as needed

Set figure and axes

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (22,8))


Create boxplot showing distributions of `satisfaction_level` by tenure, comparing employees who stayed versus those who left

In [None]:
# Left panel: satisfaction distribution for each tenure value, split by `left`.
sns.boxplot(
    data=df1,
    x='satisfaction_level',
    y='tenure',
    hue='left',
    orient="h",
    ax=ax[0],
)
# Flip the y axis after plotting so the tenure order is reversed on screen.
ax[0].invert_yaxis()
ax[0].set_title('Satisfaction by tenure', fontsize='14')


Create histogram showing distribution of `tenure`, comparing employees who stayed versus those who left

In [None]:
# Right panel: side-by-side (dodged) bars of tenure for employees who stayed
# (left == 0) versus left (left == 1).
# No intermediate per-group series are needed: histplot splits on `hue` itself.
sns.histplot(data=df1, x='tenure', hue='left', multiple='dodge', shrink=5, ax=ax[1])
ax[1].set_title('Tenure histogram', fontsize='14')

plt.show()


Calculate mean and median satisfaction scores of employees who left and those who stayed

In [None]:
# Mean and median satisfaction for each value of `left`.
# Aggregations are named by string: passing NumPy callables (np.mean, np.median)
# to `agg` is deprecated in recent pandas and emits a FutureWarning.
df1.groupby(['left'])['satisfaction_level'].agg(['mean', 'median'])


Create a plot as needed

Set figure and axes

In [None]:
fig, ax = plt.subplots(1, 2, figsize = (22,8))


Define short-tenured employees

In [None]:
# Employees with tenure strictly below 7.
tenure_short = df1.loc[df1['tenure'].lt(7)]


Define long-tenured employees

In [None]:
# Employees with tenure strictly above 6.
tenure_long = df1.loc[df1['tenure'].gt(6)]


Plot short-tenured histogram

In [None]:
# Left panel: salary mix per tenure value for the short-tenured subset.
# `hue_order` fixes the bar order to low -> medium -> high.
sns.histplot(
    data=tenure_short,
    x='tenure',
    hue='salary',
    discrete=1,
    hue_order=['low', 'medium', 'high'],
    multiple='dodge',
    shrink=.5,
    ax=ax[0],
)
ax[0].set_title('Salary histogram by tenure: short-tenured people', fontsize='14')


Plot long-tenured histogram

In [None]:
# Right panel: salary mix per tenure value for the long-tenured subset.
# `hue_order` fixes the bar order to low -> medium -> high.
sns.histplot(
    data=tenure_long,
    x='tenure',
    hue='salary',
    discrete=1,
    hue_order=['low', 'medium', 'high'],
    multiple='dodge',
    shrink=.4,
    ax=ax[1],
)
ax[1].set_title('Salary histogram by tenure: long-tenured people', fontsize='14');



The plots above show that long-tenured employees were not disproportionately comprised of higher-paid employees.

Create scatterplot of `average_monthly_hours` versus `last_evaluation`

Create a plot as needed
Create plot to examine relationship between `average_monthly_hours` and `promotion_last_5years`

Display counts for each department

In [None]:
df1["department"].value_counts()


Create a plot as needed
Create a side-by-side (dodged) histogram to compare the department distribution of employees who left with that of employees who stayed

In [None]:
# Side-by-side (dodged) counts per department for employees who stayed (0)
# versus left (1).
plt.figure(figsize=(11,8))
sns.histplot(data=df1, x='department', hue='left', discrete=1, 
             hue_order=[0, 1], multiple='dodge', shrink=.5)
# Rotation must be numeric: matplotlib only accepts a number or the strings
# 'vertical' / 'horizontal', so the string '45' is rejected by current releases.
plt.xticks(rotation=45)
plt.title('Counts of stayed/left by department', fontsize=14);



There doesn't seem to be any department that differs significantly in its proportion of employees who left to those who stayed.

Create a plot as needed

Plot a correlation heatmap

In [None]:
# Pairwise correlations between the numeric columns of the raw dataframe.
# `numeric_only=True` is required because `salary` and `department` hold strings;
# since pandas 2.0, corr() no longer drops non-numeric columns silently and raises.
# NOTE(review): this uses df0 (duplicates included) while the rest of the EDA
# uses df1 -- confirm that is intended.
plt.figure(figsize=(16, 9))
heatmap = sns.heatmap(df0.corr(numeric_only=True), vmin=-1, vmax=1, annot=True,
                      cmap=sns.color_palette("vlag", as_cmap=True))
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':14}, pad=12);



The correlation heatmap confirms that the number of projects, monthly hours, and evaluation scores all have some positive correlation with each other, and whether an employee leaves is negatively correlated with their satisfaction level.

Step 3. Model Building, Step 4. Results and Evaluation
- Fit a model that predicts the outcome variable using two or more independent variables
- Check model assumptions
- Evaluate the model

Identify the type of prediction task.

Modeling Approach : Logistic Regression Model

Copy the dataframe

In [None]:
df_enc = df1.copy()


Encode the `salary` column as an ordinal numeric category

In [None]:
# Map salary labels to integer codes in the listed order:
# low -> 0, medium -> 1, high -> 2 (any other value becomes -1).
df_enc['salary'] = (
    df_enc['salary']
    .astype(pd.CategoricalDtype(categories=['low', 'medium', 'high']))
    .cat.codes
)


Dummy encode the `department` column

In [None]:
# No `columns=` is given, so get_dummies expands every object/string/category
# column it finds; presumably `department` is the only one left after `salary`
# was converted to integer codes -- verify with df_enc.dtypes.
# drop_first=False keeps one indicator column per category.
df_enc = pd.get_dummies(df_enc, drop_first=False)


Display the new dataframe

In [None]:
df_enc.head()



Create a heatmap to visualize how correlated variables are

In [None]:
# Continuous / count features whose pairwise correlation is shown.
heatmap_columns = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_monthly_hours',
    'tenure',
]

plt.figure(figsize=(8, 6))
sns.heatmap(df_enc[heatmap_columns].corr(), annot=True, cmap="crest")
plt.title('Heatmap of the dataset')
plt.show()




Create a stacked bar plot to visualize the number of employees in each department, comparing those who left with those who didn't
In the legend, 0 (purple color) represents employees who did not leave, 1 (red color) represents employees who left

Since logistic regression is quite sensitive to outliers, it would be a good idea at this stage to remove the outliers in the `tenure` column that were identified earlier.

Select rows without outliers in `tenure` and save resulting dataframe in a new variable

In [None]:
# Keep only rows whose tenure lies inside the fences (both ends inclusive).
df_logreg = df_enc[df_enc['tenure'].between(lower_limit, upper_limit)]


Display first few rows of new dataframe

In [None]:
df_logreg.head()



Isolate the outcome variable

In [None]:
y = df_logreg['left']


Display first few rows of the outcome variable

In [None]:
y.head() 



Select the features you want to use in your model

In [None]:
# Feature matrix: every column except the outcome `left`.
X = df_logreg.drop(columns=['left'])


Display the first few rows of the selected features

In [None]:
X.head()




Split the data into training set and testing set

In [None]:
# Hold out 25% for testing; stratify on y so both splits keep the same
# class proportions, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)



Construct a logistic regression model and fit it to the training dataset

In [None]:
# Build the classifier first, then fit it; fit() returns the estimator itself,
# so `log_clf` ends up bound to the fitted model.
log_clf = LogisticRegression(random_state=42, max_iter=500)
log_clf = log_clf.fit(X_train, y_train)




Use the logistic regression model to get predictions on the test set

In [None]:
y_pred = log_clf.predict(X_test)



Create a confusion matrix to visualize the results of the logistic regression model.

Compute values for confusion matrix

Create display of confusion matrix

Plot confusion matrix

In [None]:
# Compute values for confusion matrix (rows = true class, columns = predicted),
# using the classifier's own class order so labels and cells line up.
log_cm = confusion_matrix(y_test, y_pred, labels=log_clf.classes_)

# Create display of confusion matrix. This object was previously never built,
# so the plot call below raised NameError on a fresh kernel.
log_disp = ConfusionMatrixDisplay(confusion_matrix=log_cm,
                                  display_labels=log_clf.classes_)

# Plot confusion matrix; the empty format string is passed through unchanged
# from the original call.
log_disp.plot(values_format='')


Display plot

In [None]:
plt.show()



True negatives: The number of people who did not leave that the model accurately predicted did not leave.

False positives: The number of people who did not leave that the model inaccurately predicted as leaving.

False negatives: The number of people who left that the model inaccurately predicted did not leave.

True positives: The number of people who left that the model accurately predicted as leaving.

In [None]:


df_logreg['left'].value_counts(normalize=True)



There is an approximately 83%-17% split. So the data is not perfectly balanced, but it is not too imbalanced. If it was more severely imbalanced, you might want to resample the data to make it more balanced. In this case, you can use this data without modifying the class balance and continue evaluating the model.

Create classification report for logistic regression model

In [None]:
# Display names for the two classes, in class order (0 then 1).
target_names = [
    'Predicted would not leave',
    'Predicted would leave',
]

# Build the per-class precision / recall / F1 text report, then print it.
log_report = classification_report(y_test, y_pred, target_names=target_names)
print(log_report)



    table = pd.DataFrame({'model': [model_name],
                          'precision': [precision], 
                          'recall': [recall],
                          'f1': [f1],
                          'accuracy': [accuracy],
                          'AUC': [auc]
                         })
  
    return table




For another project, you could try building a K-means model on this data and analyzing the clusters. This may yield valuable insight.

## 📊 Attrition Distribution

## 🔥 Correlation Heatmap

In [None]:

# Correlation heatmap of the de-duplicated data.
# `numeric_only=True` is required because `salary` and `department` hold strings;
# since pandas 2.0, corr() no longer drops non-numeric columns silently and raises.
plt.figure(figsize=(10, 8))
sns.heatmap(df1.corr(numeric_only=True), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Feature Correlation Heatmap')
plt.show()


## 📦 Boxplot of Satisfaction Level by Attrition

## 📈 ROC Curve for Logistic Regression