# import libraries

In [102]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



# read in data

In [2]:
df = pd.read_csv('HR_comma_sep.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [4]:
df['Department'] = df['sales']
df = df.drop(columns=['sales'])

In [5]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


data will need to be scaled (use minmax/robust if not normally distributed and standard if normally distributed)

no null values

salary and sales (department) will need to be encoded

left is the predictor


## EDA

In [6]:
fig = px.imshow(df.corr(numeric_only=True), text_auto=True, aspect="auto", color_continuous_scale='agsunset', title='Correlation Matrix',width=800, height=800)
fig.show()

weak correlations: neg corr between left and satisfacton, pos corr between num projects and monthly hors, pos corr between last eval and monthly hours, pos corr between num hours and last eval

In [7]:
fig = px.histogram(df, x="satisfaction_level", marginal='box', title='Satisfaction Level Distribution', color_discrete_sequence=['indigo'])
fig.show()

not normally distributed

most values between 0.44 and 0.82

In [8]:
fig = px.histogram(df, x="last_evaluation", marginal='box', title='Evaluation Rating Distribution', color_discrete_sequence=['lightcoral'])
fig.show()

not normally distributed

most values between 0.56 and 0.87

there are two peaks around 0.55 and 0.87

In [9]:
fig = px.histogram(df, x="average_montly_hours", marginal='box', title='Average Monthly Hours in Office Distribution', color_discrete_sequence=['mediumvioletred'])
fig.show()

not normally distributed

most values between 156 and 245

two peaks around 145 and 255

In [10]:
fig = px.histogram(df, x="time_spend_company", marginal='box', title='Number of Years with Company Distribution', color_discrete_sequence=['peachpuff'])
fig.show()

left skewed

most employees have been with the company 3 years with the range for most employees being 2 and five years

outliers for values greater than 5 years

In [11]:
fig = px.histogram(df, x="number_project", marginal='box', title='Number of Projects Distribution', color_discrete_sequence=['hotpink'])
fig.show()

left skewed

most employees have 4 projects

2 - 7 projects for employees is the range

In [12]:
fig = px.histogram(df, x="promotion_last_5years", marginal='box', title='Number of Promotions in Last 5 years Distribution', color_discrete_sequence=['darkorchid'])
fig.show()

very few employees received a rasie in the the last 5 years

In [13]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'], nbins=7)
fig.show()

2, 6, 7 projects - higher turnover than retention

In [14]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'])
fig.show()

turnover greater than retention if the number of projects an employee has is between 6-7

between 4-5 projects has less turnover

In [15]:
salary_counts = df.groupby(['salary', 'left']).size().reset_index(name='count')
fig = px.histogram(salary_counts, x='salary', y='count', color='left', title='Salary Level by Employee Turnover', barmode='group', color_discrete_sequence=['mediumslateblue', 'tomato'])
fig.show()

higher turnover when salary low and medium

In [16]:
accident_counts = df.groupby(['Work_accident', 'left']).size().reset_index(name='count')
fig = px.histogram(accident_counts, x='Work_accident', y='count', color='left', title='Work Accident by Employee Turnover', barmode='group', color_discrete_sequence=['indigo', 'coral'])
fig.show()

no work accident but more turnover - i think this is more of a volume thing

## Classification Modeling

encoding

In [17]:
df.select_dtypes('O').nunique()

salary         3
Department    10
dtype: int64

In [18]:
df = pd.get_dummies(df, columns=['salary','Department'], drop_first=True)

In [19]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False
1,0.8,0.86,5,262,6,0,1,0,False,True,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,False,True,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,True,False,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False


train test split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['left'], axis=1), df['left'], test_size=0.2)

In [21]:
X_train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
708,0.11,0.87,6,255,4,0,0,True,False,False,False,False,False,False,False,True,False,False
3976,0.67,0.66,3,237,3,0,0,True,False,False,False,False,False,True,False,False,False,False
2368,0.72,0.88,2,247,3,0,0,False,True,False,False,False,False,False,False,False,False,True
13912,0.53,0.73,3,163,3,1,0,True,False,False,False,False,False,False,False,False,False,False
12696,0.1,0.81,7,291,4,0,0,False,False,False,False,False,False,False,False,True,False,False


In [22]:
X_test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
11917,0.92,0.85,3,151,6,1,0,True,False,False,False,False,False,False,False,True,False,False
12524,0.75,0.99,5,221,5,0,0,True,False,False,False,False,False,False,True,False,False,False
5746,0.55,0.6,4,176,3,0,0,False,True,False,False,False,False,False,False,True,False,False
8145,0.89,0.57,3,252,2,0,0,False,True,False,False,False,False,False,False,False,True,False
1410,0.9,0.98,5,271,5,0,0,True,False,False,False,False,False,False,False,False,False,True


scaling

In [23]:
mm = MinMaxScaler()

X_train_transformed = pd.DataFrame(mm.fit_transform(X_train), columns=X_train.columns)  

X_test_transformed = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

In [24]:
X_train_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.021978,0.796875,0.8,0.742991,0.25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.637363,0.46875,0.2,0.658879,0.125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.692308,0.8125,0.0,0.705607,0.125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.483516,0.578125,0.2,0.313084,0.125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.010989,0.703125,1.0,0.911215,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [25]:
X_test_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.912088,0.765625,0.2,0.257009,0.5,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.725275,0.984375,0.6,0.584112,0.375,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.505495,0.375,0.4,0.373832,0.125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.879121,0.328125,0.2,0.728972,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.89011,0.96875,0.6,0.817757,0.375,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


smote

In [67]:
y_train.value_counts()

left
0    9184
1    2815
Name: count, dtype: int64

In [75]:
fig = px.histogram(y_train, nbins=2, title='Distribution of (Employee Turnover)', color=y_train.map({0: 'Stayed', 1: 'Left'}), color_discrete_sequence=['teal', 'salmon'])
fig.update_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

In [70]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

In [None]:
fig = px.histogram(y_train_smote, nbins=2, title='Distribution of (Employee Turnover)', color=y_train_smote)
fig.updat.map({0: 'Stayed', 1: 'Left'}), color_discrete_sequence=['teal', 'salmon']e_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

In [76]:
knn = KNeighborsClassifier(n_neighbors=5)

In [77]:
knn.fit(X_train_smote, y_train_smote)

Train Results

In [78]:
y_pred_train = knn.predict(X_train_smote)

In [79]:
y_pred_train

array([1, 0, 0, ..., 1, 0, 1])

In [81]:
print(classification_report(y_train_smote, y_pred_train))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      9184
           1       0.95      0.99      0.97      9184

    accuracy                           0.97     18368
   macro avg       0.97      0.97      0.97     18368
weighted avg       0.97      0.97      0.97     18368



Test Results

In [82]:
y_pred = knn.predict(X_test_transformed)

In [83]:
y_pred

array([0, 1, 0, ..., 1, 1, 1])

In [84]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.95      2244
           1       0.82      0.95      0.88       756

    accuracy                           0.93      3000
   macro avg       0.90      0.94      0.92      3000
weighted avg       0.94      0.93      0.93      3000



stratified k fold

In [87]:
skf = StratifiedKFold(n_splits=5, shuffle=True)    

logistic regression

In [88]:
lr = LogisticRegression()

In [89]:
lr_kfold_scores = cross_val_score(lr, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [91]:
np.mean(np.abs(lr_kfold_scores))

np.float64(0.7741717614198625)

In [92]:
lr.fit(X_train_smote, y_train_smote)
y_pred_lr = lr.predict(X_test_transformed)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.92      0.74      0.82      2244
           1       0.52      0.81      0.63       756

    accuracy                           0.76      3000
   macro avg       0.72      0.78      0.73      3000
weighted avg       0.82      0.76      0.77      3000



In [117]:
y_score_lr = lr.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = auc(fpr, tpr)

In [109]:
fig_hist = px.histogram(x=y_score_lr, nbins=50, title='Predicted Probabilities of Employee Turnover (Logistic Regression)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [118]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"


In [119]:
df

Rate,False Positive Rate,True Positive Rate
Thresholds,Unnamed: 1_level_1,Unnamed: 2_level_1
inf,0.000000,0.0
0.992696,0.000446,0.0
0.970201,0.004456,0.0
0.969222,0.005348,0.0
0.953730,0.015597,0.0
...,...,...
0.009979,0.988859,1.0
0.009968,0.989750,1.0
0.008926,0.992424,1.0
0.008516,0.993316,1.0


In [120]:
fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

random forest

In [94]:
rf = RandomForestClassifier()

In [95]:
rf_kfold_scores = cross_val_score(rf, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [98]:
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test_transformed)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2244
           1       0.99      0.96      0.97       756

    accuracy                           0.99      3000
   macro avg       0.99      0.98      0.98      3000
weighted avg       0.99      0.99      0.99      3000



gradient boosting

In [99]:
gb = GradientBoostingClassifier()

In [100]:
bg_kfold_scores = cross_val_score(gb, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [101]:
gb.fit(X_train_smote, y_train_smote)
y_pred_gb = gb.predict(X_test_transformed)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      2244
           1       0.93      0.94      0.93       756

    accuracy                           0.97      3000
   macro avg       0.95      0.96      0.96      3000
weighted avg       0.97      0.97      0.97      3000



based on the classification reports, random forest classifier is the better model

## Clustering Modeling

In [34]:
df_left = df[df['left'] == 1]
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False
1,0.8,0.86,5,262,6,0,1,0,False,True,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,False,True,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,True,False,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False


In [35]:
fig = px.scatter(df_left, x='satisfaction_level', y='last_evaluation')
fig.show()

three distinct clusters - 1. low satisfaction and high eval score 2. mid satisfaction and low eval score 3. high satisfaction and high eval score

In [36]:
mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']])

array([[0.34939759, 0.14545455],
       [0.85542169, 0.74545455],
       [0.02409639, 0.78181818],
       ...,
       [0.3373494 , 0.14545455],
       [0.02409639, 0.92727273],
       [0.3373494 , 0.12727273]])

In [37]:
df_left_scaled = pd.DataFrame(mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']]), columns=['satisfaction_level', 'last_evaluation'])
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation
0,0.349398,0.145455
1,0.855422,0.745455
2,0.024096,0.781818
3,0.759036,0.763636
4,0.337349,0.127273


In [38]:
kmeans_3 = KMeans(n_clusters=3)

In [39]:
kmeans_3.fit(df_left_scaled)

In [40]:
kmeans_3.cluster_centers_ # centroids of the 3 clusters - scaled values

array([[0.38748519, 0.12177894],
       [0.86395634, 0.84770142],
       [0.02616878, 0.76701632]])

coordinates of the centroids for each cluster

In [41]:
mm.inverse_transform(kmeans_3.cluster_centers_) # coordinates of the centroids for each cluster - actual values

array([[0.41161271, 0.51697842],
       [0.80708376, 0.91623578],
       [0.11172009, 0.87185897]])

In [42]:
print(kmeans_3.labels_)

[0 1 2 ... 0 2 0]


In [43]:
kmeans_3.inertia_

62.92944601364937

sum of squares - distance between the points and the centers

In [44]:
df_left_scaled['Cluster'] = kmeans_3.labels_
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation,Cluster
0,0.349398,0.145455,0
1,0.855422,0.745455,1
2,0.024096,0.781818,2
3,0.759036,0.763636,1
4,0.337349,0.127273,0


In [45]:
df_left_scaled['Cluster'].value_counts()

Cluster
0    1668
1     967
2     936
Name: count, dtype: int64

In [46]:
centroids_3 = pd.DataFrame(mm.inverse_transform(kmeans_3.cluster_centers_), columns=['satisfaction_level', 'last_evaluation'])
centroids_3

Unnamed: 0,satisfaction_level,last_evaluation
0,0.411613,0.516978
1,0.807084,0.916236
2,0.11172,0.871859


In [51]:
fig = px.scatter(df_left, x="satisfaction_level", y="last_evaluation", color=df_left_scaled['Cluster'])
fig.add_trace(go.Scatter(x=centroids_3['satisfaction_level'], y=centroids_3['last_evaluation'], mode='markers', marker=dict(color='red', size=10), name='Centroids', text=centroids_3.index))
fig.update_layout(coloraxis_showscale=False)

d = fig.to_dict()
d["data"][0]["type"] = "scatter"

go.Figure(d)

# elbow curve

In [55]:
int_ = {}
cluster_no = range(1, 30)

for i in cluster_no:
    k_ = KMeans(n_clusters=i).fit(df_left_scaled)
    int_['cluster no_' + str(i)] = k_.inertia_

In [56]:
int_

{'cluster no_1': 3276.0927052317647,
 'cluster no_2': 875.4841551714002,
 'cluster no_3': 62.92944601364937,
 'cluster no_4': 52.18234481456534,
 'cluster no_5': 43.225855374512754,
 'cluster no_6': 40.355952185594916,
 'cluster no_7': 33.11803526611932,
 'cluster no_8': 27.10829239730704,
 'cluster no_9': 27.813428333445806,
 'cluster no_10': 24.1781371780258,
 'cluster no_11': 21.19378321496349,
 'cluster no_12': 18.342651809686505,
 'cluster no_13': 17.48771102805622,
 'cluster no_14': 15.767962221754974,
 'cluster no_15': 15.821073528722557,
 'cluster no_16': 12.531339382382075,
 'cluster no_17': 13.384773781803357,
 'cluster no_18': 10.685175000315406,
 'cluster no_19': 10.279732619313357,
 'cluster no_20': 9.608082869905802,
 'cluster no_21': 9.095614007979162,
 'cluster no_22': 9.272392682389789,
 'cluster no_23': 8.236660960911943,
 'cluster no_24': 7.797375514854012,
 'cluster no_25': 7.71937217213171,
 'cluster no_26': 6.968284534561339,
 'cluster no_27': 7.258941900103281,
 

In [None]:
fig = px.line(x=cluster_no, y=list(int_.values()), title='Elbow Curve', labels={'x':'Number of Clusters', 'y':'Inertia'}, markers=True)
fig.show()

based on the elbow curve, 3 is the best number of clusters (in alignment with the scatterplot)

## Silhouette Score

In [59]:
silhouette_score(df_left_scaled, kmeans_3.fit_predict(df_left_scaled))

np.float64(0.884158727982281)

used to compare silhouette scores for k means cluster values - this model only has one so no comparison