# IMPORT LIBRARIES

In [14]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



# READ IN DATA

In [15]:
df = pd.read_csv('HR_comma_sep.csv')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [17]:
df['Department'] = df['sales']
df = df.drop(columns=['sales'])

In [18]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


data will need to be scaled (use minmax/robust if not normally distributed and standard if normally distributed)

no null values

salary and sales (department) will need to be encoded

left is the predictor


# EDA

In [19]:
fig = px.imshow(df.corr(numeric_only=True), text_auto=True, aspect="auto", color_continuous_scale='agsunset', title='Correlation Matrix',width=800, height=800)
fig.show()

weak correlations: neg corr between left and satisfacton, pos corr between num projects and monthly hors, pos corr between last eval and monthly hours, pos corr between num hours and last eval

In [20]:
fig = px.histogram(df, x="satisfaction_level", marginal='box', title='Satisfaction Level Distribution', color_discrete_sequence=['indigo'])
fig.show()

not normally distributed

most values between 0.44 and 0.82

In [21]:
fig = px.histogram(df, x="last_evaluation", marginal='box', title='Evaluation Rating Distribution', color_discrete_sequence=['lightcoral'])
fig.show()

not normally distributed

most values between 0.56 and 0.87

there are two peaks around 0.55 and 0.87

In [22]:
fig = px.histogram(df, x="average_montly_hours", marginal='box', title='Average Monthly Hours in Office Distribution', color_discrete_sequence=['mediumvioletred'])
fig.show()

not normally distributed

most values between 156 and 245

two peaks around 145 and 255

In [23]:
fig = px.histogram(df, x="time_spend_company", marginal='box', title='Number of Years with Company Distribution', color_discrete_sequence=['peachpuff'])
fig.show()

left skewed

most employees have been with the company 3 years with the range for most employees being 2 and five years

outliers for values greater than 5 years

In [24]:
fig = px.histogram(df, x="number_project", marginal='box', title='Number of Projects Distribution', color_discrete_sequence=['hotpink'])
fig.show()

left skewed

most employees have 4 projects

2 - 7 projects for employees is the range

In [25]:
fig = px.histogram(df, x="promotion_last_5years", marginal='box', title='Number of Promotions in Last 5 years Distribution', color_discrete_sequence=['darkorchid'])
fig.show()

very few employees received a rasie in the the last 5 years

In [26]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'], nbins=7)
fig.show()

2, 6, 7 projects - higher turnover than retention

In [27]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'])
fig.show()

turnover greater than retention if the number of projects an employee has is between 6-7

between 4-5 projects has less turnover

In [28]:
salary_counts = df.groupby(['salary', 'left']).size().reset_index(name='count')
fig = px.histogram(salary_counts, x='salary', y='count', color='left', title='Salary Level by Employee Turnover', barmode='group', color_discrete_sequence=['mediumslateblue', 'tomato'])
fig.show()

higher turnover when salary low and medium

In [29]:
accident_counts = df.groupby(['Work_accident', 'left']).size().reset_index(name='count')
fig = px.histogram(accident_counts, x='Work_accident', y='count', color='left', title='Work Accident by Employee Turnover', barmode='group', color_discrete_sequence=['indigo', 'coral'])
fig.show()

no work accident but more turnover - i think this is more of a volume thing

# CLASSIFICATION MODEL

encoding

In [30]:
df.select_dtypes('O').nunique()

salary         3
Department    10
dtype: int64

In [31]:
df = pd.get_dummies(df, columns=['salary','Department'], drop_first=True)

In [32]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False
1,0.8,0.86,5,262,6,0,1,0,False,True,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,False,True,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,True,False,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False


train test split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['left'], axis=1), df['left'], test_size=0.2)

In [34]:
X_train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
10985,0.59,0.65,5,265,3,0,0,False,True,False,False,False,False,False,False,False,False,False
1661,0.36,0.56,2,140,3,0,0,False,True,False,False,False,False,False,False,False,False,True
8217,0.87,0.74,4,178,2,1,0,False,True,False,False,False,False,False,False,False,True,False
8802,0.77,0.59,4,153,3,0,0,False,True,False,False,False,False,False,False,True,False,False
1308,0.09,0.94,6,266,4,0,0,False,True,False,False,False,False,False,False,True,False,False


In [35]:
X_test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
46,0.4,0.55,2,147,3,0,0,True,False,False,False,False,False,False,False,False,True,False
11929,0.82,0.76,3,219,8,1,0,True,False,False,False,False,False,False,False,False,False,False
7002,0.68,0.75,5,243,3,1,0,True,False,False,False,False,False,False,False,False,True,False
12850,0.75,0.83,4,133,4,0,0,False,False,False,False,True,False,False,False,False,False,False
116,0.86,0.68,5,263,2,0,0,False,True,False,False,False,False,False,False,False,False,True


scaling

In [36]:
mm = MinMaxScaler()

X_train_transformed = pd.DataFrame(mm.fit_transform(X_train), columns=X_train.columns)  

X_test_transformed = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

In [37]:
X_train_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.549451,0.453125,0.6,0.78972,0.125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.296703,0.3125,0.0,0.205607,0.125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.857143,0.59375,0.4,0.383178,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.747253,0.359375,0.4,0.266355,0.125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.90625,0.8,0.794393,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
X_test_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.340659,0.296875,0.0,0.238318,0.125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.802198,0.625,0.2,0.574766,0.75,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.648352,0.609375,0.6,0.686916,0.125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.725275,0.734375,0.4,0.172897,0.25,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.846154,0.5,0.6,0.780374,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


smote

In [39]:
y_train.value_counts()

left
0    9149
1    2850
Name: count, dtype: int64

In [40]:
fig = px.histogram(y_train, nbins=2, title='Distribution of (Employee Turnover)', color=y_train.map({0: 'Stayed', 1: 'Left'}), color_discrete_sequence=['teal', 'salmon'])
fig.update_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

In [41]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

In [42]:
fig = px.histogram(
	y_train_smote, 
	nbins=2, 
	title='Distribution of (Employee Turnover)', 
	color=y_train_smote.map({0: 'Stayed', 1: 'Left'}),
	color_discrete_sequence=['teal', 'salmon']
)
fig.update_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

### KNN

In [43]:
knn = KNeighborsClassifier(n_neighbors=5)

In [44]:
knn.fit(X_train_smote, y_train_smote)

Train Results

In [45]:
y_pred_train = knn.predict(X_train_smote)

In [46]:
y_pred_train

array([0, 1, 0, ..., 1, 1, 1])

In [47]:
print(classification_report(y_train_smote, y_pred_train))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      9149
           1       0.95      0.99      0.97      9149

    accuracy                           0.97     18298
   macro avg       0.97      0.97      0.97     18298
weighted avg       0.97      0.97      0.97     18298



Test Results

In [48]:
y_pred = knn.predict(X_test_transformed)

In [49]:
y_pred

array([1, 1, 0, ..., 1, 1, 0])

In [50]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.93      0.96      2279
           1       0.81      0.96      0.88       721

    accuracy                           0.94      3000
   macro avg       0.90      0.95      0.92      3000
weighted avg       0.95      0.94      0.94      3000



stratified k fold

In [51]:
skf = StratifiedKFold(n_splits=5, shuffle=True)    

### LOGISITIC REGRESSION

In [52]:
lr = LogisticRegression()

In [53]:
lr_kfold_scores = cross_val_score(lr, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [54]:
np.mean(np.abs(lr_kfold_scores))

np.float64(0.7801948336088722)

In [55]:
lr.fit(X_train_smote, y_train_smote)
y_pred_lr = lr.predict(X_test_transformed)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83      2279
           1       0.50      0.79      0.62       721

    accuracy                           0.76      3000
   macro avg       0.71      0.77      0.72      3000
weighted avg       0.82      0.76      0.78      3000



In [56]:
y_score_lr = lr.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = auc(fpr, tpr)

In [57]:
roc_auc

np.float64(0.8286915630197687)

In [58]:
fig_hist = px.histogram(x=y_score_lr, nbins=50, title='Predicted Probabilities of Employee Turnover (Logistic Regression)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [59]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

note: if you have over a certain amount of points, the lines may not show due to webgl limitations...use render_mode='SVG' to fix

source: https://community.plotly.com/t/plotly-express-line-charts-are-not-shown/39715

In [60]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

confusion matrix

In [61]:
conf_matrix = confusion_matrix(y_test, y_pred_lr, labels=lr.classes_)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1710  569]
 [ 148  573]]


In [62]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Logistic Regression)', text_auto=True)
fig.show()

### RANDOM FOREST

In [63]:
rf = RandomForestClassifier()

In [64]:
rf_kfold_scores = cross_val_score(rf, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [65]:
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test_transformed)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2279
           1       0.99      0.98      0.98       721

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000



In [66]:
y_score_rf = rf.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)

In [67]:
y_score_rf

array([1.  , 0.08, 0.01, ..., 0.99, 1.  , 0.  ])

In [68]:
roc_auc

np.float64(0.9924556905326873)

In [69]:
fig_hist = px.histogram(x=y_score_rf, nbins=50, title='Predicted Probabilities of Employee Turnover (Random Forest)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [70]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [71]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

random forest probability predictions

In [72]:
y_pred_prob_rf = rf.predict_proba(X_train_smote)

In [73]:
class_order = rf.classes_
print('Class order:', class_order)

Class order: [0 1]


In [74]:
rf_results_df = pd.DataFrame({
    'Actual Label': y_test,
    'Predicted Label': y_pred_rf,
    'Predicted Probability': y_score_rf
})

In [75]:
rf_results_df.sample(15)

Unnamed: 0,Actual Label,Predicted Label,Predicted Probability
489,1,1,0.9
81,1,1,1.0
6692,0,0,0.03
8659,0,0,0.0
12783,0,0,0.0
12536,1,1,1.0
14393,1,1,1.0
11459,0,0,0.01
12991,0,0,0.0
13571,0,0,0.01


In [76]:
rf_results_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 46 to 3312
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Actual Label           3000 non-null   int64  
 1   Predicted Label        3000 non-null   int64  
 2   Predicted Probability  3000 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 93.8 KB


confusion matrix

In [77]:
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[2269   10]
 [  16  705]]


In [78]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Random Forest)', text_auto=True)
fig.show()

employee zones

In [79]:
def risk_zone(prob):
    if prob < 0.2:
        return "Safe Zone"
    elif 0.2 <= prob < 0.6:
        return "Low-Risk Zone"
    elif 0.6 <= prob < 0.9:
        return "Medium-Risk Zone"
    elif prob >= 0.9:
        return "High-Risk Zone"
    else:
        return "n/a"

rf_results_df['Risk Zone'] = rf_results_df['Predicted Probability'].apply(risk_zone)
rf_results_df[['Predicted Probability', 'Risk Zone']].head()

Unnamed: 0,Predicted Probability,Risk Zone
46,1.0,High-Risk Zone
11929,0.08,Safe Zone
7002,0.01,Safe Zone
12850,0.04,Safe Zone
116,0.99,High-Risk Zone


In [80]:
rf_results_df['Risk Zone'].value_counts()

Risk Zone
Safe Zone           2197
High-Risk Zone       676
Low-Risk Zone         99
Medium-Risk Zone      28
Name: count, dtype: int64

In [81]:
zone_counts = rf_results_df.groupby(['Risk Zone']).size().reset_index(name='count')
fig = px.bar(zone_counts, x='Risk Zone', y='count', title='Zone Distribution by Employee Turnover', color_discrete_map={'Safe Zone': 'green', 'Low-Risk Zone': 'yellow', 'Medium-Risk Zone': 'orange', 'High-Risk Zone': 'red'}, color='Risk Zone')
fig.update_xaxes(categoryorder='total descending')
fig.show()

#### **Suggested Retention Strategies**

Safe Zone - since there is a high proportion within the safe zone, keep implementing what is working. 

High-Risk Zone - has the second highest proportion. Review work load, hours spent, salary, last promotion, and last evaluation rating. how do these factors align with the safe zone employees? do some of the strategies implemented with the safe zone need to be implemented with the high risk zone? 

2, 6, 7 projects had higher turnover, if these high risk employees fall into these categories, delegation of tasks/assistance may be needed. High peformers may feel burnout, and lower performers may need coaching (specifically low number of projects and low performance). High performers, with higher projects but lower salary may feel undervalued. The ability to provide promotion may need to be evaluated. Low salary, high amount of projects and hours in office, high performance rating, and no promotion will surely push employees to leave. 

Low-Risk Zone and Medium-Risk Zones - monitor these employees and their satisfaction...this is a place of opportunity to try retention strategies to move them into the Safe Zone. Make a conscious effort to monitor workload, review satisfaction with employees on a regular basis through planned meetings, be proactive with coaching and learning opportunities, provide more flexible work options (remote, hybrid, etc.). 

### GRADIENT BOOSTING

In [82]:
gb = GradientBoostingClassifier()

In [83]:
bg_kfold_scores = cross_val_score(gb, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [84]:
gb.fit(X_train_smote, y_train_smote)
y_pred_gb = gb.predict(X_test_transformed)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.99      0.97      0.98      2279
           1       0.92      0.96      0.94       721

    accuracy                           0.97      3000
   macro avg       0.96      0.97      0.96      3000
weighted avg       0.97      0.97      0.97      3000



In [85]:
y_score_gb = gb.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = auc(fpr, tpr)

In [86]:
roc_auc

np.float64(0.8286915630197687)

In [87]:
fig_hist = px.histogram(x=y_score_gb, nbins=50, title='Predicted Probabilities of Employee Turnover (Gradient Boost)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [88]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [89]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

confusion matrix

In [90]:
conf_matrix = confusion_matrix(y_test, y_pred_gb)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[2222   57]
 [  28  693]]


In [91]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Gradient Boost)', text_auto=True)
fig.show()

## Based on the classification reports, confusion matrix, roc curve, tpr/fpr curve , random forest classifier is the better model.

## Recall is the best metric to use because not being able to recoginze when an employee is about to leave can be costly.


# CLUSTERING MODEL

In [92]:
df = pd.read_csv('HR_comma_sep.csv')
df['Department'] = df['sales']
df = df.drop(columns=['sales'])
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


In [93]:
df_left = df[df['left'] == 1]
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


In [94]:
fig = px.scatter(df_left, x='satisfaction_level', y='last_evaluation')
fig.show()

three distinct clusters - 1. low satisfaction and high eval score 2. mid satisfaction and low eval score 3. high satisfaction and high eval score

In [95]:
mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']])

array([[0.34939759, 0.14545455],
       [0.85542169, 0.74545455],
       [0.02409639, 0.78181818],
       ...,
       [0.3373494 , 0.14545455],
       [0.02409639, 0.92727273],
       [0.3373494 , 0.12727273]])

In [96]:
df_left_scaled = pd.DataFrame(mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']]), columns=['satisfaction_level', 'last_evaluation'])
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation
0,0.349398,0.145455
1,0.855422,0.745455
2,0.024096,0.781818
3,0.759036,0.763636
4,0.337349,0.127273


In [97]:
kmeans_3 = KMeans(n_clusters=3)

In [98]:
kmeans_3.fit(df_left_scaled)

In [99]:
kmeans_3.cluster_centers_ # centroids of the 3 clusters - scaled values

array([[0.38748519, 0.12177894],
       [0.86395634, 0.84770142],
       [0.02616878, 0.76701632]])

coordinates of the centroids for each cluster

In [100]:
mm.inverse_transform(kmeans_3.cluster_centers_) # coordinates of the centroids for each cluster - actual values

array([[0.41161271, 0.51697842],
       [0.80708376, 0.91623578],
       [0.11172009, 0.87185897]])

In [101]:
print(kmeans_3.labels_)

[0 1 2 ... 0 2 0]


In [102]:
kmeans_3.inertia_

62.92944601364937

sum of squares - distance between the points and the centers

In [103]:
df_left_scaled['Cluster'] = kmeans_3.labels_
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation,Cluster
0,0.349398,0.145455,0
1,0.855422,0.745455,1
2,0.024096,0.781818,2
3,0.759036,0.763636,1
4,0.337349,0.127273,0


In [104]:
df_left_scaled['Cluster'].value_counts()

Cluster
0    1668
1     967
2     936
Name: count, dtype: int64

In [105]:
centroids_3 = pd.DataFrame(mm.inverse_transform(kmeans_3.cluster_centers_), columns=['satisfaction_level', 'last_evaluation'])
centroids_3

Unnamed: 0,satisfaction_level,last_evaluation
0,0.411613,0.516978
1,0.807084,0.916236
2,0.11172,0.871859


In [106]:
fig = px.scatter(df_left, x="satisfaction_level", y="last_evaluation", color=df_left_scaled['Cluster'])
fig.add_trace(go.Scatter(x=centroids_3['satisfaction_level'], y=centroids_3['last_evaluation'], mode='markers', marker=dict(color='red', size=10), name='Centroids', text=centroids_3.index))
fig.update_layout(coloraxis_showscale=False)

d = fig.to_dict()
d["data"][0]["type"] = "scatter"

go.Figure(d)

### ELBOW CURVE

In [107]:
int_ = {}
cluster_no = range(1, 30)

for i in cluster_no:
    k_ = KMeans(n_clusters=i).fit(df_left_scaled)
    int_['cluster no_' + str(i)] = k_.inertia_

In [108]:
int_

{'cluster no_1': 3276.0927052317647,
 'cluster no_2': 875.4841551714003,
 'cluster no_3': 62.92944601364937,
 'cluster no_4': 52.18234481456534,
 'cluster no_5': 50.11288939264506,
 'cluster no_6': 42.84230920466822,
 'cluster no_7': 30.502850027931743,
 'cluster no_8': 29.244543456439118,
 'cluster no_9': 25.124597848668106,
 'cluster no_10': 23.645764172469676,
 'cluster no_11': 20.88578800644524,
 'cluster no_12': 19.293574019596107,
 'cluster no_13': 18.43887035118679,
 'cluster no_14': 15.912314264973567,
 'cluster no_15': 13.743517983123269,
 'cluster no_16': 14.049913919007144,
 'cluster no_17': 11.515770404771391,
 'cluster no_18': 11.538510979585723,
 'cluster no_19': 10.528292479015896,
 'cluster no_20': 9.666631951991913,
 'cluster no_21': 9.566537556188932,
 'cluster no_22': 9.054894564735632,
 'cluster no_23': 8.667627703233839,
 'cluster no_24': 7.8204112218197235,
 'cluster no_25': 7.66767239109044,
 'cluster no_26': 7.489070886258906,
 'cluster no_27': 7.027184561780684

In [109]:
fig = px.line(x=cluster_no, y=list(int_.values()), title='Elbow Curve', labels={'x':'Number of Clusters', 'y':'Inertia'}, markers=True)
fig.show()

based on the elbow curve, 3 is the best number of clusters (in alignment with the scatterplot)

### SILHOUETTE SCORE

In [110]:
silhouette_score(df_left_scaled, kmeans_3.fit_predict(df_left_scaled))

np.float64(0.884158727982281)

used to compare silhouette scores for k means cluster values - this model only has one so no comparison