# import libraries

In [472]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier



# read in data

In [473]:
df = pd.read_csv('HR_comma_sep.csv')

In [474]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   sales                  14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [475]:
df['Department'] = df['sales']
df = df.drop(columns=['sales'])

In [476]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


data will need to be scaled (use minmax/robust if not normally distributed and standard if normally distributed)

no null values

salary and sales (department) will need to be encoded

left is the predictor


## EDA

In [477]:
fig = px.imshow(df.corr(numeric_only=True), text_auto=True, aspect="auto", color_continuous_scale='agsunset', title='Correlation Matrix',width=800, height=800)
fig.show()

weak correlations: neg corr between left and satisfacton, pos corr between num projects and monthly hors, pos corr between last eval and monthly hours, pos corr between num hours and last eval

In [478]:
fig = px.histogram(df, x="satisfaction_level", marginal='box', title='Satisfaction Level Distribution', color_discrete_sequence=['indigo'])
fig.show()

not normally distributed

most values between 0.44 and 0.82

In [479]:
fig = px.histogram(df, x="last_evaluation", marginal='box', title='Evaluation Rating Distribution', color_discrete_sequence=['lightcoral'])
fig.show()

not normally distributed

most values between 0.56 and 0.87

there are two peaks around 0.55 and 0.87

In [480]:
fig = px.histogram(df, x="average_montly_hours", marginal='box', title='Average Monthly Hours in Office Distribution', color_discrete_sequence=['mediumvioletred'])
fig.show()

not normally distributed

most values between 156 and 245

two peaks around 145 and 255

In [481]:
fig = px.histogram(df, x="time_spend_company", marginal='box', title='Number of Years with Company Distribution', color_discrete_sequence=['peachpuff'])
fig.show()

left skewed

most employees have been with the company 3 years with the range for most employees being 2 and five years

outliers for values greater than 5 years

In [482]:
fig = px.histogram(df, x="number_project", marginal='box', title='Number of Projects Distribution', color_discrete_sequence=['hotpink'])
fig.show()

left skewed

most employees have 4 projects

2 - 7 projects for employees is the range

In [483]:
fig = px.histogram(df, x="promotion_last_5years", marginal='box', title='Number of Promotions in Last 5 years Distribution', color_discrete_sequence=['darkorchid'])
fig.show()

very few employees received a rasie in the the last 5 years

In [484]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'], nbins=7)
fig.show()

2, 6, 7 projects - higher turnover than retention

In [485]:
project_left_counts = df.groupby(['number_project', 'left']).size().reset_index(name='count')
fig = px.histogram(project_left_counts, x='number_project', y='count', color='left', title='Number of Projects by Employee Turnover', barmode='group', color_discrete_sequence=['lightseagreen', 'salmon'])
fig.show()

turnover greater than retention if the number of projects an employee has is between 6-7

between 4-5 projects has less turnover

In [486]:
salary_counts = df.groupby(['salary', 'left']).size().reset_index(name='count')
fig = px.histogram(salary_counts, x='salary', y='count', color='left', title='Salary Level by Employee Turnover', barmode='group', color_discrete_sequence=['mediumslateblue', 'tomato'])
fig.show()

higher turnover when salary low and medium

In [487]:
accident_counts = df.groupby(['Work_accident', 'left']).size().reset_index(name='count')
fig = px.histogram(accident_counts, x='Work_accident', y='count', color='left', title='Work Accident by Employee Turnover', barmode='group', color_discrete_sequence=['indigo', 'coral'])
fig.show()

no work accident but more turnover - i think this is more of a volume thing

## Classification Modeling

encoding

In [488]:
df.select_dtypes('O').nunique()

salary         3
Department    10
dtype: int64

In [489]:
df = pd.get_dummies(df, columns=['salary','Department'], drop_first=True)

In [490]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.38,0.53,2,157,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False
1,0.8,0.86,5,262,6,0,1,0,False,True,False,False,False,False,False,False,True,False,False
2,0.11,0.88,7,272,4,0,1,0,False,True,False,False,False,False,False,False,True,False,False
3,0.72,0.87,5,223,5,0,1,0,True,False,False,False,False,False,False,False,True,False,False
4,0.37,0.52,2,159,3,0,1,0,True,False,False,False,False,False,False,False,True,False,False


train test split

In [491]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['left'], axis=1), df['left'], test_size=0.2)

In [492]:
X_train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
13154,0.91,0.74,3,150,2,0,0,False,True,False,True,False,False,False,False,False,False,False
11317,0.96,0.83,3,177,4,0,0,True,False,False,False,False,False,False,False,False,False,True
9817,0.56,0.59,5,254,4,0,0,False,False,False,False,False,False,False,False,False,True,False
12271,0.11,0.84,6,251,4,0,0,False,True,False,False,False,False,False,False,False,False,True
13784,0.24,0.81,6,263,7,0,0,False,False,False,False,False,True,False,False,False,False,False


In [493]:
X_test.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
7775,0.67,0.94,2,192,3,0,0,True,False,False,False,False,False,False,True,False,False,False
8237,0.59,0.89,5,143,3,1,0,True,False,False,False,False,False,False,False,False,False,False
5110,0.71,0.76,3,201,2,0,0,False,True,True,False,False,False,False,False,False,False,False
4289,0.8,0.41,3,188,4,0,0,False,False,False,False,False,False,False,False,True,False,False
5860,0.9,0.62,5,236,6,0,0,False,True,False,False,False,False,False,False,False,False,True


scaling

In [494]:
mm = MinMaxScaler()

X_train_transformed = pd.DataFrame(mm.fit_transform(X_train), columns=X_train.columns)  

X_test_transformed = pd.DataFrame(mm.transform(X_test), columns=X_test.columns)

In [495]:
X_train_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.9010989011,0.59375,0.2,0.2523364486,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.956043956,0.734375,0.2,0.3785046729,0.25,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.5164835165,0.359375,0.6,0.738317757,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.021978022,0.75,0.8,0.7242990654,0.25,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.1648351648,0.703125,0.8,0.7803738318,0.625,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [496]:
X_test_transformed.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,salary_low,salary_medium,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.6373626374,0.90625,0.0,0.4485981308,0.125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.5494505495,0.828125,0.6,0.2196261682,0.125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.6813186813,0.625,0.2,0.4906542056,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.7802197802,0.078125,0.2,0.4299065421,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.8901098901,0.40625,0.6,0.6542056075,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


smote

In [497]:
y_train.value_counts()

left
0    9151
1    2848
Name: count, dtype: int64

In [498]:
fig = px.histogram(y_train, nbins=2, title='Distribution of (Employee Turnover)', color=y_train.map({0: 'Stayed', 1: 'Left'}), color_discrete_sequence=['teal', 'salmon'])
fig.update_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

In [499]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train_transformed, y_train)

In [500]:
fig = px.histogram(
	y_train_smote, 
	nbins=2, 
	title='Distribution of (Employee Turnover)', 
	color=y_train_smote.map({0: 'Stayed', 1: 'Left'}),
	color_discrete_sequence=['teal', 'salmon']
)
fig.update_xaxes(title='Left (0 = Stayed, 1 = Left)')
fig.update_yaxes(title='Count')
fig.show()

In [501]:
knn = KNeighborsClassifier(n_neighbors=5)

In [502]:
knn.fit(X_train_smote, y_train_smote)

Train Results

In [503]:
y_pred_train = knn.predict(X_train_smote)

In [504]:
y_pred_train

array([0, 0, 0, ..., 1, 1, 1])

In [505]:
print(classification_report(y_train_smote, y_pred_train))

              precision    recall  f1-score   support

           0       0.99      0.95      0.97      9151
           1       0.96      0.99      0.97      9151

    accuracy                           0.97     18302
   macro avg       0.97      0.97      0.97     18302
weighted avg       0.97      0.97      0.97     18302



Test Results

In [506]:
y_pred = knn.predict(X_test_transformed)

In [507]:
y_pred

array([0, 0, 0, ..., 1, 0, 0])

In [508]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96      2277
           1       0.81      0.95      0.88       723

    accuracy                           0.93      3000
   macro avg       0.90      0.94      0.92      3000
weighted avg       0.94      0.93      0.94      3000



stratified k fold

In [509]:
skf = StratifiedKFold(n_splits=5, shuffle=True)    

logistic regression

In [510]:
lr = LogisticRegression()

In [511]:
lr_kfold_scores = cross_val_score(lr, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [512]:
np.mean(np.abs(lr_kfold_scores))

np.float64(0.7714456768508111)

In [513]:
lr.fit(X_train_smote, y_train_smote)
y_pred_lr = lr.predict(X_test_transformed)
print(classification_report(y_test, y_pred_lr))

              precision    recall  f1-score   support

           0       0.92      0.75      0.83      2277
           1       0.50      0.81      0.62       723

    accuracy                           0.76      3000
   macro avg       0.71      0.78      0.72      3000
weighted avg       0.82      0.76      0.78      3000



In [514]:
y_score_lr = lr.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = auc(fpr, tpr)

In [515]:
roc_auc

np.float64(0.8283453939236006)

In [516]:
fig_hist = px.histogram(x=y_score_lr, nbins=50, title='Predicted Probabilities of Employee Turnover (Logistic Regression)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [517]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

note: if you have over a certain amount of points, the lines may not show due to webgl limitations...use render_mode='SVG' to fix

source: https://community.plotly.com/t/plotly-express-line-charts-are-not-shown/39715

In [518]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

confusion matrix

In [519]:
conf_matrix = confusion_matrix(y_test, y_pred_lr, labels=lr.classes_)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[1702  575]
 [ 140  583]]


In [520]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Logistic Regression)', text_auto=True)
fig.show()

random forest

In [521]:
rf = RandomForestClassifier()

In [522]:
rf_kfold_scores = cross_val_score(rf, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [523]:
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test_transformed)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2277
           1       0.99      0.98      0.98       723

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000



In [524]:
y_score_rf = rf.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_rf)
roc_auc = auc(fpr, tpr)

In [525]:
roc_auc

np.float64(0.9961722583948815)

In [526]:
fig_hist = px.histogram(x=y_score_rf, nbins=50, title='Predicted Probabilities of Employee Turnover (Random Forest)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [527]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [528]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

confusion matrix

In [529]:
conf_matrix = confusion_matrix(y_test, y_pred_rf)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[2267   10]
 [  17  706]]


In [530]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Random Forest)', text_auto=True)
fig.show()

gradient boosting

In [531]:
gb = GradientBoostingClassifier()

In [532]:
bg_kfold_scores = cross_val_score(gb, X_train_smote, y_train_smote, cv=skf, scoring='accuracy') 

In [533]:
gb.fit(X_train_smote, y_train_smote)
y_pred_gb = gb.predict(X_test_transformed)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      2277
           1       0.91      0.93      0.92       723

    accuracy                           0.96      3000
   macro avg       0.95      0.95      0.95      3000
weighted avg       0.96      0.96      0.96      3000



In [534]:
y_score_gb = gb.predict_proba(X_test_transformed)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score_lr)
roc_auc = auc(fpr, tpr)

In [535]:
roc_auc

np.float64(0.8283453939236006)

In [536]:
fig_hist = px.histogram(x=y_score_gb, nbins=50, title='Predicted Probabilities of Employee Turnover (Gradient Boost)', color = y_test.map({0: 'Stayed', 1: 'Left'}), labels={'x': 'Predicted Probability', 'color': 'Actual Turnover'})
fig_hist.show()

In [537]:
df = pd.DataFrame({
    'False Positive Rate': fpr,
    'True Positive Rate': tpr
}, index=thresholds)
df.index.name = "Thresholds"
df.columns.name = "Rate"

fig_thresh = px.line(
    df, title='TPR and FPR at every threshold',
    width=700, height=500, render_mode='SVG'
)

fig_thresh.update_yaxes(scaleanchor="x", scaleratio=1)
fig_thresh.update_xaxes(range=[0, 1], constrain='domain')
fig_thresh.show()

In [538]:
fig = px.area(
    x=fpr, y=tpr,
    title=f'ROC Curve (AUC={auc(fpr, tpr):.4f})',
    labels=dict(x='False Positive Rate', y='True Positive Rate'),
    width=700, height=500
)
fig.add_shape(
    type='line', line=dict(dash='dash'),
    x0=0, x1=1, y0=0, y1=1
)

fig.update_yaxes(scaleanchor="x", scaleratio=1)
fig.update_xaxes(constrain='domain')
fig.show()

confusion matrix

In [539]:
conf_matrix = confusion_matrix(y_test, y_pred_gb)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[2214   63]
 [  50  673]]


In [540]:
fig = px.imshow(conf_matrix, labels=dict(x="Predicted Label", y="True Label", color="Count"), x=['Stayed', 'Left'], y=['Stayed', 'Left'], title='Confusion Matrix (Gradient Boost)', text_auto=True)
fig.show()

based on the classification reports, confusion matrix, roc curve, tpr/fpr curve , random forest classifier is the better model

## Clustering Modeling

In [541]:
df = pd.read_csv('HR_comma_sep.csv')
df['Department'] = df['sales']
df = df.drop(columns=['sales'])
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


In [542]:
df_left = df[df['left'] == 1]
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,salary,Department
0,0.38,0.53,2,157,3,0,1,0,low,sales
1,0.8,0.86,5,262,6,0,1,0,medium,sales
2,0.11,0.88,7,272,4,0,1,0,medium,sales
3,0.72,0.87,5,223,5,0,1,0,low,sales
4,0.37,0.52,2,159,3,0,1,0,low,sales


In [543]:
fig = px.scatter(df_left, x='satisfaction_level', y='last_evaluation')
fig.show()

three distinct clusters - 1. low satisfaction and high eval score 2. mid satisfaction and low eval score 3. high satisfaction and high eval score

In [544]:
mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']])

array([[0.34939759, 0.14545455],
       [0.85542169, 0.74545455],
       [0.02409639, 0.78181818],
       ...,
       [0.3373494 , 0.14545455],
       [0.02409639, 0.92727273],
       [0.3373494 , 0.12727273]])

In [545]:
df_left_scaled = pd.DataFrame(mm.fit_transform(df_left[['satisfaction_level', 'last_evaluation']]), columns=['satisfaction_level', 'last_evaluation'])
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation
0,0.3493975904,0.1454545455
1,0.8554216867,0.7454545455
2,0.0240963855,0.7818181818
3,0.7590361446,0.7636363636
4,0.3373493976,0.1272727273


In [546]:
kmeans_3 = KMeans(n_clusters=3)

In [547]:
kmeans_3.fit(df_left_scaled)

In [548]:
kmeans_3.cluster_centers_ # centroids of the 3 clusters - scaled values

array([[0.02616878, 0.76701632],
       [0.38748519, 0.12177894],
       [0.86395634, 0.84770142]])

coordinates of the centroids for each cluster

In [549]:
mm.inverse_transform(kmeans_3.cluster_centers_) # coordinates of the centroids for each cluster - actual values

array([[0.11172009, 0.87185897],
       [0.41161271, 0.51697842],
       [0.80708376, 0.91623578]])

In [550]:
print(kmeans_3.labels_)

[1 2 0 ... 1 0 1]


In [551]:
kmeans_3.inertia_

62.92944601364937

sum of squares - distance between the points and the centers

In [552]:
df_left_scaled['Cluster'] = kmeans_3.labels_
df_left_scaled.head()

Unnamed: 0,satisfaction_level,last_evaluation,Cluster
0,0.3493975904,0.1454545455,1
1,0.8554216867,0.7454545455,2
2,0.0240963855,0.7818181818,0
3,0.7590361446,0.7636363636,2
4,0.3373493976,0.1272727273,1


In [553]:
df_left_scaled['Cluster'].value_counts()

Cluster
1    1668
2     967
0     936
Name: count, dtype: int64

In [554]:
centroids_3 = pd.DataFrame(mm.inverse_transform(kmeans_3.cluster_centers_), columns=['satisfaction_level', 'last_evaluation'])
centroids_3

Unnamed: 0,satisfaction_level,last_evaluation
0,0.1117200855,0.8718589744
1,0.4116127098,0.5169784173
2,0.8070837642,0.9162357808


In [555]:
fig = px.scatter(df_left, x="satisfaction_level", y="last_evaluation", color=df_left_scaled['Cluster'])
fig.add_trace(go.Scatter(x=centroids_3['satisfaction_level'], y=centroids_3['last_evaluation'], mode='markers', marker=dict(color='red', size=10), name='Centroids', text=centroids_3.index))
fig.update_layout(coloraxis_showscale=False)

d = fig.to_dict()
d["data"][0]["type"] = "scatter"

go.Figure(d)

# elbow curve

In [556]:
int_ = {}
cluster_no = range(1, 30)

for i in cluster_no:
    k_ = KMeans(n_clusters=i).fit(df_left_scaled)
    int_['cluster no_' + str(i)] = k_.inertia_

In [557]:
int_

{'cluster no_1': 2724.872318785391,
 'cluster no_2': 1136.5938842787555,
 'cluster no_3': 62.92944601364937,
 'cluster no_4': 53.966222002879334,
 'cluster no_5': 45.34282600919304,
 'cluster no_6': 40.44294997747657,
 'cluster no_7': 38.170404608358766,
 'cluster no_8': 30.163436819316715,
 'cluster no_9': 25.454208975994074,
 'cluster no_10': 24.08140992754954,
 'cluster no_11': 22.84132318053305,
 'cluster no_12': 20.19431059451721,
 'cluster no_13': 17.221902098942913,
 'cluster no_14': 17.087285162033083,
 'cluster no_15': 13.828852648841163,
 'cluster no_16': 13.322742890482749,
 'cluster no_17': 11.728634872869737,
 'cluster no_18': 11.243548265008915,
 'cluster no_19': 10.931796552335063,
 'cluster no_20': 10.173562448690769,
 'cluster no_21': 9.74511504922922,
 'cluster no_22': 8.415412200733982,
 'cluster no_23': 8.465138934946623,
 'cluster no_24': 8.435232834798743,
 'cluster no_25': 7.924449568658378,
 'cluster no_26': 7.172280950601567,
 'cluster no_27': 6.97923549200119,

In [558]:
fig = px.line(x=cluster_no, y=list(int_.values()), title='Elbow Curve', labels={'x':'Number of Clusters', 'y':'Inertia'}, markers=True)
fig.show()

based on the elbow curve, 3 is the best number of clusters (in alignment with the scatterplot)

## Silhouette Score

In [559]:
silhouette_score(df_left_scaled, kmeans_3.fit_predict(df_left_scaled))

np.float64(0.8805525232616056)

used to compare silhouette scores for k means cluster values - this model only has one so no comparison