In [310]:
# import libraries
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, Normalizer, RobustScaler, MaxAbsScaler
import numpy as np
from sklearn.pipeline import make_pipeline

In [311]:
# Load the dataset
df = pd.read_csv('employee_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,EmployeeID,Name,Age,Gender,Department,JobTitle,Salary,YearsAtCompany,PerformanceRating,EducationLevel
0,0,1,Julie Rodriguez,51,Female,HR,Coordinator,69329.59,27,2,3
1,1,2,Cole Valenzuela,58,Male,IT,Analyst,81490.38,23,3,4
2,2,3,Brian Hampton,26,Female,IT,Consultant,58526.86,13,4,4
3,3,4,Christian Leonard,51,Female,Finance,Manager,61573.58,19,5,2
4,4,5,Nancy Nash,62,Female,Marketing,Engineer,44924.45,20,1,2


In [312]:
# drop non-numeric columns
df_numeric = df.drop(columns=['Unnamed: 0', 'EmployeeID', 'Name', 'Gender', 'Department', 'JobTitle'], axis=1)

In [313]:
n_clusters = len(df['JobTitle'].unique())
n_clusters

6

In [314]:
# fit to kmeans
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
clusters = kmeans.fit_predict(df_numeric)

In [315]:
# Add the cluster labels to the dataframe
df['clusters'] = clusters
df.head()

Unnamed: 0.1,Unnamed: 0,EmployeeID,Name,Age,Gender,Department,JobTitle,Salary,YearsAtCompany,PerformanceRating,EducationLevel,clusters
0,0,1,Julie Rodriguez,51,Female,HR,Coordinator,69329.59,27,2,3,1
1,1,2,Cole Valenzuela,58,Male,IT,Analyst,81490.38,23,3,4,5
2,2,3,Brian Hampton,26,Female,IT,Consultant,58526.86,13,4,4,2
3,3,4,Christian Leonard,51,Female,Finance,Manager,61573.58,19,5,2,1
4,4,5,Nancy Nash,62,Female,Marketing,Engineer,44924.45,20,1,2,0


In [316]:
# Use label and country to create a dataframe
df_dt = pd.DataFrame({'JobTitle': df['JobTitle'], 'clusters': df['clusters']})
df_dt.head()

Unnamed: 0,JobTitle,clusters
0,Coordinator,1
1,Analyst,5
2,Consultant,2
3,Manager,1
4,Engineer,0


In [317]:
# cross tabulate the cluster and country columns
ct = pd.crosstab(df_dt['clusters'], df_dt['JobTitle'])
ct

JobTitle,Analyst,Consultant,Coordinator,Engineer,Executive,Manager
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,324,332,322,341,338,361
1,386,387,367,392,399,428
2,421,410,403,442,423,418
3,155,138,162,153,146,143
4,92,97,91,114,96,99
5,280,277,262,245,288,268


Data Scaling

In [318]:
df_numeric.head()

Unnamed: 0,Age,Salary,YearsAtCompany,PerformanceRating,EducationLevel
0,51,69329.59,27,2,3
1,58,81490.38,23,3,4
2,26,58526.86,13,4,4
3,51,61573.58,19,5,2
4,62,44924.45,20,1,2


Normalizer

In [319]:
# Create a normalizer: normalizer
normalize = Normalizer()

# Create a KMeans model with 10 clusters: kmeans
kmeans = KMeans(n_clusters=n_clusters)

# Make a pipeline chaining normalizer and kmeans: pipeline
pipeline = make_pipeline(normalize, kmeans)

# Fit pipeline to the daily price movements
clusters_norm = pipeline.fit_predict(df_numeric)

In [320]:
# Add the cluster labels to the dataframe
df['cluster_norm'] = clusters_norm
df.head()

Unnamed: 0.1,Unnamed: 0,EmployeeID,Name,Age,Gender,Department,JobTitle,Salary,YearsAtCompany,PerformanceRating,EducationLevel,clusters,cluster_norm
0,0,1,Julie Rodriguez,51,Female,HR,Coordinator,69329.59,27,2,3,1,2
1,1,2,Cole Valenzuela,58,Male,IT,Analyst,81490.38,23,3,4,5,2
2,2,3,Brian Hampton,26,Female,IT,Consultant,58526.86,13,4,4,2,3
3,3,4,Christian Leonard,51,Female,Finance,Manager,61573.58,19,5,2,1,2
4,4,5,Nancy Nash,62,Female,Marketing,Engineer,44924.45,20,1,2,0,5


In [321]:
# Use label and country to create a dataframe
df_dt_norm = pd.DataFrame({'JobTitle': df['JobTitle'], 'cluster_norm': df['cluster_norm']})
df_dt_norm.head()

Unnamed: 0,JobTitle,cluster_norm
0,Coordinator,2
1,Analyst,2
2,Consultant,3
3,Manager,2
4,Engineer,5


In [322]:
# cross tabulate the cluster and country columns
clusters_norm = pd.crosstab(df_dt_norm['cluster_norm'], df_dt_norm['JobTitle'])
clusters_norm

JobTitle,Analyst,Consultant,Coordinator,Engineer,Executive,Manager
cluster_norm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,199,186,171,202,207,209
1,0,0,0,0,1,0
2,609,571,564,654,601,626
3,602,636,617,581,633,630
4,39,30,33,32,32,32
5,209,218,222,218,216,220


Standard Scaler

In [323]:
# Create a scaler: scaler
scaler = StandardScaler()

# Create a KMeans model with 10 clusters: kmeans
kmeans = KMeans(n_clusters=n_clusters)

# Make a pipeline chaining normalizer and kmeans: pipeline
pipeline = make_pipeline(scaler, kmeans)

# Fit pipeline to the daily price movements
clusters_scaled = pipeline.fit_predict(df_numeric)

In [324]:
# Add the cluster labels to the dataframe
df['cluster_scaled'] = clusters_scaled
df.head()

Unnamed: 0.1,Unnamed: 0,EmployeeID,Name,Age,Gender,Department,JobTitle,Salary,YearsAtCompany,PerformanceRating,EducationLevel,clusters,cluster_norm,cluster_scaled
0,0,1,Julie Rodriguez,51,Female,HR,Coordinator,69329.59,27,2,3,1,2,0
1,1,2,Cole Valenzuela,58,Male,IT,Analyst,81490.38,23,3,4,5,2,0
2,2,3,Brian Hampton,26,Female,IT,Consultant,58526.86,13,4,4,2,3,5
3,3,4,Christian Leonard,51,Female,Finance,Manager,61573.58,19,5,2,1,2,3
4,4,5,Nancy Nash,62,Female,Marketing,Engineer,44924.45,20,1,2,0,5,2


In [325]:
# Use label and country to create a dataframe
df_dt_scaled = pd.DataFrame({'JobTitle': df['JobTitle'], 'cluster_scaled': df['cluster_scaled']})
df_dt_scaled.head()

Unnamed: 0,JobTitle,cluster_scaled
0,Coordinator,0
1,Analyst,0
2,Consultant,5
3,Manager,3
4,Engineer,2


In [326]:
# cross tabulate the cluster and country columns
clusters_scaled = pd.crosstab(df_dt_scaled['cluster_scaled'], df_dt_scaled['JobTitle'])
clusters_scaled

JobTitle,Analyst,Consultant,Coordinator,Engineer,Executive,Manager
cluster_scaled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,302,272,262,318,298,310
1,303,281,311,323,321,300
2,263,291,256,264,267,292
3,269,287,252,262,270,267
4,264,271,258,264,253,295
5,257,239,268,256,281,253


Robust Scaler

In [328]:
# Create a scaler: scaler
robust_scaler = RobustScaler()

# Create a KMeans model with 10 clusters: kmeans
kmeans = KMeans(n_clusters=n_clusters)

# Make a pipeline chaining normalizer and kmeans: pipeline
pipeline = make_pipeline(robust_scaler, kmeans)

# Fit pipeline to the daily price movements
clusters_rscaled = pipeline.fit_predict(df_numeric)

# Add the cluster labels to the dataframe
df['cluster_rscaled'] = clusters_rscaled

# Use label and country to create a dataframe
df_dt_rscaled = pd.DataFrame({'JobTitle': df['JobTitle'], 'cluster_rscaled': df['cluster_rscaled']})
print(df_dt_rscaled.head())
# cross tabulate the cluster and country columns
clusters_rscaled = pd.crosstab(df_dt_rscaled['cluster_rscaled'], df_dt_rscaled['JobTitle'])
clusters_rscaled

      JobTitle  cluster_rscaled
0  Coordinator                1
1      Analyst                2
2   Consultant                2
3      Manager                4
4     Engineer                5


JobTitle,Analyst,Consultant,Coordinator,Engineer,Executive,Manager
cluster_rscaled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,250,254,231,258,254,251
1,270,309,289,289,310,324
2,289,279,257,255,274,273
3,263,224,261,282,273,263
4,310,287,302,318,293,316
5,276,288,267,285,286,290
