## 1. Load Dataset and Fit Data for Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load the cleaned dataset from CSV
df = pd.read_csv('../MA_BDO_cleaned_updated.csv')

# Separate the target column 'Austritt' (y) from the feature columns (X)
y = df['Austritt']
X = df.drop(columns='Austritt')

X

## Room for experimenting with encoding the categorical variables as numeric ones:
- For ordinal data (e.g. school grades 1, 2, 3, 4, 5), label encoding should be used
- For nominal data (e.g. red, yellow, blue), one-hot encoding needs to be used

### Exploring how many different values each categorical variable has:

In [None]:
# Distinct values of PER_FIRMENNR, followed by how many there are
firmennr_values = X['PER_FIRMENNR'].unique()
print(firmennr_values)
print(len(firmennr_values))

In [None]:
# Distinct values of PER_LANDNR, followed by how many there are
landnr_values = X['PER_LANDNR'].unique()
print(landnr_values)
print(len(landnr_values))

In [None]:
# Distinct values of PER_KOSTENSTELLE, followed by how many there are
kostenstelle_values = X['PER_KOSTENSTELLE'].unique()
print(kostenstelle_values)
print(len(kostenstelle_values))

In [None]:
# Distinct values of Job_Category, followed by how many there are
job_category_values = X['Job_Category'].unique()
print(job_category_values)
print(len(job_category_values))

In [6]:
# Distinct values of PER_TITEL_VORNE_Category, followed by how many there are
titel_vorne_values = X['PER_TITEL_VORNE_Category'].unique()
print(titel_vorne_values)
print(len(titel_vorne_values))

['No_Title' 'Magister' 'Doctorate' 'Engineering' 'Other']
5


In [7]:
# Distinct values of PER_TITEL_HINTEN_Category, followed by how many there are
titel_hinten_values = X['PER_TITEL_HINTEN_Category'].unique()
print(titel_hinten_values)
print(len(titel_hinten_values))

['Bachelor' 'No_Title' 'Master' 'Bachelor_Law' 'MBA' 'Master_Law'
 'Certification']
7


### Label encoding for Ordinal Data:

In [None]:
# Define Mapping
# Ordinal mapping from title category to an integer level; several
# categories share a level (e.g. every bachelor-type title maps to 2).
# NOTE(review): 'MBA' is ranked with the bachelor titles and 'Other' with
# 'Certification' - confirm this ordering is intended.
label_encoding = {
    'No_Title': 0,
    'Certification': 1,
    'Bachelor': 2,
    'Bachelor_Law': 2,
    'MBA': 2,
    'Master_Law': 3,
    'Master': 3,
    'Magister': 3,
    'Doctorate': 4,
    'Engineering': 3,
    'Other': 1
}

# Label-Encoding
# The raw categories are read from `df` (which is never modified) instead of
# from `X`, so re-running this cell does not map already-encoded integers to
# NaN.
for title_column in ('PER_TITEL_HINTEN_Category', 'PER_TITEL_VORNE_Category'):
    encoded_titles = df[title_column].map(label_encoding)

    # Series.map yields NaN for every value that is not a key of the dict;
    # fail loudly here instead of passing NaN on to the scaler / clustering.
    unmapped = df.loc[encoded_titles.isna(), title_column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"No label encoding defined for {title_column} values: {list(unmapped)}"
        )

    X[title_column] = encoded_titles
X

### One-Hot-Encoding for Nominal Data:

In [9]:
# Country groups used to collapse individual countries into three regions
deutschsprachige_länder = ['Deutschland', 'Österreich', 'Schweiz']
europäische_länder = [
    'Italien', 'Ungarn', 'Bulgarien', 'Ukraine', 'Polen',
    'Schweden', 'Russland', 'Georgien', 'Tschechien', 'Kosovo',
    'Island', 'Rumänien', 'Spanien', 'Niederlande', 'Slowenien',
    'Slowakei', 'Kroatien', 'Serbien', 'Bosnien und Herzegowina',
    'Nordmazedonien',
]


def categorize_country(country):
    """Return the region label for a single country value.

    Returns 'deutschsprachig' when the value is listed in
    ``deutschsprachige_länder``, 'europäisch' when it is listed in
    ``europäische_länder``, and 'weltweit' for anything else (including
    values that appear in neither list, such as missing values).
    """
    if country in deutschsprachige_länder:
        return 'deutschsprachig'
    if country in europäische_länder:
        return 'europäisch'
    return 'weltweit'

# Replace every country value with its region label
X['PER_LANDNR'] = X['PER_LANDNR'].map(categorize_country)

# One-hot encode the region (drop_first=True leaves out the first level),
# then remove the remaining nominal columns that are not encoded here
region_encoded = pd.get_dummies(X, columns=['PER_LANDNR'], drop_first=True)
X = region_encoded.drop(columns=['PER_FIRMENNR', 'PER_KOSTENSTELLE', 'Job_Category'])
X

Unnamed: 0,MIT_TEILZEITPROZENT,PER_GESCHLECHT,Alter,Distanz_zum_Arbeitsort,Zugehörigkeit,PER_TITEL_VORNE_Category,PER_TITEL_HINTEN_Category,Average_Daily_Hours_Weighted_Avg,Average_Daily_Holiday_Hours_Weighted_Avg,Average_Daily_Sick_Hours_Weighted_Avg,Overtime_Ratio_Weighted_Avg,Adjusted_Overtime_Ratio_Weighted_Avg,Daily_Hours_Variability_Weighted_Avg,PER_LANDNR_europäisch,PER_LANDNR_weltweit
0,71.714,0,27.73,12.215375,0.31,0,2,0.171815,2.529560,1.945656,0.021477,-0.598846,0.520606,False,True
1,84.136,1,22.61,11.010152,1.85,0,0,0.181214,0.198983,1.668648,0.022652,-0.903763,0.554767,False,False
2,99.568,1,23.26,14.846066,4.56,0,0,0.222289,0.443122,2.045984,0.027786,-0.767844,0.668501,False,False
3,99.484,1,26.21,86.618310,0.81,3,2,0.102564,0.244599,1.905315,0.012820,-0.894854,0.407193,False,False
4,66.288,1,26.51,15.743103,1.16,0,0,0.127880,2.375393,1.922350,0.015985,-0.612686,0.355487,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62329,10.062,0,20.01,6.701731,0.45,0,0,11.201102,0.656643,0.000000,1.400138,1.400138,0.597363,False,False
62330,10.562,1,31.70,7.560604,1.06,0,0,10.971587,0.491649,0.000000,1.371448,1.360030,0.590721,False,False
62331,10.012,1,39.02,2.521392,0.89,0,0,11.442210,0.490745,0.000000,1.430276,1.424040,0.599871,False,False
62332,10.439,1,21.51,0.145575,3.65,0,0,11.559983,0.212032,0.012513,1.444998,1.441514,0.614898,False,False


### Data needs to be scaled for Clustering-Algorithms

In [10]:
# Standardise all features with StandardScaler. The fitted `scaler` is kept
# so the transformation can be reversed later via `inverse_transform`.
# NOTE: this cell overwrites `X`; re-running it would refit the scaler on
# data that is already scaled.
scaler = StandardScaler()

# Scale features (returns a plain NumPy array)
x_scaled = scaler.fit_transform(X)

# Transform back to a pandas DataFrame, keeping both the column names and
# the original row index so index-aligned assignments stay row-consistent
X = pd.DataFrame(x_scaled, columns=X.columns, index=X.index)

## 2. Create Model

In [11]:
from sklearn.cluster import KMeans

# Define K-Means algorithm
def apply_kmeans(X, n_clusters=2, random_state=42):
    """Fit K-Means on ``X``, print a short report and return the result.

    Parameters
    ----------
    X : array-like
        Feature matrix handed unchanged to ``KMeans.fit``.
    n_clusters : int, default 2
        Number of clusters to form.
    random_state : int or None, default 42
        Seed forwarded to ``KMeans``; the default keeps the previously
        hard-coded value so existing calls give the same result.

    Returns
    -------
    tuple
        ``(labels, cluster_centers)`` as exposed by the fitted estimator's
        ``labels_`` and ``cluster_centers_`` attributes.
    """
    # NOTE(review): n_init is left at the library default, which has changed
    # between scikit-learn releases - pin it explicitly if results must be
    # identical across versions.
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit(X).labels_

    # Display results
    print("K-Means:")
    print(f"Number of clusters: {n_clusters}")
    print("Cluster centers:", kmeans.cluster_centers_)
    print("-" * 40)
    return labels, kmeans.cluster_centers_

## 3. Train Model

In [12]:
# Cluster the feature frame; the cluster count is named so it is easy to tune
N_CLUSTERS = 30
labels_kmeans, centers_kmeans = apply_kmeans(X, n_clusters=N_CLUSTERS)
labels_kmeans

K-Means:
Number of clusters: 30
Cluster centers: [[ 3.82564233e-01 -1.41651380e+00 -1.02509480e-01 -1.08556621e-01
  -1.60940379e-01  1.98063577e+00 -5.33565850e-01  3.56907010e-01
  -2.75185303e-01 -2.61497098e-01  3.56907010e-01  3.30100546e-01
   4.64243016e-01 -4.13984867e-01 -2.19772051e-01]
 [ 3.62030737e-02  3.52213139e-01 -1.20457947e-01 -2.41345729e-02
   4.54209910e-02 -9.83584941e-02  2.07741220e-01 -3.52199773e+00
   1.03174203e-02  3.74745436e+00 -3.52199773e+00 -3.66352724e+00
  -2.59746278e+00 -4.13984867e-01  4.55016912e+00]
 [ 1.57301622e-01  1.53221448e-01  2.80881219e+00 -1.35335246e-01
  -9.13876540e-02 -5.03233187e-01 -2.90942351e-01  2.94999948e-01
  -1.87639998e-01 -2.46219064e-01  2.94999948e-01  2.79757010e-01
   2.83528488e-01 -2.37693984e-01 -2.17634023e-01]
 [ 1.52761068e-01  1.77256033e-01 -2.28898293e-01 -1.08705718e-01
  -1.18710566e-01 -3.71200208e-01 -3.87386944e-01 -1.37402877e+00
  -7.74084368e-02  1.48254212e+00 -1.37402877e+00 -1.43858622e+00
  -3.6

array([ 1, 12, 12, ..., 20, 20, 20], dtype=int32)

### Measure performance with Silhouette-Score-Metric

In [13]:
from sklearn.metrics import silhouette_score

# Silhouette score of the K-Means labelling, computed on the same feature
# frame that was clustered, and reported with two decimals
kmeans_silhouette = silhouette_score(X, labels=labels_kmeans)
print(f"K-Means Silhouette Score: {kmeans_silhouette:.2f}")

K-Means Silhouette Score: 0.21


## 4. Create Cluster Table
#### 1) First, reverse the scaling of the DataFrame from above

In [14]:
# Descale: undo the standardisation with the scaler fitted earlier, so the
# cluster table can be read in the original units
x_descaled = scaler.inverse_transform(X)

# Put the descaled values back into a DataFrame, keeping the column names
# and the row index of X so index-aligned assignments stay row-consistent.
# NOTE: inverse_transform returns floats, so 0/1 and boolean columns come
# back as 0.0 / 1.0.
X = pd.DataFrame(x_descaled, columns=X.columns, index=X.index)
X

Unnamed: 0,MIT_TEILZEITPROZENT,PER_GESCHLECHT,Alter,Distanz_zum_Arbeitsort,Zugehörigkeit,PER_TITEL_VORNE_Category,PER_TITEL_HINTEN_Category,Average_Daily_Hours_Weighted_Avg,Average_Daily_Holiday_Hours_Weighted_Avg,Average_Daily_Sick_Hours_Weighted_Avg,Overtime_Ratio_Weighted_Avg,Adjusted_Overtime_Ratio_Weighted_Avg,Daily_Hours_Variability_Weighted_Avg,PER_LANDNR_europäisch,PER_LANDNR_weltweit
0,71.714,0.0,27.73,12.215375,0.31,0.0,2.0,0.171815,2.529560,1.945656,0.021477,-0.598846,0.520606,0.0,1.0
1,84.136,1.0,22.61,11.010152,1.85,0.0,0.0,0.181214,0.198983,1.668648,0.022652,-0.903763,0.554767,0.0,0.0
2,99.568,1.0,23.26,14.846066,4.56,0.0,0.0,0.222289,0.443122,2.045984,0.027786,-0.767844,0.668501,0.0,0.0
3,99.484,1.0,26.21,86.618310,0.81,3.0,2.0,0.102564,0.244599,1.905315,0.012820,-0.894854,0.407193,0.0,0.0
4,66.288,1.0,26.51,15.743103,1.16,0.0,0.0,0.127880,2.375393,1.922350,0.015985,-0.612686,0.355487,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62329,10.062,0.0,20.01,6.701731,0.45,0.0,0.0,11.201102,0.656643,0.000000,1.400138,1.400138,0.597363,0.0,0.0
62330,10.562,1.0,31.70,7.560604,1.06,0.0,0.0,10.971587,0.491649,0.000000,1.371448,1.360030,0.590721,0.0,0.0
62331,10.012,1.0,39.02,2.521392,0.89,0.0,0.0,11.442210,0.490745,0.000000,1.430276,1.424040,0.599871,0.0,0.0
62332,10.439,1.0,21.51,0.145575,3.65,0.0,0.0,11.559983,0.212032,0.012513,1.444998,1.441514,0.614898,0.0,0.0


#### 2) Add Clusters as variable and Resignation Rate (y) to Dataframe 

In [15]:
# Attach the cluster label of every row and the target variable (y) to the
# cluster DataFrame. The per-row target is stored under 'Resignation Rate'
# because its per-cluster mean is used as the resignation rate later on.
X = X.assign(**{
    'Cluster': labels_kmeans,
    'Resignation Rate': y,
})
X

Unnamed: 0,MIT_TEILZEITPROZENT,PER_GESCHLECHT,Alter,Distanz_zum_Arbeitsort,Zugehörigkeit,PER_TITEL_VORNE_Category,PER_TITEL_HINTEN_Category,Average_Daily_Hours_Weighted_Avg,Average_Daily_Holiday_Hours_Weighted_Avg,Average_Daily_Sick_Hours_Weighted_Avg,Overtime_Ratio_Weighted_Avg,Adjusted_Overtime_Ratio_Weighted_Avg,Daily_Hours_Variability_Weighted_Avg,PER_LANDNR_europäisch,PER_LANDNR_weltweit,Cluster,Resignation Rate
0,71.714,0.0,27.73,12.215375,0.31,0.0,2.0,0.171815,2.529560,1.945656,0.021477,-0.598846,0.520606,0.0,1.0,1,0
1,84.136,1.0,22.61,11.010152,1.85,0.0,0.0,0.181214,0.198983,1.668648,0.022652,-0.903763,0.554767,0.0,0.0,12,0
2,99.568,1.0,23.26,14.846066,4.56,0.0,0.0,0.222289,0.443122,2.045984,0.027786,-0.767844,0.668501,0.0,0.0,12,1
3,99.484,1.0,26.21,86.618310,0.81,3.0,2.0,0.102564,0.244599,1.905315,0.012820,-0.894854,0.407193,0.0,0.0,6,0
4,66.288,1.0,26.51,15.743103,1.16,0.0,0.0,0.127880,2.375393,1.922350,0.015985,-0.612686,0.355487,0.0,1.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62329,10.062,0.0,20.01,6.701731,0.45,0.0,0.0,11.201102,0.656643,0.000000,1.400138,1.400138,0.597363,0.0,0.0,20,1
62330,10.562,1.0,31.70,7.560604,1.06,0.0,0.0,10.971587,0.491649,0.000000,1.371448,1.360030,0.590721,0.0,0.0,20,0
62331,10.012,1.0,39.02,2.521392,0.89,0.0,0.0,11.442210,0.490745,0.000000,1.430276,1.424040,0.599871,0.0,0.0,20,0
62332,10.439,1.0,21.51,0.145575,3.65,0.0,0.0,11.559983,0.212032,0.012513,1.444998,1.441514,0.614898,0.0,0.0,20,0


#### 3) Now create the Cluster Table
The DataFrame is grouped by cluster number.
The characteristics of every cluster, including its resignation rate, can now easily be analysed.

In [16]:
# Mean of the 'Resignation Rate' column within each cluster
cluster_means = X.groupby('Cluster')['Resignation Rate'].mean()

# Turn the grouped Series into a two-column DataFrame
resignation_rate_df = cluster_means.reset_index()
resignation_rate_df.columns = ['Cluster', 'Resignation Rate']

# Show results
print("Resignation Rate per Cluster:")
resignation_rate_df

Resignation Rate per Cluster:


Unnamed: 0,Cluster,Resignation Rate
0,0,0.365881
1,1,0.370968
2,2,0.443747
3,3,0.43562
4,4,0.535153
5,5,0.443751
6,6,0.377962
7,7,0.48145
8,8,0.416235
9,9,0.462057


#### 4) Adding and sorting the additional features

In [17]:
# Per-cluster mean of every column (this includes the resignation rate)
by_cluster = X.groupby('Cluster')
cluster_features_df = by_cluster.mean().reset_index()

# Number of entries in every cluster; both results are ordered by cluster
# id, so the positional assignment lines up
cluster_counts = by_cluster.size()
cluster_features_df['Cluster_Count'] = cluster_counts.values

# Move the resignation rate to the second position (right after 'Cluster'),
# leaving the order of all other columns untouched
column_to_move = 'Resignation Rate'
columns = [name for name in cluster_features_df.columns if name != column_to_move]
columns.insert(1, column_to_move)

# Apply the new column order and sort by resignation rate, highest first
cluster_features_df = cluster_features_df[columns].sort_values(
    by='Resignation Rate', ascending=False
)
cluster_features_df

Unnamed: 0,Cluster,Resignation Rate,MIT_TEILZEITPROZENT,PER_GESCHLECHT,Alter,Distanz_zum_Arbeitsort,Zugehörigkeit,PER_TITEL_VORNE_Category,PER_TITEL_HINTEN_Category,Average_Daily_Hours_Weighted_Avg,Average_Daily_Holiday_Hours_Weighted_Avg,Average_Daily_Sick_Hours_Weighted_Avg,Overtime_Ratio_Weighted_Avg,Adjusted_Overtime_Ratio_Weighted_Avg,Daily_Hours_Variability_Weighted_Avg,PER_LANDNR_europäisch,PER_LANDNR_weltweit,Cluster_Count
4,4,0.535153,96.327301,0.0,28.701226,36.118677,1.031082,0.0,2.479186,8.514077,0.652317,0.049477,1.06426,1.044869,4.485633,0.0,0.0,2162
21,21,0.517085,96.610768,1.0,28.243044,30.128214,1.004212,0.000349,2.478033,8.47155,0.684869,0.047789,1.058944,1.039082,4.496254,0.0,0.0,2868
14,14,0.515562,92.924617,0.573748,29.440839,40.112209,6.207185,0.733424,2.503383,8.584135,0.599519,0.04777,1.073017,1.053655,4.375809,0.217862,0.0,739
7,7,0.48145,47.207187,0.607926,29.010185,40.235549,1.229013,0.369309,2.539629,8.326342,0.786992,0.046751,1.040793,1.021648,2.187873,0.150084,0.0,1186
18,18,0.47412,93.074774,0.0,28.332919,46.129961,5.986749,0.473085,0.001035,8.588214,0.545537,0.05641,1.073527,1.050488,4.37036,0.093168,0.0,966
15,15,0.470935,96.194757,0.832772,27.21575,170.562109,1.128976,0.021483,0.094356,8.499728,0.643882,0.044984,1.062466,1.043095,4.465155,0.001685,0.0,2374
12,12,0.47048,88.402521,0.655351,28.279756,48.826911,1.529247,0.000369,0.535055,0.31605,0.799326,1.927753,0.039506,-0.746576,0.811461,0.162731,0.0,2710
19,19,0.463738,88.226229,0.61576,28.559219,319.783219,1.467273,0.617155,0.451185,8.313558,0.782343,0.054462,1.039195,1.017009,4.094757,0.137378,0.009763,1434
10,10,0.463214,95.259658,1.0,27.340857,33.607172,1.130709,0.40029,0.654163,8.416872,0.718766,0.052054,1.052109,1.030673,4.428703,1.0,0.0,4132
29,29,0.462074,92.663701,0.0,28.19453,38.905179,0.895449,0.0,0.478775,6.680672,2.327238,0.059147,0.835084,0.811678,4.315372,0.083507,0.0,1437


In [18]:
# Export the cluster table via CSV.
# The previous version wrote `df`, i.e. the unmodified input data, so the
# file contained none of the clustering results. 'Cluster' is a regular
# column here, so the positional index can be dropped.
cluster_features_df.to_csv('output_kmeans.csv', index=False)
