# K-Medoids clustering on a socio-economic dataset

In [6]:
import pandas as pd
from sklearn_extra.cluster import KMedoids
from sklearn.metrics import silhouette_score

# Load the pre-processed survey data from the workbook in the working directory.
df = pd.read_excel('Preprocessed_data_standardscaler.xlsx')

# 'ID' only identifies a row; it is removed so that it does not take part in
# the distance computations used for clustering.
new_df = df.drop('ID', axis='columns')
new_df

Unnamed: 0,Normalised Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask)
0,-0.927352,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
1,0.387063,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1
3,0.762611,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,-1.678446,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1
477,-1.302899,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1
478,2.077026,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
479,-0.551804,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1


In [7]:
# Choose the number of clusters with the silhouette score: fit one K-Medoids
# model (PAM, BUILD initialisation) for every k in {2, 3, 4, 5, 6} and record
# the silhouette score of the resulting labelling, keyed by k.
scores = {
    k: silhouette_score(
        new_df,
        KMedoids(n_clusters=k, method='pam', init='build', random_state=42)
        .fit(new_df)
        .labels_,
    )
    for k in range(2, 7)
}

# One row per candidate k, in the order the candidates were evaluated.
df_scores = pd.DataFrame({
    'no. of clusters': list(scores.keys()),
    'silhouette score': list(scores.values()),
})

# Since k = 5 yields the highest score, the number of clusters is chosen to be 5,
# as can be seen in the table below:
df_scores

Unnamed: 0,no. of clusters,silhouette score
0,2,0.137599
1,3,0.15238
2,4,0.170523
3,5,0.17254
4,6,0.16244


In [8]:
# Fit the final K-Medoids model with the number of clusters selected from the
# silhouette scores.
optimal_clusters = 5

kmedoids_1 = KMedoids(
    n_clusters=optimal_clusters,
    method='pam',
    init='build',
    random_state=42,
).fit(new_df)

# One label per row of new_df, in row order.
list1 = list(kmedoids_1.labels_)

# assign() returns a new frame, so new_df itself is left unchanged.
# The labels appear as a new 'Cluster' column at the very end of the table below:
df_clustered = new_df.assign(Cluster=list1)
df_clustered

Unnamed: 0,Normalised Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
0,-0.927352,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,2
1,0.387063,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,4
3,0.762611,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,2
4,-1.678446,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,4
477,-1.302899,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,4
478,2.077026,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
479,-0.551804,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1


In [27]:
# Replace the standardised age with the real age in years so that the
# per-cluster summaries are readable.
#
# The raw ages come from the original survey workbook and are attached to the
# clustered rows purely by row position, so both tables must contain the same
# respondents in the same order. That is verified before the column is added.
old_df = pd.read_excel('Asansol socio-economic data 1.xlsx')
age = list(old_df['Age (Yrs)'])

if len(age) != len(df_clustered):
    raise ValueError(
        f"Raw data has {len(age)} rows but the clustered data has "
        f"{len(df_clustered)}; ages cannot be matched by row position."
    )

# NOTE(review): 'Normalised Age (Yrs)' appears to be a linear rescaling of
# 'Age (Yrs)' (the rows displayed in this notebook are consistent with that).
# If so, correctly aligned rows correlate (almost) perfectly, while shuffled or
# shifted rows do not. NaN (e.g. constant ages) also fails the check.
age_alignment = pd.Series(age, index=df_clustered.index).corr(
    df_clustered['Normalised Age (Yrs)']
)
if not age_alignment > 0.999:
    raise ValueError(
        "Raw ages do not line up with 'Normalised Age (Yrs)' "
        f"(correlation = {age_alignment}); the two workbooks are probably "
        "not in the same row order."
    )

# Drop the scaled column and put the real age first; all other columns keep
# their existing order, with 'Cluster' last.
final_df = df_clustered.drop(columns=['Normalised Age (Yrs)'])
final_df.insert(0, 'Age (Yrs)', age)
final_df

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
0,26,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,2
1,40,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,28,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,4
3,44,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,2
4,18,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,28,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,4
477,22,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,4
478,58,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
479,30,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1


In [32]:
# Respondents assigned K-Medoids label 0. The cluster_N variables are numbered
# from 1 while the labels start at 0, so cluster_1 holds label 0.
cluster_1 = final_df.loc[final_df['Cluster'] == 0]

# Summary statistics for this cluster. Only the last expression of a cell is
# displayed, so no separate bare `cluster_1` line is needed before it.
cluster_1.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0,71.0
mean,51.873239,0.943662,0.140845,0.183099,0.577465,0.098592,0.070423,0.239437,0.267606,0.380282,0.042254,0.0,0.014085,0.028169,0.056338,0.169014,0.591549,0.140845,0.760563,0.0
std,8.349388,0.232214,0.350338,0.3895,0.497479,0.300235,0.257679,0.429777,0.445862,0.488911,0.202599,0.0,0.118678,0.166633,0.232214,0.377432,0.495046,0.350338,0.429777,0.0
min,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,46.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,51.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
75%,58.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
max,73.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [30]:
# Respondents assigned K-Medoids label 1 (cluster_N names are 1-based, labels
# are 0-based), followed by their summary statistics.
cluster_2 = final_df[final_df['Cluster'].eq(1)]
cluster_2.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,38.329268,0.95122,0.0,0.085366,0.878049,0.036585,0.073171,0.073171,0.658537,0.170732,0.02439,0.0,0.04878,0.04878,0.02439,0.853659,0.02439,0.0,0.878049,1.0
std,6.299235,0.216734,0.0,0.281145,0.329243,0.188897,0.262019,0.262019,0.477119,0.37859,0.155207,0.0,0.216734,0.216734,0.155207,0.355623,0.155207,0.0,0.329243,0.0
min,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,35.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
50%,37.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
75%,40.75,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,58.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0


In [31]:
# Respondents assigned K-Medoids label 2 (cluster_N names are 1-based, labels
# are 0-based), followed by their summary statistics.
cluster_3 = final_df[final_df['Cluster'].eq(2)]
cluster_3.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,32.298969,0.958763,0.742268,0.195876,0.030928,0.030928,0.865979,0.092784,0.020619,0.020619,0.0,0.051546,0.103093,0.14433,0.494845,0.175258,0.030928,0.0,0.824742,2.0
std,8.21478,0.199871,0.439658,0.398935,0.174022,0.174022,0.342444,0.291636,0.142842,0.142842,0.0,0.222258,0.30566,0.35325,0.502571,0.382162,0.174022,0.0,0.382162,0.0
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
25%,26.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
50%,32.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0
75%,38.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0
max,50.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,2.0


In [33]:
# Respondents assigned K-Medoids label 3 (cluster_N names are 1-based, labels
# are 0-based), followed by their summary statistics.
cluster_4 = final_df[final_df['Cluster'].eq(3)]
cluster_4.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0,114.0
mean,35.350877,0.964912,0.061404,0.894737,0.017544,0.026316,0.070175,0.912281,0.0,0.008772,0.008772,0.052632,0.149123,0.096491,0.263158,0.412281,0.017544,0.008772,0.780702,3.0
std,9.370432,0.184814,0.241129,0.308247,0.131866,0.160779,0.25657,0.284135,0.0,0.093659,0.093659,0.224283,0.357782,0.296567,0.442292,0.494418,0.131866,0.093659,0.415598,0.0
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,28.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
50%,34.5,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0
75%,42.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0
max,59.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0


In [34]:
# Respondents assigned K-Medoids label 4 (cluster_N names are 1-based, labels
# are 0-based), followed by their summary statistics.
cluster_5 = final_df[final_df['Cluster'].eq(4)]
cluster_5.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0,117.0
mean,27.931624,0.931624,0.017094,0.051282,0.888889,0.042735,0.111111,0.299145,0.495726,0.068376,0.025641,0.034188,0.059829,0.282051,0.623932,0.0,0.0,0.0,0.880342,4.0
std,4.872029,0.253476,0.130179,0.221521,0.315621,0.203129,0.315621,0.459853,0.502132,0.253476,0.158742,0.182493,0.23819,0.451934,0.486481,0.0,0.0,0.0,0.325957,0.0
min,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
25%,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4.0
50%,28.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,4.0
75%,30.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,4.0
max,45.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,4.0


In [37]:
# Mean of every column within each cluster: one row per cluster label, indexed
# by 'Cluster'. For the 0/1 indicator columns the mean is the share of the
# cluster's members that have that attribute.
per_cluster = final_df.groupby(by='Cluster')
overall_mean = per_cluster.mean()

overall_mean

Unnamed: 0_level_0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask)
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,51.873239,0.943662,0.140845,0.183099,0.577465,0.098592,0.070423,0.239437,0.267606,0.380282,0.042254,0.0,0.014085,0.028169,0.056338,0.169014,0.591549,0.140845,0.760563
1,38.329268,0.95122,0.0,0.085366,0.878049,0.036585,0.073171,0.073171,0.658537,0.170732,0.02439,0.0,0.04878,0.04878,0.02439,0.853659,0.02439,0.0,0.878049
2,32.298969,0.958763,0.742268,0.195876,0.030928,0.030928,0.865979,0.092784,0.020619,0.020619,0.0,0.051546,0.103093,0.14433,0.494845,0.175258,0.030928,0.0,0.824742
3,35.350877,0.964912,0.061404,0.894737,0.017544,0.026316,0.070175,0.912281,0.0,0.008772,0.008772,0.052632,0.149123,0.096491,0.263158,0.412281,0.017544,0.008772,0.780702
4,27.931624,0.931624,0.017094,0.051282,0.888889,0.042735,0.111111,0.299145,0.495726,0.068376,0.025641,0.034188,0.059829,0.282051,0.623932,0.0,0.0,0.0,0.880342


In [40]:
# Export the results to Excel workbooks in the working directory: the full
# labelled dataset, the per-cluster means, and one workbook per cluster.
# Files are written in the order listed; each frame's index is written too
# (the to_excel default).
exports = [
    ('clustered data KMedoids.xlsx', final_df),
    ('mean_for_clusters_kmedoids.xlsx', overall_mean),
    ('cluster_1_kmedoids.xlsx', cluster_1),
    ('cluster_2_kmedoids.xlsx', cluster_2),
    ('cluster_3_kmedoids.xlsx', cluster_3),
    ('cluster_4_kmedoids.xlsx', cluster_4),
    ('cluster_5_kmedoids.xlsx', cluster_5),
]

for filename, frame in exports:
    frame.to_excel(filename)