# Fuzzy c-Means on socio-economic dataset


In [1]:
import pandas as pd
from fcmeans import FCM
from sklearn.metrics import silhouette_score
import numpy as np

# Read the preprocessed dataset from the Excel workbook.
df = pd.read_excel('Preprocessed_data_standardscaler.xlsx')

# The 'ID' column is not needed for clustering, so it is left out of the
# feature table used from here on.
new_df = df.drop('ID', axis=1)
new_df

Unnamed: 0,Normalised Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask)
0,-0.927352,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1
1,0.387063,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1
3,0.762611,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
4,-1.678446,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-0.739578,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1
477,-1.302899,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1
478,2.077026,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
479,-0.551804,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1


In [2]:
def part_coeff(U):
    """Return the sum of the squared entries of ``U`` divided by its row count.

    Parameters
    ----------
    U : numpy.ndarray
        Two-dimensional array. Presumably the fuzzy membership matrix with one
        row per sample and one column per cluster (callers pass ``fcm.u``);
        confirm against the clustering library.

    Returns
    -------
    numpy floating scalar
        ``sum(U ** 2) / U.shape[0]``. The squares are summed row by row first
        and the row totals are then added together.
    """
    n_rows = U.shape[0]
    row_totals = (U ** 2).sum(axis=1)
    return row_totals.sum() / n_rows

In [3]:
# Plain NumPy view of the feature table; this is what the FCM model is fitted on.
array_df = new_df.to_numpy()

# Fuzzifier values (m) and cluster counts to sweep over. Both the loop and the
# result table are derived from these two lists, so changing either one is
# enough to change the experiment.
fuzz = [1.5,2,2.5]
list_ = list(range(2,7))

# coefficients[i][k] is the partition coefficient for fuzz[i] and list_[k].
coefficients = []
for m in fuzz:
    per_cluster_count = []
    for n_clusters in list_:
        fcm = FCM(n_clusters=n_clusters, m=m, random_state=42)
        fcm.fit(array_df)
        per_cluster_count.append(part_coeff(fcm.u))
    coefficients.append(per_cluster_count)

# One column per fuzzifier, labelled from the value itself ('m = 1.5', 'm = 2', ...).
table = {'no. of clusters': list_}
for m, per_cluster_count in zip(fuzz, coefficients):
    table['m = {}'.format(m)] = per_cluster_count

part_coefficients = pd.DataFrame(table)

part_coefficients

Unnamed: 0,no. of clusters,m = 1.5,m = 2,m = 2.5
0,2,0.664656,0.500308,0.5
1,3,0.506294,0.333606,0.333333
2,4,0.435992,0.250006,0.25
3,5,0.444104,0.200142,0.2
4,6,0.432245,0.166695,0.166667


In [4]:
# Silhouette score of the hard cluster labels for each cluster count, with the
# fuzzifier fixed at m = 1.5.
score_rows = []

for n_clusters in range(2, 7):
    fcm = FCM(n_clusters=n_clusters, m=1.5, random_state=42)
    fcm.fit(array_df)
    labels = fcm.predict(array_df)
    score_rows.append((n_clusters, silhouette_score(array_df, labels)))

# Same information keyed by cluster count.
scores = dict(score_rows)

df_scores = pd.DataFrame(score_rows, columns=['no. of clusters', 'silhouette score'])
df_scores

Unnamed: 0,no. of clusters,silhouette score
0,2,0.197952
1,3,0.156246
2,4,0.147711
3,5,0.179893
4,6,0.188116


In [5]:
# Fit the model that is used for the rest of the notebook: 2 clusters, m = 1.5.
fcm = FCM(n_clusters=2, m=1.5, random_state=42)
fcm.fit(array_df)

# Rebuild a labelled table from the fitted array and attach each row's
# predicted cluster label as a new 'Cluster' column.
clustered_data = pd.DataFrame(array_df, columns=list(new_df.columns))
clustered_data['Cluster'] = list(fcm.predict(array_df))
clustered_data

Unnamed: 0,Normalised Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
0,-0.927352,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,0.387063,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,-0.739578,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
3,0.762611,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,-1.678446,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,-0.739578,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
477,-1.302899,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
478,2.077026,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
479,-0.551804,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [6]:
# Now that the clustering is completed, the 'Normalised Age (Yrs)' column is
# dropped and replaced with the 'Age (Yrs)' column from the original workbook.

final_df = clustered_data.drop(columns=['Normalised Age (Yrs)'])

old_df = pd.read_excel('Asansol socio-economic data 1.xlsx')
age = list(old_df['Age (Yrs)'])

# The ages are attached by position (a plain list, no key), so both workbooks
# must hold the same rows in the same order. Only the row count can be checked
# here; fail with a clear message if it does not match.
if len(age) != len(final_df):
    raise ValueError(
        "Row count mismatch: {} ages in the original data vs {} clustered rows".format(
            len(age), len(final_df)
        )
    )

final_df['Age (Yrs)'] = age

# 'Age (Yrs)' was appended as the last column; rotate it to the front.
ordered_columns = list(final_df.columns)
new_order = [ordered_columns[-1]] + ordered_columns[0:-1]

final_df = final_df[new_order]
final_df

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
0,26,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
1,40,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
2,28,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
3,44,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1
4,18,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
476,28,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
477,22,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
478,58,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1
479,30,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0


In [7]:
# Now we separate out the individual clusters from the dataset.
# cluster_1 holds the rows whose 'Cluster' label is 0; the cell displays its
# summary statistics.

cluster_1 = final_df.loc[final_df['Cluster']==0]
cluster_1.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0,272.0
mean,28.470588,0.948529,0.198529,0.3125,0.455882,0.033088,0.297794,0.393382,0.246324,0.047794,0.014706,0.047794,0.128676,0.202206,0.481618,0.136029,0.003676,0.0,0.882353,0.0
std,4.892107,0.221363,0.399628,0.464367,0.498968,0.179197,0.458131,0.489401,0.431663,0.213724,0.120595,0.213724,0.335459,0.402385,0.500583,0.343452,0.060634,0.0,0.322784,0.0
min,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25.75,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,28.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,32.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
max,39.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0


In [8]:
# cluster_2 holds the rows whose 'Cluster' label is 1; the cell displays its
# summary statistics.
in_second_cluster = final_df['Cluster'].eq(1)
cluster_2 = final_df.loc[in_second_cluster]
cluster_2.describe()

Unnamed: 0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask),Cluster
count,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0,209.0
mean,45.516746,0.952153,0.177033,0.296651,0.4689,0.057416,0.167464,0.30622,0.315789,0.186603,0.023923,0.009569,0.019139,0.043062,0.124402,0.521531,0.229665,0.052632,0.755981,1.0
std,8.127645,0.213955,0.382613,0.457878,0.50023,0.233195,0.374286,0.462029,0.465946,0.390528,0.153178,0.097588,0.137342,0.203485,0.330832,0.500736,0.421627,0.223833,0.430535,0.0
min,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,40.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
50%,44.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
75%,50.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
max,73.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [9]:
# Mean of every column within each cluster, one row per 'Cluster' label.
by_cluster = final_df.groupby('Cluster')
overall_mean = by_cluster.mean()

overall_mean

Unnamed: 0_level_0,Age (Yrs),Gender (Male),Education (<Class 10),Education (Class 10-12),Graduate,Higher Edu,Income <20k,Income 20-40k,Income 40-60k,Income 60-80k,Income >80k,Experience (<1yr),Experience (1-2yr),Experience (2-5yr),Experience (5-10yr),Experience (10-20yr),Experience (20-30yr),Experience (>30yr),Helmet (Full-mask)
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,28.470588,0.948529,0.198529,0.3125,0.455882,0.033088,0.297794,0.393382,0.246324,0.047794,0.014706,0.047794,0.128676,0.202206,0.481618,0.136029,0.003676,0.0,0.882353
1,45.516746,0.952153,0.177033,0.296651,0.4689,0.057416,0.167464,0.30622,0.315789,0.186603,0.023923,0.009569,0.019139,0.043062,0.124402,0.521531,0.229665,0.052632,0.755981


In [10]:
# Write each result table to its own Excel workbook (in the order listed).
exports = {
    'Cluster_1_FCM.xlsx': cluster_1,
    'Cluster_2_FCM.xlsx': cluster_2,
    'Means_of_parameters_FCM.xlsx': overall_mean,
    'clustered_data_FCM.xlsx': final_df,
}

for filename, frame in exports.items():
    frame.to_excel(filename)
final_df.to_excel('clustered_data_FCM.xlsx')