# Name : Rohit Kulkarni USC ID : 5402749044

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.metrics import hamming_loss
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler 
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import LabelEncoder
from collections import Counter
from scipy.spatial import distance
from sklearn.neighbors import DistanceMetric
from statistics import mean


Reading the data

In [2]:
data=pd.read_csv("Frogs_MFCCs.csv")

Randomly splitting data to training and test

In [None]:
training_data,test_data=train_test_split(data,test_size=0.30,random_state=42)

In [4]:
X_Training=training_data.iloc[:,:22]
data_label_family=training_data["Family"]
data_label_genus=training_data["Genus"]
data_label_species=training_data["Species"]
X_Test=test_data.iloc[:,:22]
test_data_label_family=test_data["Family"]
test_data_label_genus=test_data["Genus"]
test_data_label_species=test_data["Species"]

1.b.ii)SVM for each of the labels, using Gaussian kernels and one versus all classifiers.

In [5]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000], 'gamma': [0.01,0.1,0.2,0.5]},]
clf = GridSearchCV(cv=10,estimator=svm.SVC(), param_grid=parameter_candidates, n_jobs=-1)

In [6]:
clf.fit(X_Training,data_label_family)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 100
Best Gamma: 0.5


In [7]:
from sklearn import svm
svm = svm.SVC(kernel='rbf', random_state=0, gamma=0.5, C=100)
svm.fit(X_Training, data_label_family)
y_pred1=svm.predict(X_Test)
print("Hamming loss:",hamming_loss(test_data_label_family, y_pred1))
print("Exact Score:",accuracy_score(test_data_label_family, y_pred1))

Hamming loss: 0.007410838351088467
Exact Score: 0.9925891616489115


In [8]:
clf.fit(X_Training,data_label_genus)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Gamma: 0.5


In [9]:
from sklearn import svm
svm = svm.SVC(kernel='rbf',gamma=0.5, C=10)
svm.fit(X_Training, data_label_genus)
y_pred2=svm.predict(X_Test)
print("Hamming loss:",hamming_loss(test_data_label_genus, y_pred2))
print("Exact Score:",accuracy_score(test_data_label_genus, y_pred2))

Hamming loss: 0.012505789717461788
Exact Score: 0.9874942102825383


In [10]:
clf.fit(X_Training,data_label_species)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Gamma: 0.5


In [11]:
from sklearn import svm
svm = svm.SVC(kernel='rbf', random_state=0, gamma=0.5, C=10)
svm.fit(X_Training, data_label_species)
y_pred3=svm.predict(X_Test)
print("Hamming loss:",hamming_loss(test_data_label_species, y_pred3))
print("Exact Score:",accuracy_score(test_data_label_species, y_pred3))

Hamming loss: 0.0111162575266327
Exact Score: 0.9888837424733673


Standardizing the data

In [12]:
scaler = StandardScaler()  
scaler.fit(X_Training)
X_Training_std = scaler.transform(X_Training)
X_Test_std = scaler.transform(X_Test)

In [13]:
clf.fit(X_Training_std,data_label_family)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Gamma: 0.1


In [14]:
from sklearn import svm
svm = svm.SVC(kernel='rbf', random_state=0, gamma=0.1, C=10)
svm.fit(X_Training_std, data_label_family)
y_pred=svm.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_family, y_pred))
print("Exact Score:",accuracy_score(test_data_label_family, y_pred))

Hamming loss: 0.007410838351088467
Exact Score: 0.9925891616489115


In [15]:
clf.fit(X_Training_std,data_label_genus)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Gamma: 0.1


In [16]:
from sklearn import svm
svm = svm.SVC(kernel='rbf', random_state=0, gamma=0.1, C=10)
svm.fit(X_Training_std, data_label_genus)
y_pred=svm.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_genus, y_pred))
print("Exact Score:",accuracy_score(test_data_label_genus, y_pred))

Hamming loss: 0.012042612320518759
Exact Score: 0.9879573876794813


In [17]:
clf.fit(X_Training_std,data_label_genus)   
print('Best C:',clf.best_estimator_.C) 
print('Best Gamma:',clf.best_estimator_.gamma)

Best C: 10
Best Gamma: 0.1


In [18]:
from sklearn import svm
svm = svm.SVC(kernel='rbf', random_state=0, gamma=0.1, C=10)
svm.fit(X_Training_std, data_label_species)
y_pred=svm.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_species, y_pred))
print("Exact Score:",accuracy_score(test_data_label_species, y_pred))

Hamming loss: 0.015748031496062992
Exact Score: 0.984251968503937


1.b.iii) L1 penalized SVM

In [19]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_Training_std,data_label_family) 
print('Best C:',clf.best_estimator_.C) 

Best C: 1


In [20]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=100,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_Training_std, data_label_family)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_family, y_pred))
print("Exact Score:",accuracy_score(test_data_label_family, y_pred))

Hamming loss: 0.07179249652616952
Exact Score: 0.9282075034738305


In [21]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_Training_std,data_label_genus) 
print('Best C:',clf.best_estimator_.C) 

Best C: 10


In [22]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=10,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_Training_std, data_label_genus)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_genus, y_pred))
print("Exact Score:",accuracy_score(test_data_label_genus, y_pred))

Hamming loss: 0.058360352014821676
Exact Score: 0.9416396479851783


In [23]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_Training_std,data_label_species) 
print('Best C:',clf.best_estimator_.C) 

Best C: 1


In [24]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=1,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_Training_std, data_label_species)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_species, y_pred))
print("Exact Score:",accuracy_score(test_data_label_species, y_pred))

Hamming loss: 0.04075961093098657
Exact Score: 0.9592403890690134


1.b.iv) SMOTE and L1 penalized SVM

In [25]:
sm = SMOTE(random_state=2)
X_train_family, y_train_family = sm.fit_sample(X_Training_std, data_label_family.ravel())

In [26]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_train_family,y_train_family) 
print('Best C:',clf.best_estimator_.C) 

Best C: 10


In [27]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=10,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_train_family, y_train_family)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_family, y_pred))
print("Exact Score:",accuracy_score(test_data_label_family, y_pred))

Hamming loss: 0.0921723019916628
Exact Score: 0.9078276980083372


In [28]:
sm = SMOTE(random_state=2)
X_train_genus, y_train_genus = sm.fit_sample(X_Training_std, data_label_genus.ravel())

In [29]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_train_genus,y_train_genus) 
print('Best C:',clf.best_estimator_.C) 

Best C: 10


In [30]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=10,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_train_genus, y_train_genus)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_genus, y_pred))
print("Exact Score:",accuracy_score(test_data_label_genus, y_pred))

Hamming loss: 0.0968040759610931
Exact Score: 0.9031959240389069


In [31]:
sm = SMOTE(random_state=2)
X_train_species, y_train_species = sm.fit_sample(X_Training_std, data_label_species.ravel())

In [32]:
from sklearn import svm
parameter_candidates = [{'C': [1, 10, 100, 1000]},]
clf = GridSearchCV(cv=10,estimator=LinearSVC(penalty='l1',random_state=0,loss='squared_hinge',dual=False,max_iter=10000), param_grid=parameter_candidates, n_jobs=-1)
clf.fit(X_train_species,y_train_species) 
print('Best C:',clf.best_estimator_.C) 

Best C: 100


In [33]:
svc = svm.LinearSVC(penalty='l1',random_state=0,C=100,loss='squared_hinge',dual=False,max_iter=10000)
svc.fit(X_train_species, y_train_species)
y_pred=svc.predict(X_Test_std)
print("Hamming loss:",hamming_loss(test_data_label_species, y_pred))
print("Exact Score:",accuracy_score(test_data_label_species, y_pred))

Hamming loss: 0.042612320518758684
Exact Score: 0.9573876794812413


2.K-Means Clustering on a Multi-Class and Multi-Label Data Set

2.a) K-Means clustering with k={1,2,3.....50}

In [34]:
data = data.drop('RecordID',axis=1)
X_train = data.iloc[:,:-3]
labels = data.iloc[:,-3:]

In [35]:
best_scores=[]
for n_cluster in range(2, 51):
    kmeans = KMeans(n_clusters=n_cluster).fit(X_train)
    label = kmeans.labels_
    sil_coeff = silhouette_score(X_train, label, metric='euclidean')
    print("For n_clusters={}, The Silhouette Coefficient is {}".format(n_cluster, sil_coeff))
    best_scores.append(sil_coeff)

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.371653028394425
For n_clusters=6, The Silhouette Coefficient is 0.2644681179713745
For n_clusters=7, The Silhouette Coefficient is 0.2605424786999264
For n_clusters=8, The Silhouette Coefficient is 0.2704424015842344
For n_clusters=9, The Silhouette Coefficient is 0.27622334239300217
For n_clusters=10, The Silhouette Coefficient is 0.26326026633432037
For n_clusters=11, The Silhouette Coefficient is 0.2626630243035382
For n_clusters=12, The Silhouette Coefficient is 0.27267239700050294
For n_clusters=13, The Silhouette Coefficient is 0.27841098766353944
For n_clusters=14, The Silhouette Coefficient is 0.26511636884177964
For n_clusters=15, The Silhouette Coefficient is 0.26922648843513997
For n_clusters=16, The Silhouette Coefficient is 0

In [36]:
best_k=best_scores.index(max(best_scores))
print("Best K:",best_k+2)

Best K: 4


In [37]:
clusterer = KMeans(n_clusters=4, random_state=10)
clusterer = clusterer.fit(X_train)
preds = clusterer.predict(X_train)

2.b) Majority for each label

For Family

In [38]:
cluster0_family=data['Family'][clusterer.labels_==0]
cluster1_family=data['Family'][clusterer.labels_==1]
cluster2_family=data['Family'][clusterer.labels_==2]
cluster3_family=data['Family'][clusterer.labels_==3]

In [39]:
count=Counter(cluster0_family)
print(count.most_common(1))

[('Leptodactylidae', 3467)]


In [40]:
count=Counter(cluster1_family)
print(count.most_common(1))

[('Hylidae', 1245)]


In [41]:
count=Counter(cluster2_family)
print(count.most_common(1))

[('Dendrobatidae', 500)]


In [42]:
count=Counter(cluster3_family)
print(count.most_common(1))

[('Hylidae', 590)]


For Genus

In [43]:
cluster0_genus=data['Genus'][clusterer.labels_==0]
cluster1_genus=data['Genus'][clusterer.labels_==1]
cluster2_genus=data['Genus'][clusterer.labels_==2]
cluster3_genus=data['Genus'][clusterer.labels_==3]

In [44]:
count=Counter(cluster0_genus)
print(count.most_common(1))

[('Adenomera', 3466)]


In [45]:
count=Counter(cluster1_genus)
print(count.most_common(1))

[('Hypsiboas', 1038)]


In [46]:
count=Counter(cluster2_genus)
print(count.most_common(1))

[('Ameerega', 500)]


In [47]:
count=Counter(cluster3_genus)
print(count.most_common(1))

[('Hypsiboas', 542)]


For Species

In [48]:
cluster0_species=data['Species'][clusterer.labels_==0]
cluster1_species=data['Species'][clusterer.labels_==1]
cluster2_species=data['Species'][clusterer.labels_==2]
cluster3_species=data['Species'][clusterer.labels_==3]

In [49]:
count=Counter(cluster0_species)
print(count.most_common(1))

[('AdenomeraHylaedactylus', 3466)]


In [50]:
count=Counter(cluster1_species)
print(count.most_common(1))

[('HypsiboasCordobae', 1018)]


In [51]:
count=Counter(cluster2_species)
print(count.most_common(1))

[('Ameeregatrivittata', 500)]


In [52]:
count=Counter(cluster3_species)
print(count.most_common(1))

[('HypsiboasCinerascens', 452)]


In [53]:
labels = clusterer.labels_
labels = pd.DataFrame(labels,columns=['cluster_num'])

In [54]:
data.reset_index(drop=True, inplace=True)
labels.reset_index(drop=True, inplace=True)
data_clustered = pd.concat([data,labels],axis=1)

In [55]:
data_clustered_sorted = data_clustered.sort_values(['cluster_num'], ascending=[1])
cluster_0_df = data_clustered_sorted.loc[data_clustered_sorted['cluster_num'] == 0]
cluster_1_df = data_clustered_sorted.loc[data_clustered_sorted['cluster_num'] == 1]
cluster_2_df = data_clustered_sorted.loc[data_clustered_sorted['cluster_num'] == 2]
cluster_3_df = data_clustered_sorted.loc[data_clustered_sorted['cluster_num'] == 3]

In [None]:
cluster_0_df['pred_Family']=cluster_0_df['Family'].mode()[0]
cluster_0_df['pred_Genus']=cluster_0_df['Genus'].mode()[0]
cluster_0_df['pred_Species']=cluster_0_df['Species'].mode()[0]

cluster_1_df['pred_Family']=cluster_1_df['Family'].mode()[0]
cluster_1_df['pred_Genus']=cluster_1_df['Genus'].mode()[0]
cluster_1_df['pred_Species']=cluster_1_df['Species'].mode()[0]

cluster_2_df['pred_Family']=cluster_2_df['Family'].mode()[0]
cluster_2_df['pred_Genus']=cluster_2_df['Genus'].mode()[0]
cluster_2_df['pred_Species']=cluster_2_df['Species'].mode()[0]

cluster_3_df['pred_Family']=cluster_3_df['Family'].mode()[0]
cluster_3_df['pred_Genus']=cluster_3_df['Genus'].mode()[0]
cluster_3_df['pred_Species']=cluster_3_df['Species'].mode()[0]

In [57]:
mapping_families = {'Bufonidae': 0, 'Dendrobatidae': 1, 'Hylidae':2, 'Leptodactylidae':3}
mapping_genus = {'Adenomera': 0, 'Ameerega': 1, 'Dendropsophus':2, 'Hypsiboas':3, 'Leptodactylus': 4, 'Osteocephalus': 5, 'Rhinella':6, 'Scinax':7}
mapping_species = {'AdenomeraAndre':0,'AdenomeraHylaedactylus': 1, 'Ameeregatrivittata':2, 'HylaMinuta':3, 'HypsiboasCinerascens': 4, 'HypsiboasCordobae':5 , 'LeptodactylusFuscus':6, 'OsteocephalusOophagus' :7,'Rhinellagranulosa':8,'ScinaxRuber':9 }

In [58]:
cluster_0_df=cluster_0_df.replace(mapping_families).replace(mapping_genus).replace(mapping_species)
cluster_1_df=cluster_1_df.replace(mapping_families).replace(mapping_genus).replace(mapping_species)
cluster_2_df=cluster_2_df.replace(mapping_families).replace(mapping_genus).replace(mapping_species)
cluster_3_df=cluster_3_df.replace(mapping_families).replace(mapping_genus).replace(mapping_species)

In [None]:
columns1 = ['Family','Genus','Species']
columns2 = ['pred_Family','pred_Genus','pred_Species']

cluster0_true_labels = cluster_0_df[columns1].as_matrix()
cluster0_pred_labels = cluster_0_df[columns2].as_matrix()

cluster1_true_labels = cluster_1_df[columns1].as_matrix()
cluster1_pred_labels = cluster_1_df[columns2].as_matrix()

cluster2_true_labels = cluster_2_df[columns1].as_matrix()
cluster2_pred_labels = cluster_2_df[columns2].as_matrix()

cluster3_true_labels = cluster_3_df[columns1].as_matrix()
cluster3_pred_labels = cluster_3_df[columns2].as_matrix()

2.c) Hamming distance, hamming loss and hamming score for each cluster

In [60]:
cluster0_Hamming_Distance=[]
for i in range(len(cluster0_true_labels)):
    cluster0_true_label=cluster0_true_labels[i]
    cluster0_pred_label=cluster0_pred_labels[i]
    Hamming_Distance=distance.hamming(cluster0_true_label,cluster0_pred_label)
    cluster0_Hamming_Distance.append(Hamming_Distance)
print("Average Hamming Distance of Cluster 0: ",str(mean(cluster0_Hamming_Distance)))

cluster1_Hamming_Distance=[]
for i in range(len(cluster1_true_labels)):
    cluster1_true_label=cluster1_true_labels[i]
    cluster1_pred_label=cluster1_pred_labels[i]
    Hamming_Distance=distance.hamming(cluster1_true_label,cluster1_pred_label)
    cluster1_Hamming_Distance.append(Hamming_Distance)
print("Average Hamming Distance of Cluster 1: ",str(mean(cluster1_Hamming_Distance)))


cluster2_Hamming_Distance=[]
for i in range(len(cluster2_true_labels)):
    cluster2_true_label=cluster2_true_labels[i]
    cluster2_pred_label=cluster2_pred_labels[i]
    Hamming_Distance=distance.hamming(cluster2_true_label,cluster2_pred_label)
    cluster2_Hamming_Distance.append(Hamming_Distance)
print("Average Hamming Distance of Cluster 2: ",str(mean(cluster2_Hamming_Distance)))


cluster3_Hamming_Distance=[]
for i in range(len(cluster3_true_labels)):
    cluster3_true_label=cluster3_true_labels[i]
    cluster3_pred_label=cluster3_pred_labels[i]
    Hamming_Distance=distance.hamming(cluster3_true_label,cluster3_pred_label)
    cluster3_Hamming_Distance.append(Hamming_Distance)
print("Average Hamming Distance of Cluster 3: "+str(mean(cluster3_Hamming_Distance)))

Average Hamming Distance of Cluster 0:  0.028494020926756354
Average Hamming Distance of Cluster 1:  0.444836865119408
Average Hamming Distance of Cluster 2:  0.5150339476236664
Average Hamming Distance of Cluster 3: 0.14006514657980457


In [61]:
cluster0_Hamming_Loss=[]
for i in range(len(cluster0_true_labels)):
    cluster0_true_label=cluster0_true_labels[i]
    cluster0_pred_label=cluster0_pred_labels[i]
    Hamming_Loss=hamming_loss(cluster0_true_label,cluster0_pred_label)
    cluster0_Hamming_Loss.append(Hamming_Loss)
print("Average Hamming Loss of Cluster 0: ",str(mean(cluster0_Hamming_Loss)))

cluster1_Hamming_Loss=[]
for i in range(len(cluster1_true_labels)):
    cluster1_true_label=cluster1_true_labels[i]
    cluster1_pred_label=cluster1_pred_labels[i]
    Hamming_Loss=hamming_loss(cluster1_true_label,cluster1_pred_label)
    cluster1_Hamming_Loss.append(Hamming_Loss)
print("Average Hamming Loss of Cluster 1: ",str(mean(cluster1_Hamming_Loss)))

cluster2_Hamming_Loss=[]
for i in range(len(cluster2_true_labels)):
    cluster2_true_label=cluster2_true_labels[i]
    cluster2_pred_label=cluster2_pred_labels[i]
    Hamming_Loss=hamming_loss(cluster2_true_label,cluster2_pred_label)
    cluster2_Hamming_Loss.append(Hamming_Loss)
print("Average Hamming Loss of Cluster 2: ",str(mean(cluster2_Hamming_Loss)))

cluster3_Hamming_Loss=[]
for i in range(len(cluster3_true_labels)):
    cluster3_true_label=cluster3_true_labels[i]
    cluster3_pred_label=cluster3_pred_labels[i]
    Hamming_Loss=hamming_loss(cluster3_true_label,cluster3_pred_label)
    cluster3_Hamming_Loss.append(Hamming_Loss)
print("Average Hamming Loss of Cluster 3: ",str(mean(cluster3_Hamming_Loss)))

Average Hamming Loss of Cluster 0:  0.028494020926756354
Average Hamming Loss of Cluster 1:  0.444836865119408
Average Hamming Loss of Cluster 2:  0.5150339476236664
Average Hamming Loss of Cluster 3:  0.14006514657980457


In [62]:
acc_list0 = []
for i in range(len(cluster0_true_labels)):
    set_true = set(cluster0_true_labels[i])
    set_pred = set(cluster0_pred_labels[i])
    tmp_a = None
    tmp_a = len(set_true.intersection(set_pred))/\
                    len(set_true.union(set_pred))
    acc_list0.append(tmp_a)
print("Average Hamming Score of Cluster 0: ",np.mean(acc_list0))

acc_list1 = []
for i in range(len(cluster1_true_labels)):
    set_true = set(cluster1_true_labels[i])
    set_pred = set(cluster1_pred_labels[i])
    tmp_a = None
    tmp_a = len(set_true.intersection(set_pred))/\
                    len(set_true.union(set_pred))
    acc_list1.append(tmp_a)
print("Average Hamming Score of Cluster 1: ",np.mean(acc_list1))

acc_list2 = []
for i in range(len(cluster2_true_labels)):
    set_true = set(cluster2_true_labels[i])
    set_pred = set(cluster2_pred_labels[i])
    tmp_a = None
    tmp_a = len(set_true.intersection(set_pred))/\
                    len(set_true.union(set_pred))
    acc_list2.append(tmp_a)
print("Average Hamming Score of Cluster 2: ",np.mean(acc_list2))

acc_list3 = []
for i in range(len(cluster3_true_labels)):
    set_true = set(cluster3_true_labels[i])
    set_pred = set(cluster3_pred_labels[i])
    tmp_a = None
    tmp_a = len(set_true.intersection(set_pred))/\
                    len(set_true.union(set_pred))
    acc_list3.append(tmp_a)
print("Average Hamming Score of Cluster 3: ",np.mean(acc_list3))

Average Hamming Score of Cluster 0:  0.9781670403587444
Average Hamming Score of Cluster 1:  0.6374032963336697
Average Hamming Score of Cluster 2:  0.5557710960232783
Average Hamming Score of Cluster 3:  0.8376221498371335
