# 1. Multi-class and Multi-Label Classification Using Support Vector Machines

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
from tabulate import tabulate
import statistics

### (a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [2]:
# Load the Anuran Calls (MFCCs) CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../AnuranCalls/Frogs_MFCCs.csv")

In [3]:
# Draw 70% of the rows at random for training (fixed seed for reproducibility);
# the remaining rows form the test set.
train = data.sample(frac = 0.7, random_state = 42)
# The test rows must be selected while `train` still carries the original row labels.
test = data.drop(train.index).reset_index(drop = True)
train = train.reset_index(drop = True)

# Columns 0-21 are the features. Columns 22 up to (but excluding) the last one
# are the labels; the final column is dropped (presumably a record id - confirm
# against the CSV).
# .copy() makes each frame independent of `train` / `test`, so assigning columns
# to a label frame later never acts on a slice of another DataFrame.
X_train = train.iloc[:, :22].copy()
y_train = train.iloc[:, 22:-1].copy()

X_test = test.iloc[:, :22].copy()
y_test = test.iloc[:, 22:-1].copy()

### (b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance).

#### (i) Research exact match and hamming score/ loss methods for evaluating multilabel classification and use them in evaluating the classifiers in this problem.

Hamming loss - 
It is computed as fraction of labels that are mislabelled. 

Hamming score -
It is computed as fraction of labels that are correctly labelled. 

Exact Match - 
It checks if all labels for a data point are correct or not. If all are correct, only then classification is considered as correct.

Hamming Distance -
The number of labels that do not match for a single data point (here an integer from 0 to 3). It can be computed by comparing the true and predicted label vectors position by position.


In [4]:
def hamming_loss_function(y_test, y_pred, labels=("Family", "Genus", "Species")):
    """Return the Hamming loss: the fraction of (row, label) cells predicted wrongly.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels; must contain every column named in ``labels``.
    y_pred : pandas.DataFrame
        Predicted labels with the same label columns and the same index as
        ``y_test`` (pandas raises ``ValueError`` if the indexes differ).
    labels : sequence of str, optional
        Label columns to compare. Defaults to the three Anuran labels.

    Returns
    -------
    float
        ``mismatches / (len(labels) * len(y_test))``.

    Notes
    -----
    Neither input frame is modified; no helper columns are written back.
    """
    labels = list(labels)
    # Element-wise comparison of the label columns only, counted in one pass.
    mismatches = (y_test[labels] != y_pred[labels]).to_numpy().sum()
    return mismatches / (len(labels) * len(y_test))

In [5]:
def exact_match(y_test, y_pred, labels=("Family", "Genus", "Species")):
    """Return the exact-match score: the fraction of rows with every label correct.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels; must contain every column named in ``labels``.
    y_pred : pandas.DataFrame
        Predicted labels with the same label columns and the same index as
        ``y_test`` (pandas raises ``ValueError`` if the indexes differ).
    labels : sequence of str, optional
        Label columns that must all match for a row to count as correct.

    Returns
    -------
    float
        Number of fully correct rows divided by ``len(y_test)``.

    Notes
    -----
    Neither input frame is modified; no helper column is written back.
    """
    labels = list(labels)
    # A row counts only if all of its label columns agree.
    row_is_exact = (y_test[labels] == y_pred[labels]).all(axis=1)
    return row_is_exact.sum() / len(y_test)

#### (ii) Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized and raw attributes and report the results.

##### RAW Data

In [6]:
# Gaussian-kernel SVM applied to the raw (unscaled) features.
# NOTE(review): SVC always trains one-vs-one internally; decision_function_shape='ovr'
# only reshapes the decision function. Wrap the SVC in OneVsRestClassifier if
# genuine one-vs-all training is required.
svm = make_pipeline(
    SVC(kernel = 'rbf', decision_function_shape = 'ovr'),
)

# Candidate penalty weights (C) and kernel widths (gamma) for the grid search.
params = {
    "svc__C": [1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3],
    "svc__gamma": np.linspace(0.1, 2, 5).tolist(),
}

# 10-fold cross-validated search over every (C, gamma) pair.
grid_search = GridSearchCV(
    estimator = svm,
    param_grid = params,
    cv = 10,
)

In [7]:
# Grid-search the raw-feature SVM for the Family label.
# fit() returns the GridSearchCV instance itself, so grid_result_family is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_family = grid_search.fit(X_train, y_train["Family"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_family.best_score_, grid_result_family.best_params_))
print("Best Train Error: %f" % (1-grid_result_family.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_family.best_estimator_.score(X_test, y_test["Family"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_family.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Family predictions for both splits; they feed the multi-label metrics.
y_pred_train_family = grid_result_family.best_estimator_.predict(X_train)
y_pred_family = grid_result_family.best_estimator_.predict(X_test)

Best Train Accuracy: 0.992654 using {'svc__C': 10.0, 'svc__gamma': 2.0}
Best Train Error: 0.007346
Best Test Accuracy: 0.992589 using {'svc__C': 10.0, 'svc__gamma': 2.0}
Best Test Error: 0.007411


In [8]:
# Grid-search the raw-feature SVM for the Genus label.
# fit() returns the GridSearchCV instance itself, so grid_result_genus is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_genus = grid_search.fit(X_train, y_train["Genus"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_genus.best_score_, grid_result_genus.best_params_))
print("Best Train Error: %f" % (1-grid_result_genus.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_genus.best_estimator_.score(X_test, y_test["Genus"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_genus.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Genus predictions for both splits; they feed the multi-label metrics.
y_pred_train_genus = grid_result_genus.best_estimator_.predict(X_train)
y_pred_genus = grid_result_genus.best_estimator_.predict(X_test)

Best Train Accuracy: 0.991660 using {'svc__C': 10.0, 'svc__gamma': 2.0}
Best Train Error: 0.008340
Best Test Accuracy: 0.991200 using {'svc__C': 10.0, 'svc__gamma': 2.0}
Best Test Error: 0.008800


In [9]:
# Grid-search the raw-feature SVM for the Species label.
# fit() returns the GridSearchCV instance itself, so grid_result_species is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_species = grid_search.fit(X_train, y_train["Species"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_species.best_score_, grid_result_species.best_params_))
print("Best Train Error: %f" % (1-grid_result_species.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_species.best_estimator_.score(X_test, y_test["Species"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_species.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Species predictions for both splits; they feed the multi-label metrics.
y_pred_train_species = grid_result_species.best_estimator_.predict(X_train)
y_pred_species = grid_result_species.best_estimator_.predict(X_test)

Best Train Accuracy: 0.991263 using {'svc__C': 10.0, 'svc__gamma': 1.525}
Best Train Error: 0.008737
Best Test Accuracy: 0.989347 using {'svc__C': 10.0, 'svc__gamma': 1.525}
Best Test Error: 0.010653


In [10]:
# Combine the three per-label prediction vectors into one frame per split,
# using the same column names as the true-label frames.
y_pred_train = pd.DataFrame({
    "Family": pd.Series(y_pred_train_family),
    "Genus": pd.Series(y_pred_train_genus),
    "Species": pd.Series(y_pred_train_species),
})

y_pred = pd.DataFrame({
    "Family": pd.Series(y_pred_family),
    "Genus": pd.Series(y_pred_genus),
    "Species": pd.Series(y_pred_species),
})

#### Hamming Score/Loss

In [11]:
# Hamming loss and score (1 - loss) of the combined predictions on the training split.
print("Train Scores")
loss = hamming_loss_function(y_train, y_pred_train)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

# Same metrics on the held-out test split (`loss` is reused).
print("Test Scores")
loss = hamming_loss_function(y_test, y_pred)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

Train Scores
Hamming Loss obtained: 0.00013238019592268997
Hamming Score obtained: 0.9998676198040773
Test Scores
Hamming Loss obtained: 0.00895476300756523
Hamming Score obtained: 0.9910452369924347


#### Exact Match Score/Loss

In [12]:
# Exact-match score and loss (1 - score) of the combined predictions on the training split.
print("Train Scores")
exact_match_score = exact_match(y_train, y_pred_train)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

# Same metrics on the held-out test split (`exact_match_score` is reused).
print("Test Scores")
exact_match_score = exact_match(y_test, y_pred)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

Train Scores
Exact match Loss obtained: 0.0003971405877680745
Exact match Score obtained: 0.9996028594122319
Test Scores
Exact match Loss obtained: 0.011579434923575715
Exact match Score obtained: 0.9884205650764243


##### Standardized Data

In [13]:
# Gaussian-kernel SVM preceded by feature standardization; the scaler is part of
# the pipeline, so it is fitted inside each cross-validation fold.
# NOTE(review): SVC always trains one-vs-one internally; decision_function_shape='ovr'
# only reshapes the decision function. Wrap the SVC in OneVsRestClassifier if
# genuine one-vs-all training is required.
svm = make_pipeline(
    StandardScaler(),
    SVC(kernel = 'rbf', decision_function_shape = 'ovr'),
)

# Candidate penalty weights (C) and kernel widths (gamma) for the grid search.
params = {
    "svc__C": [1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3],
    "svc__gamma": np.linspace(0.1, 2, 5).tolist(),
}

# 10-fold cross-validated search over every (C, gamma) pair.
grid_search = GridSearchCV(
    estimator = svm,
    param_grid = params,
    cv = 10,
)

In [14]:
# Grid-search the standardized-feature SVM for the Family label.
# fit() returns the GridSearchCV instance itself, so grid_result_family is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_family = grid_search.fit(X_train, y_train["Family"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_family.best_score_, grid_result_family.best_params_))
print("Best Train Error: %f" % (1-grid_result_family.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_family.best_estimator_.score(X_test, y_test["Family"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_family.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Family predictions for both splits; they feed the multi-label metrics.
y_pred_train_family = grid_result_family.best_estimator_.predict(X_train)
y_pred_family = grid_result_family.best_estimator_.predict(X_test)

Best Train Accuracy: 0.989871 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Train Error: 0.010129
Best Test Accuracy: 0.989810 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Test Error: 0.010190


In [15]:
# Grid-search the standardized-feature SVM for the Genus label.
# fit() returns the GridSearchCV instance itself, so grid_result_genus is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_genus = grid_search.fit(X_train, y_train["Genus"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_genus.best_score_, grid_result_genus.best_params_))
print("Best Train Error: %f" % (1-grid_result_genus.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_genus.best_estimator_.score(X_test, y_test["Genus"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_genus.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Genus predictions for both splits; they feed the multi-label metrics.
y_pred_train_genus = grid_result_genus.best_estimator_.predict(X_train)
y_pred_genus = grid_result_genus.best_estimator_.predict(X_test)

Best Train Accuracy: 0.986696 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Train Error: 0.013304
Best Test Accuracy: 0.987957 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Test Error: 0.012043


In [16]:
# Grid-search the standardized-feature SVM for the Species label.
# fit() returns the GridSearchCV instance itself, so grid_result_species is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_species = grid_search.fit(X_train, y_train["Species"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_species.best_score_, grid_result_species.best_params_))
print("Best Train Error: %f" % (1-grid_result_species.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_species.best_estimator_.score(X_test, y_test["Species"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_species.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Species predictions for both splits; they feed the multi-label metrics.
y_pred_train_species = grid_result_species.best_estimator_.predict(X_train)
y_pred_species = grid_result_species.best_estimator_.predict(X_test)

Best Train Accuracy: 0.984710 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Train Error: 0.015290
Best Test Accuracy: 0.986105 using {'svc__C': 10.0, 'svc__gamma': 0.1}
Best Test Error: 0.013895


In [17]:
# Combine the three per-label prediction vectors into one frame per split,
# using the same column names as the true-label frames.
y_pred_train = pd.DataFrame({
    "Family": pd.Series(y_pred_train_family),
    "Genus": pd.Series(y_pred_train_genus),
    "Species": pd.Series(y_pred_train_species),
})

y_pred = pd.DataFrame({
    "Family": pd.Series(y_pred_family),
    "Genus": pd.Series(y_pred_genus),
    "Species": pd.Series(y_pred_species),
})

#### Hamming Score/Loss

In [18]:
# Hamming loss and score (1 - loss) of the combined predictions on the training split.
print("Train Scores")
loss = hamming_loss_function(y_train, y_pred_train)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

# Same metrics on the held-out test split (`loss` is reused).
print("Test Scores")
loss = hamming_loss_function(y_test, y_pred)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

Train Scores
Hamming Loss obtained: 0.0
Hamming Score obtained: 1.0
Test Scores
Hamming Loss obtained: 0.012042612320518759
Hamming Score obtained: 0.9879573876794813


#### Exact Match Score/Loss

In [19]:
# Exact-match score and loss (1 - score) of the combined predictions on the training split.
print("Train Scores")
exact_match_score = exact_match(y_train, y_pred_train)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

# Same metrics on the held-out test split (`exact_match_score` is reused).
print("Test Scores")
exact_match_score = exact_match(y_test, y_pred)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

Train Scores
Exact match Loss obtained: 0.0
Exact match Score obtained: 1.0
Test Scores
Exact match Loss obtained: 0.016674386289949106
Exact match Score obtained: 0.9833256137100509


#### Report Results 

| Model               | Data                | Hamming Loss | Hamming Score | Exact Match Loss | Exact Match Score |
|---------------------|---------------------|--------------|---------------|------------------|-------------------|
| SVM - Gaussian, OVR | Raw Train           | 0.00013      | 0.99987       | 0.00040          | 0.99960           |
| SVM - Gaussian, OVR | Raw Test            | 0.00895      | 0.99105       | 0.01158          | 0.98842           |
| SVM - Gaussian, OVR | Standardized Train  | 0.0          | 1.0           | 0.0              | 1.0               |
| SVM - Gaussian, OVR | Standardized Test   | 0.01204      | 0.98796       | 0.01667          | 0.98333           |

We notice that the exact match loss is higher than the Hamming loss. This is because exact match counts a prediction as wrong if even one of the 3 predicted labels is wrong, whereas Hamming loss only penalizes the individual labels that are wrong.

We can also see that the SVM on raw data outperformed the SVM on standardized data on the test set, while on the training set the SVM on standardized data performed better. This is possibly the result of overfitting the training data, which kept it from generalizing as well to the test data.

#### (iii) Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [20]:
# L1-penalized linear SVM on standardized features. dual=False is passed together
# with penalty='l1', and max_iter is raised to 10000.
svm_l1 = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=42, max_iter = 10000, penalty = 'l1', dual = False),
)
# Candidate penalty weights (C) for the 10-fold cross-validated search.
params = {
    "linearsvc__C": [1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3],
}
grid_search = GridSearchCV(
    estimator = svm_l1,
    param_grid = params,
    cv = 10,
)

In [21]:
# Grid-search the L1-penalized linear SVM for the Family label.
# fit() returns the GridSearchCV instance itself, so grid_result_family is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_family = grid_search.fit(X_train, y_train["Family"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_family.best_score_, grid_result_family.best_params_))
print("Best Train Error: %f" % (1-grid_result_family.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_family.best_estimator_.score(X_test, y_test["Family"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_family.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Family predictions for both splits; they feed the multi-label metrics.
y_pred_train_family = grid_result_family.best_estimator_.predict(X_train)
y_pred_family = grid_result_family.best_estimator_.predict(X_test)

Best Train Accuracy: 0.932493 using {'linearsvc__C': 1}
Best Train Error: 0.067507
Best Test Accuracy: 0.941176 using {'linearsvc__C': 1}
Best Test Error: 0.058824


In [22]:
# Grid-search the L1-penalized linear SVM for the Genus label.
# fit() returns the GridSearchCV instance itself, so grid_result_genus is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_genus = grid_search.fit(X_train, y_train["Genus"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_genus.best_score_, grid_result_genus.best_params_))
print("Best Train Error: %f" % (1-grid_result_genus.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_genus.best_estimator_.score(X_test, y_test["Genus"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_genus.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Genus predictions for both splits; they feed the multi-label metrics.
y_pred_train_genus = grid_result_genus.best_estimator_.predict(X_train)
y_pred_genus = grid_result_genus.best_estimator_.predict(X_test)

Best Train Accuracy: 0.950162 using {'linearsvc__C': 10.0}
Best Train Error: 0.049838
Best Test Accuracy: 0.956925 using {'linearsvc__C': 10.0}
Best Test Error: 0.043075


In [23]:
# Grid-search the L1-penalized linear SVM for the Species label.
# fit() returns the GridSearchCV instance itself, so grid_result_species is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_species = grid_search.fit(X_train, y_train["Species"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_species.best_score_, grid_result_species.best_params_))
print("Best Train Error: %f" % (1-grid_result_species.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_species.best_estimator_.score(X_test, y_test["Species"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_species.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Species predictions for both splits; they feed the multi-label metrics.
y_pred_train_species = grid_result_species.best_estimator_.predict(X_train)
y_pred_species = grid_result_species.best_estimator_.predict(X_test)

Best Train Accuracy: 0.956913 using {'linearsvc__C': 10.0}
Best Train Error: 0.043087
Best Test Accuracy: 0.956925 using {'linearsvc__C': 10.0}
Best Test Error: 0.043075


In [24]:
# Combine the three per-label prediction vectors into one frame per split,
# using the same column names as the true-label frames.
y_pred_train = pd.DataFrame({
    "Family": pd.Series(y_pred_train_family),
    "Genus": pd.Series(y_pred_train_genus),
    "Species": pd.Series(y_pred_train_species),
})

y_pred = pd.DataFrame({
    "Family": pd.Series(y_pred_family),
    "Genus": pd.Series(y_pred_genus),
    "Species": pd.Series(y_pred_species),
})

#### Hamming Score/Loss

In [25]:
# Hamming loss and score (1 - loss) of the combined predictions on the training split.
print("Train Scores")
loss = hamming_loss_function(y_train, y_pred_train)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

# Same metrics on the held-out test split (`loss` is reused).
print("Test Scores")
loss = hamming_loss_function(y_test, y_pred)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

Train Scores
Hamming Loss obtained: 0.05003971405877681
Hamming Score obtained: 0.9499602859412232
Test Scores
Hamming Loss obtained: 0.04832484174772271
Hamming Score obtained: 0.9516751582522773


#### Exact Match Score/Loss

In [26]:
# Exact-match score and loss (1 - score) of the combined predictions on the training split.
print("Train Scores")
exact_match_score = exact_match(y_train, y_pred_train)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

# Same metrics on the held-out test split (`exact_match_score` is reused).
print("Test Scores")
exact_match_score = exact_match(y_test, y_pred)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

Train Scores
Exact match Loss obtained: 0.08200953137410638
Exact match Score obtained: 0.9179904686258936
Test Scores
Exact match Loss obtained: 0.08244557665585917
Exact match Score obtained: 0.9175544233441408


#### Report Results 

| Model                 |   Data   |   Hamming Loss   |   Hamming Score   |   Exact Match Loss  | Exact Match Score  |
|-----------------------|----------|------------------|-------------------|---------------------|--------------------|
| SVM - L1 Penalized    |  Train   | 0.05004          | 0.94996           | 0.08201             | 0.91799            |
| SVM - L1 Penalized    |  Test    | 0.04832          | 0.95168           | 0.08245             | 0.91755            |

The SVM model with L1 penalization has performed somewhat worse than the SVM with the Gaussian kernel: the losses obtained are higher than in the previous experiment.

We notice that the exact match loss is higher than the Hamming loss. This is because exact match counts a prediction as wrong if even one of the 3 predicted labels is wrong, whereas Hamming loss only penalizes the individual labels that are wrong.

scikit-learn's `LinearSVC` uses the one-vs-rest scheme for multi-class problems by default (`multi_class='ovr'`).

#### (iv) Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [27]:
# imbalanced-learn pipeline: SMOTE oversampling, then standardization, then the
# L1-penalized linear SVM. Because SMOTE is a pipeline step, resampling is
# applied when the pipeline is fitted rather than to the data up front.
smote_steps = [
    ('smote', SMOTE(random_state=11)),
    ('standardize', StandardScaler()),
    ('classifier', LinearSVC(random_state=42, max_iter = 1000000, penalty = 'l1', dual = False)),
]
svm_l1 = imbpipeline(steps = smote_steps)

# Candidate penalty weights (C) for the 10-fold cross-validated search.
params = {
    "classifier__C": [1e-3, 1e-2, 1e-1, 1, 1e+1, 1e+2, 1e+3],
}
grid_search = GridSearchCV(
    estimator = svm_l1,
    param_grid = params,
    cv = 10,
)

In [28]:
# Grid-search the SMOTE + L1-penalized linear SVM pipeline for the Family label.
# fit() returns the GridSearchCV instance itself, so grid_result_family is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_family = grid_search.fit(X_train, y_train["Family"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_family.best_score_, grid_result_family.best_params_))
print("Best Train Error: %f" % (1-grid_result_family.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_family.best_estimator_.score(X_test, y_test["Family"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_family.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Family predictions for both splits; they feed the multi-label metrics.
y_pred_train_family = grid_result_family.best_estimator_.predict(X_train)
y_pred_family = grid_result_family.best_estimator_.predict(X_test)

Best Train Accuracy: 0.914821 using {'classifier__C': 100.0}
Best Train Error: 0.085179
Best Test Accuracy: 0.925428 using {'classifier__C': 100.0}
Best Test Error: 0.074572


In [29]:
# Grid-search the SMOTE + L1-penalized linear SVM pipeline for the Genus label.
# fit() returns the GridSearchCV instance itself, so grid_result_genus is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_genus = grid_search.fit(X_train, y_train["Genus"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_genus.best_score_, grid_result_genus.best_params_))
print("Best Train Error: %f" % (1-grid_result_genus.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_genus.best_estimator_.score(X_test, y_test["Genus"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_genus.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Genus predictions for both splits; they feed the multi-label metrics.
y_pred_train_genus = grid_result_genus.best_estimator_.predict(X_train)
y_pred_genus = grid_result_genus.best_estimator_.predict(X_test)

Best Train Accuracy: 0.913234 using {'classifier__C': 10.0}
Best Train Error: 0.086766
Best Test Accuracy: 0.924039 using {'classifier__C': 10.0}
Best Test Error: 0.075961


In [30]:
# Grid-search the SMOTE + L1-penalized linear SVM pipeline for the Species label.
# fit() returns the GridSearchCV instance itself, so grid_result_species is an alias
# of grid_search: a later grid_search.fit(...) also changes what this name refers to.
grid_result_species = grid_search.fit(X_train, y_train["Species"])
# NOTE(review): best_score_ is the mean cross-validated accuracy of the best
# parameters, not accuracy measured on the training set, despite the "Train" label.
print("Best Train Accuracy: %f using %s" % (grid_result_species.best_score_, grid_result_species.best_params_))
print("Best Train Error: %f" % (1-grid_result_species.best_score_))
# Accuracy of the refitted best pipeline on the held-out test split.
test_score = grid_result_species.best_estimator_.score(X_test, y_test["Species"])
print("Best Test Accuracy: %f using %s" % (test_score, grid_result_species.best_params_))
print("Best Test Error: %f" % (1-test_score))
# Keep the Species predictions for both splits; they feed the multi-label metrics.
y_pred_train_species = grid_result_species.best_estimator_.predict(X_train)
y_pred_species = grid_result_species.best_estimator_.predict(X_test)

Best Train Accuracy: 0.953339 using {'classifier__C': 100.0}
Best Train Error: 0.046661
Best Test Accuracy: 0.957388 using {'classifier__C': 100.0}
Best Test Error: 0.042612


In [31]:
# Combine the three per-label prediction vectors into one frame per split,
# using the same column names as the true-label frames.
y_pred_train = pd.DataFrame({
    "Family": pd.Series(y_pred_train_family),
    "Genus": pd.Series(y_pred_train_genus),
    "Species": pd.Series(y_pred_train_species),
})

y_pred = pd.DataFrame({
    "Family": pd.Series(y_pred_family),
    "Genus": pd.Series(y_pred_genus),
    "Species": pd.Series(y_pred_species),
})

#### Hamming Score/Loss for Train and Test Data

In [32]:
# Hamming loss and score (1 - loss) of the combined predictions on the training split.
print("Train Scores")
loss = hamming_loss_function(y_train, y_pred_train)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

# Same metrics on the held-out test split (`loss` is reused).
print("Test Scores")
loss = hamming_loss_function(y_test, y_pred)
print("Hamming Loss obtained: " + str(loss))
print("Hamming Score obtained: " + str(1-loss))

Train Scores
Hamming Loss obtained: 0.06817580090018534
Hamming Score obtained: 0.9318241990998146
Test Scores
Hamming Loss obtained: 0.06438165817508106
Hamming Score obtained: 0.935618341824919


#### Exact Match Score/Loss for Train and Test Data

In [33]:
# Exact-match score and loss (1 - score) of the combined predictions on the training split.
print("Train Scores")
exact_match_score = exact_match(y_train, y_pred_train)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

# Same metrics on the held-out test split (`exact_match_score` is reused).
print("Test Scores")
exact_match_score = exact_match(y_test, y_pred)
print("Exact match Loss obtained: " + str(1-exact_match_score))
print("Exact match Score obtained: " + str(exact_match_score))

Train Scores
Exact match Loss obtained: 0.13602065131056396
Exact match Score obtained: 0.863979348689436
Test Scores
Exact match Loss obtained: 0.12459471977767489
Exact match Score obtained: 0.8754052802223251


#### Report Results 

| Model                 |   Data   |   Hamming Loss   |   Hamming Score   |   Exact Match Loss  | Exact Match Score  |
|-----------------------|----------|------------------|-------------------|---------------------|--------------------|
| SVM-L1 Penalty, Smote |  Train   | 0.06818          | 0.93182           | 0.13602             | 0.86398            |
| SVM-L1 Penalty, Smote |  Test    | 0.06438          | 0.93562           | 0.12459             | 0.87541            |

The SVM model with L1 penalization and SMOTE has performed worse than the two SVM models tried before. The test Hamming score is about 93.6% and the exact match score is as low as about 87.5%.

We notice that the exact match loss is higher than the Hamming loss. This is because exact match counts a prediction as wrong if even one of the 3 predicted labels is wrong, whereas Hamming loss only penalizes the individual labels that are wrong.

scikit-learn's `LinearSVC` uses the one-vs-rest scheme for multi-class problems by default (`multi_class='ovr'`).

# 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

### Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

In [34]:
def hamming_distance_function(y_test, y_pred, labels=("Family", "Genus", "Species")):
    """Return the mean and standard deviation of the per-row Hamming distance.

    The Hamming distance of a row is the number of its label columns whose
    predicted value differs from the true value.

    Parameters
    ----------
    y_test : pandas.DataFrame
        True labels; must contain every column named in ``labels``.
    y_pred : pandas.DataFrame
        Predicted labels with the same label columns and the same index as
        ``y_test`` (pandas raises ``ValueError`` if the indexes differ).
    labels : sequence of str, optional
        Label columns to compare. Defaults to the three Anuran labels.

    Returns
    -------
    tuple of (float, float)
        ``(mean distance, standard deviation)``; the standard deviation is the
        pandas ``Series.std`` of the per-row distances (sample std, ddof=1).

    Notes
    -----
    Neither input frame is modified; no helper columns are written back.
    """
    labels = list(labels)
    # Count, for every row, how many of its labels were predicted wrongly.
    per_row_distance = (y_test[labels] != y_pred[labels]).sum(axis=1)
    return per_row_distance.sum() / len(y_test), per_row_distance.std()

### (a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise). Choose k ∈ {1, 2, . . . , 50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method you know

In [35]:
# Whole data set (no train/test split) for clustering: columns 0-21 are the features.
X = data.iloc[:, :22]
# Columns 22 up to (but excluding) the last one are the labels. .copy() makes `y`
# independent of `data`, so adding a column to it never acts on a slice of `data`.
y = data.iloc[:, 22:-1].copy()

In [36]:
def kmeans_silhouette():
    """Fit KMeans on the global ``X`` for k = 1..50 and pick k by silhouette score.

    Prints the silhouette coefficient for every k. A single cluster is given
    a placeholder score of 0 instead of calling ``silhouette_score``.

    Returns
    -------
    tuple
        ``(best_k, best_score)``: the k with the highest coefficient (the
        smallest such k on ties) and that coefficient.
    """
    scores_by_k = {}
    for k in range(1, 51):
        cluster_ids = KMeans(n_clusters=k).fit(X).labels_
        if k == 1:
            score = 0
        else:
            score = silhouette_score(X, cluster_ids, metric='euclidean')
        print("For n_clusters={}, The Silhouette Coefficient is {}".format(k, score))
        scores_by_k[k] = score
    best_k = max(scores_by_k, key=scores_by_k.get)
    best_score = max(scores_by_k.values())
    return (best_k, best_score)

def find_best_k_refit():
    """Select k with ``kmeans_silhouette()`` and return a KMeans model fitted on ``X``.

    The returned model is a new fit with the chosen k, not the model whose
    silhouette score was evaluated during the search.
    """
    best_k, best_score = kmeans_silhouette()
    print("Best k is " + str(best_k) + " based on highest Silhouette coefficient score: " + str(best_score))
    return KMeans(n_clusters=best_k).fit(X)

### (b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus and species.

In [37]:
def _majority_per_cluster(labelled, label_col):
    """Return one row per cluster: its most frequent ``label_col`` value and that value's count."""
    return (
        labelled.groupby(['kmeans_label'])[label_col]
        .value_counts()                          # sorted by frequency within each cluster
        .rename('kmeans_' + label_col.lower())   # count column: kmeans_family / kmeans_genus / kmeans_species
        .reset_index()
        .drop_duplicates('kmeans_label')         # keep the first (most frequent) row per cluster
    )


def cluster_majority():
    """Cluster ``X`` with the silhouette-selected k and label each row by its cluster's majority.

    For each of Family, Genus and Species, the majority value of every cluster
    is printed as a markdown table and then assigned to all rows of that cluster.
    The global ``y`` is only read; cluster ids are attached to a private copy.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(all_joined_test, all_joined_pred)``: the true labels and the
        cluster-majority labels, both with columns Family, Genus, Species and
        the index of ``y``.
    """
    label_cols = ["Family", "Genus", "Species"]
    kmeans = find_best_k_refit()
    # Attach cluster ids by position; .assign returns a new frame, leaving y untouched.
    labelled = y[label_cols].assign(kmeans_label=kmeans.labels_)

    predicted = {}
    for label_col in label_cols:
        majority = _majority_per_cluster(labelled, label_col)
        print(label_col + " cluster majority\n")
        print(majority.to_markdown())
        print("\n")
        # Map each row's cluster id to that cluster's majority label.
        lookup = majority.set_index('kmeans_label')[label_col]
        predicted[label_col] = labelled['kmeans_label'].map(lookup)

    all_joined_test = labelled[label_cols].copy()
    all_joined_pred = pd.DataFrame(predicted, index=labelled.index)
    return all_joined_test, all_joined_pred

### (c) Now for each cluster you have a majority label triplet (family, genus, species). Calculate the average Hamming distance, Hamming score, and Hamming loss between the true labels and the labels assigned by clusters.

In [38]:
def kmeans_hamming(all_joined_test, all_joined_pred):
    """Compute, print and return the Hamming metrics for one clustering run.

    Parameters
    ----------
    all_joined_test
        True labels, passed unchanged to ``hamming_loss_function`` and
        ``hamming_distance_function``.
    all_joined_pred
        Labels assigned by the clusters, passed unchanged to the same helpers.

    Returns
    -------
    tuple
        ``(hamming_loss, hamming_score, hamming_dist, hamming_dist_stddev)``
        where the score is ``1 - hamming_loss`` and the last two values are
        the pair returned by ``hamming_distance_function``.
    """
    loss = hamming_loss_function(all_joined_test, all_joined_pred)
    # The distance helper returns a pair; presumably (mean, standard deviation)
    # of the per-sample Hamming distance -- confirm against its definition.
    dist, dist_stddev = hamming_distance_function(all_joined_test, all_joined_pred)
    # The Hamming score is the complement of the Hamming loss.
    score = 1 - loss

    report = (
        ("Hamming Loss obtained: ", loss),
        ("Hamming Score obtained: ", score),
        ("Hamming Distance obtained: ", dist),
        ("Hamming Distance Standard Deviation obtained: ", dist_stddev),
    )
    for label, value in report:
        print(label + str(value))

    return loss, score, dist, dist_stddev

#### Run the clustering 50 times (Monte Carlo simulation) and find the average Hamming distance, Hamming score, and Hamming loss

In [39]:
# Per-iteration histories of each metric. Despite the `_dict` suffix these are
# plain lists; the names are kept because the summary cell reads them.
hamming_loss_dict, hamming_score_dict, hamming_dist_dict, hamming_dist_std_dict = [], [], [], []

# Repeat the clustering + majority-label evaluation 50 times (iterations 1..50).
for i in range(1, 51):
    print("Monte Carlo Simulation Iteration: " + str(i))
    all_joined_test, all_joined_pred = cluster_majority()
    hamming_loss, hamming_score, hamming_dist, hamming_dist_stddev = kmeans_hamming(
        all_joined_test, all_joined_pred
    )
    # Record this iteration's metrics in the matching history list.
    for history, value in (
        (hamming_loss_dict, hamming_loss),
        (hamming_score_dict, hamming_score),
        (hamming_dist_dict, hamming_dist),
        (hamming_dist_std_dict, hamming_dist_stddev),
    ):
        history.append(value)

Monte Carlo Simulation Iteration: 1
For n_clusters=1, The Silhouette Coefficient is 0
For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.2644444827073779
For n_clusters=7, The Silhouette Coefficient is 0.2711094888906035
For n_clusters=8, The Silhouette Coefficient is 0.2808269101044866
For n_clusters=9, The Silhouette Coefficient is 0.26382411375888454
For n_clusters=10, The Silhouette Coefficient is 0.2731030040246207
For n_clusters=11, The Silhouette Coefficient is 0.2736414071621672
For n_clusters=12, The Silhouette Coefficient is 0.2837375255329622
For n_clusters=13, The Silhouette Coefficient is 0.2588879935751163
For n_clusters=14, The Silhouette Coefficient is 0.2826271255287889
For n_clusters=15, The Silhouette Co

For n_clusters=50, The Silhouette Coefficient is 0.22975952529854923
Best k is 4 based on highest Silhouette coefficient score: 0.3787509343305295
Family cluster majority

|    |   kmeans_label | Family          |   kmeans_family |
|---:|---------------:|:----------------|----------------:|
|  0 |              0 | Leptodactylidae |            3467 |
|  2 |              1 | Hylidae         |            1245 |
|  6 |              2 | Dendrobatidae   |             500 |
|  9 |              3 | Hylidae         |             590 |


Genus cluster majority

|    |   kmeans_label | Genus     |   kmeans_genus |
|---:|---------------:|:----------|---------------:|
|  0 |              0 | Adenomera |           3466 |
|  5 |              1 | Hypsiboas |           1038 |
| 13 |              2 | Ameerega  |            500 |
| 19 |              3 | Hypsiboas |            542 |


Species cluster majority

|    |   kmeans_label | Species                |   kmeans_species |
|---:|---------------:|:----

For n_clusters=28, The Silhouette Coefficient is 0.24373003481958608
For n_clusters=29, The Silhouette Coefficient is 0.2684441322699683
For n_clusters=30, The Silhouette Coefficient is 0.26253821018598117
For n_clusters=31, The Silhouette Coefficient is 0.2641189481588573
For n_clusters=32, The Silhouette Coefficient is 0.2611438481567203
For n_clusters=33, The Silhouette Coefficient is 0.24208749404260188
For n_clusters=34, The Silhouette Coefficient is 0.2644255963399897
For n_clusters=35, The Silhouette Coefficient is 0.24229663206583113
For n_clusters=36, The Silhouette Coefficient is 0.26456940052637573
For n_clusters=37, The Silhouette Coefficient is 0.2583173104079802
For n_clusters=38, The Silhouette Coefficient is 0.22199671568416068
For n_clusters=39, The Silhouette Coefficient is 0.23760463863437406
For n_clusters=40, The Silhouette Coefficient is 0.24901895624755147
For n_clusters=41, The Silhouette Coefficient is 0.21738693094615114
For n_clusters=42, The Silhouette Coeff

For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.27088941333497224
For n_clusters=8, The Silhouette Coefficient is 0.27012114209263827
For n_clusters=9, The Silhouette Coefficient is 0.26153212403634274
For n_clusters=10, The Silhouette Coefficient is 0.2636494732462223
For n_clusters=11, The Silhouette Coefficient is 0.2616964787293548
For n_clusters=12, The Silhouette Coefficient is 0.2743155711381428
For n_clusters=13, The Silhouette Coefficient is 0.27800396569597385
For n_clusters=14, The Silhouette Coefficient is 0.28070139350041134
For n_clusters=15, The Silhouette Coefficient is 0.2702577845891809
For n_clusters=16, The Silhouette Coefficient is 0.2599976191323615
For n_clusters=17, The Silhouette Coefficient is 0.2636772959168412
For n_clusters=18, The Silhouette Coefficient is 0.2808921552912572
For n_clusters=19, The Silhouette Coefficient is 0.26533489733973437
For n_clusters=20, The Silhouette Coefficient i

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.37149147864771287
For n_clusters=6, The Silhouette Coefficient is 0.2644130229717876
For n_clusters=7, The Silhouette Coefficient is 0.2708824275211291
For n_clusters=8, The Silhouette Coefficient is 0.2701445428442861
For n_clusters=9, The Silhouette Coefficient is 0.2639257049296656
For n_clusters=10, The Silhouette Coefficient is 0.26259858917906165
For n_clusters=11, The Silhouette Coefficient is 0.2690578442867476
For n_clusters=12, The Silhouette Coefficient is 0.2720032759206621
For n_clusters=13, The Silhouette Coefficient is 0.2782611143964412
For n_clusters=14, The Silhouette Coefficient is 0.26573093646333557
For n_clusters=15, The Silhouette Coefficient is 0.2891472172141786
For n_clusters=16, The Silhouette Coefficient is 0.2

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.37170236413433505
For n_clusters=6, The Silhouette Coefficient is 0.2644568489306402
For n_clusters=7, The Silhouette Coefficient is 0.2710721593378213
For n_clusters=8, The Silhouette Coefficient is 0.27018500453263783
For n_clusters=9, The Silhouette Coefficient is 0.26393715199884393
For n_clusters=10, The Silhouette Coefficient is 0.2780019016350448
For n_clusters=11, The Silhouette Coefficient is 0.27172821908571415
For n_clusters=12, The Silhouette Coefficient is 0.27203057402355924
For n_clusters=13, The Silhouette Coefficient is 0.2594674477015496
For n_clusters=14, The Silhouette Coefficient is 0.2822515166755487
For n_clusters=15, The Silhouette Coefficient is 0.2891273205848587
For n_clusters=16, The Silhouette Coefficient is 0

Family cluster majority

|    |   kmeans_label | Family          |   kmeans_family |
|---:|---------------:|:----------------|----------------:|
|  0 |              0 | Leptodactylidae |            3467 |
|  2 |              1 | Hylidae         |            1245 |
|  6 |              2 | Dendrobatidae   |             500 |
|  9 |              3 | Hylidae         |             590 |


Genus cluster majority

|    |   kmeans_label | Genus     |   kmeans_genus |
|---:|---------------:|:----------|---------------:|
|  0 |              0 | Adenomera |           3466 |
|  5 |              1 | Hypsiboas |           1038 |
| 13 |              2 | Ameerega  |            500 |
| 19 |              3 | Hypsiboas |            542 |


Species cluster majority

|    |   kmeans_label | Species                |   kmeans_species |
|---:|---------------:|:-----------------------|-----------------:|
|  0 |              0 | AdenomeraHylaedactylus |             3466 |
|  5 |              1 | HypsiboasCordob

For n_clusters=30, The Silhouette Coefficient is 0.2644842006773677
For n_clusters=31, The Silhouette Coefficient is 0.2632823402387838
For n_clusters=32, The Silhouette Coefficient is 0.24855892556725098
For n_clusters=33, The Silhouette Coefficient is 0.2650967350756459
For n_clusters=34, The Silhouette Coefficient is 0.2548026851093173
For n_clusters=35, The Silhouette Coefficient is 0.2620245250546696
For n_clusters=36, The Silhouette Coefficient is 0.26100249596030234
For n_clusters=37, The Silhouette Coefficient is 0.2548033974913928
For n_clusters=38, The Silhouette Coefficient is 0.23875243382726408
For n_clusters=39, The Silhouette Coefficient is 0.2479132060099851
For n_clusters=40, The Silhouette Coefficient is 0.24200319702156223
For n_clusters=41, The Silhouette Coefficient is 0.24283709900033953
For n_clusters=42, The Silhouette Coefficient is 0.23756129056766062
For n_clusters=43, The Silhouette Coefficient is 0.23799706176082455
For n_clusters=44, The Silhouette Coeffic

For n_clusters=8, The Silhouette Coefficient is 0.2537257899637584
For n_clusters=9, The Silhouette Coefficient is 0.2639296413474319
For n_clusters=10, The Silhouette Coefficient is 0.26353951040288964
For n_clusters=11, The Silhouette Coefficient is 0.2798829810778672
For n_clusters=12, The Silhouette Coefficient is 0.2682085893669204
For n_clusters=13, The Silhouette Coefficient is 0.2597698623280665
For n_clusters=14, The Silhouette Coefficient is 0.28440912166410787
For n_clusters=15, The Silhouette Coefficient is 0.26893084244080656
For n_clusters=16, The Silhouette Coefficient is 0.2726247494805073
For n_clusters=17, The Silhouette Coefficient is 0.29559957113428476
For n_clusters=18, The Silhouette Coefficient is 0.2643718543358035
For n_clusters=19, The Silhouette Coefficient is 0.2721220562310111
For n_clusters=20, The Silhouette Coefficient is 0.26442990548303374
For n_clusters=21, The Silhouette Coefficient is 0.2733018844966309
For n_clusters=22, The Silhouette Coefficient

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.3676927081149251
For n_clusters=4, The Silhouette Coefficient is 0.37888514720477384
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.2644052337316085
For n_clusters=7, The Silhouette Coefficient is 0.26001099600275107
For n_clusters=8, The Silhouette Coefficient is 0.2704655522639581
For n_clusters=9, The Silhouette Coefficient is 0.27637826276031513
For n_clusters=10, The Silhouette Coefficient is 0.27308959139801775
For n_clusters=11, The Silhouette Coefficient is 0.27139579462180874
For n_clusters=12, The Silhouette Coefficient is 0.27251489716106725
For n_clusters=13, The Silhouette Coefficient is 0.25906372048025667
For n_clusters=14, The Silhouette Coefficient is 0.2651567290041752
For n_clusters=15, The Silhouette Coefficient is 0.25826144097937176
For n_clusters=16, The Silhouette Coefficient is

Hamming Loss obtained: 0.2224229789205467
Hamming Score obtained: 0.7775770210794533
Hamming Distance obtained: 0.66726893676164
Hamming Distance Standard Deviation obtained: 1.206243709132836
Monte Carlo Simulation Iteration: 19
For n_clusters=1, The Silhouette Coefficient is 0
For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.371653028394425
For n_clusters=6, The Silhouette Coefficient is 0.26441520675915026
For n_clusters=7, The Silhouette Coefficient is 0.260540085155543
For n_clusters=8, The Silhouette Coefficient is 0.25466586371038136
For n_clusters=9, The Silhouette Coefficient is 0.2761174502715092
For n_clusters=10, The Silhouette Coefficient is 0.2731130606590093
For n_clusters=11, The Silhouette Coefficient is 0.2713797464168357
For n_clusters=12, The Silhouette Coefficient i

For n_clusters=47, The Silhouette Coefficient is 0.2415557285706742
For n_clusters=48, The Silhouette Coefficient is 0.22693184998839552
For n_clusters=49, The Silhouette Coefficient is 0.22774802876957012
For n_clusters=50, The Silhouette Coefficient is 0.2146313990544839
Best k is 4 based on highest Silhouette coefficient score: 0.3787509343305295
Family cluster majority

|    |   kmeans_label | Family          |   kmeans_family |
|---:|---------------:|:----------------|----------------:|
|  0 |              0 | Leptodactylidae |            3467 |
|  2 |              1 | Dendrobatidae   |             500 |
|  5 |              2 | Hylidae         |            1244 |
|  9 |              3 | Hylidae         |             591 |


Genus cluster majority

|    |   kmeans_label | Genus     |   kmeans_genus |
|---:|---------------:|:----------|---------------:|
|  0 |              0 | Adenomera |           3466 |
|  5 |              1 | Ameerega  |            500 |
| 11 |              2 | H

For n_clusters=25, The Silhouette Coefficient is 0.2677871039340608
For n_clusters=26, The Silhouette Coefficient is 0.26287068028935207
For n_clusters=27, The Silhouette Coefficient is 0.27278650622142453
For n_clusters=28, The Silhouette Coefficient is 0.26718633488120797
For n_clusters=29, The Silhouette Coefficient is 0.2676195590525264
For n_clusters=30, The Silhouette Coefficient is 0.2500176699165326
For n_clusters=31, The Silhouette Coefficient is 0.2586888481574702
For n_clusters=32, The Silhouette Coefficient is 0.26402516503951307
For n_clusters=33, The Silhouette Coefficient is 0.26291857499278265
For n_clusters=34, The Silhouette Coefficient is 0.24718805345943617
For n_clusters=35, The Silhouette Coefficient is 0.24681965745914636
For n_clusters=36, The Silhouette Coefficient is 0.24870085990016072
For n_clusters=37, The Silhouette Coefficient is 0.24681285635263087
For n_clusters=38, The Silhouette Coefficient is 0.22629692708373628
For n_clusters=39, The Silhouette Coef

For n_clusters=3, The Silhouette Coefficient is 0.3676927081149251
For n_clusters=4, The Silhouette Coefficient is 0.38404734963425025
For n_clusters=5, The Silhouette Coefficient is 0.371653028394425
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.26063587785642767
For n_clusters=8, The Silhouette Coefficient is 0.2701127124725302
For n_clusters=9, The Silhouette Coefficient is 0.27663637768982774
For n_clusters=10, The Silhouette Coefficient is 0.2729953145084747
For n_clusters=11, The Silhouette Coefficient is 0.270525617640974
For n_clusters=12, The Silhouette Coefficient is 0.27444866905543114
For n_clusters=13, The Silhouette Coefficient is 0.27843735229795386
For n_clusters=14, The Silhouette Coefficient is 0.26550280899622636
For n_clusters=15, The Silhouette Coefficient is 0.2710589397674448
For n_clusters=16, The Silhouette Coefficient is 0.26403596576597965
For n_clusters=17, The Silhouette Coefficient is 0.

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.3676927081149251
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.26063587785642767
For n_clusters=8, The Silhouette Coefficient is 0.270443828661163
For n_clusters=9, The Silhouette Coefficient is 0.2759782814533313
For n_clusters=10, The Silhouette Coefficient is 0.27305352114671194
For n_clusters=11, The Silhouette Coefficient is 0.2635806845433318
For n_clusters=12, The Silhouette Coefficient is 0.2743866503572291
For n_clusters=13, The Silhouette Coefficient is 0.2759432339376672
For n_clusters=14, The Silhouette Coefficient is 0.28161411732078045
For n_clusters=15, The Silhouette Coefficient is 0.27024741121359697
For n_clusters=16, The Silhouette Coefficient is 0.275

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.37867825251598397
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.2643074902054466
For n_clusters=7, The Silhouette Coefficient is 0.2712655551506004
For n_clusters=8, The Silhouette Coefficient is 0.27008857610630843
For n_clusters=9, The Silhouette Coefficient is 0.2639377372578391
For n_clusters=10, The Silhouette Coefficient is 0.2611743674212734
For n_clusters=11, The Silhouette Coefficient is 0.2798158961278641
For n_clusters=12, The Silhouette Coefficient is 0.2799703369538172
For n_clusters=13, The Silhouette Coefficient is 0.2757728174853602
For n_clusters=14, The Silhouette Coefficient is 0.25821946641995464
For n_clusters=15, The Silhouette Coefficient is 0.2660127507226213
For n_clusters=16, The Silhouette Coefficient is 0.2

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.38523395202479643
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.2680030799175728
For n_clusters=8, The Silhouette Coefficient is 0.2806337732466243
For n_clusters=9, The Silhouette Coefficient is 0.26391535328857374
For n_clusters=10, The Silhouette Coefficient is 0.27471503309341033
For n_clusters=11, The Silhouette Coefficient is 0.2716504890088418
For n_clusters=12, The Silhouette Coefficient is 0.27197745591824657
For n_clusters=13, The Silhouette Coefficient is 0.27838079474803706
For n_clusters=14, The Silhouette Coefficient is 0.264753437331088
For n_clusters=15, The Silhouette Coefficient is 0.2708627515103414
For n_clusters=16, The Silhouette Coefficient is 0.2

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.3676927081149251
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.37149147864771287
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.2677098979882448
For n_clusters=8, The Silhouette Coefficient is 0.2658526932301917
For n_clusters=9, The Silhouette Coefficient is 0.2756130465737996
For n_clusters=10, The Silhouette Coefficient is 0.2733159834026707
For n_clusters=11, The Silhouette Coefficient is 0.26447962789075646
For n_clusters=12, The Silhouette Coefficient is 0.2712256818591839
For n_clusters=13, The Silhouette Coefficient is 0.2782938275567454
For n_clusters=14, The Silhouette Coefficient is 0.2653863838206336
For n_clusters=15, The Silhouette Coefficient is 0.26914628341129226
For n_clusters=16, The Silhouette Coefficient is 0.275

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.26786472707644854
For n_clusters=8, The Silhouette Coefficient is 0.27012114209263827
For n_clusters=9, The Silhouette Coefficient is 0.27618884482168643
For n_clusters=10, The Silhouette Coefficient is 0.26227977174933337
For n_clusters=11, The Silhouette Coefficient is 0.27133115945608555
For n_clusters=12, The Silhouette Coefficient is 0.2728038459762724
For n_clusters=13, The Silhouette Coefficient is 0.2780934145655085
For n_clusters=14, The Silhouette Coefficient is 0.265255642559836
For n_clusters=15, The Silhouette Coefficient is 0.2646427458353713
For n_clusters=16, The Silhouette Coefficient is 0.2

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.260540085155543
For n_clusters=8, The Silhouette Coefficient is 0.27010471371460554
For n_clusters=9, The Silhouette Coefficient is 0.27618884482168643
For n_clusters=10, The Silhouette Coefficient is 0.26341830968055097
For n_clusters=11, The Silhouette Coefficient is 0.271470920922513
For n_clusters=12, The Silhouette Coefficient is 0.2726157420581023
For n_clusters=13, The Silhouette Coefficient is 0.27768910551276993
For n_clusters=14, The Silhouette Coefficient is 0.265212539603585
For n_clusters=15, The Silhouette Coefficient is 0.2704977947444924
For n_clusters=16, The Silhouette Coefficient is 0.2740

For n_clusters=1, The Silhouette Coefficient is 0
For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.2706777515817654
For n_clusters=8, The Silhouette Coefficient is 0.27027069466099757
For n_clusters=9, The Silhouette Coefficient is 0.27604842276388336
For n_clusters=10, The Silhouette Coefficient is 0.26105796182231916
For n_clusters=11, The Silhouette Coefficient is 0.27358653035242303
For n_clusters=12, The Silhouette Coefficient is 0.27422150050583394
For n_clusters=13, The Silhouette Coefficient is 0.2781006874869569
For n_clusters=14, The Silhouette Coefficient is 0.26554683150387887
For n_clusters=15, The Silhouette Coefficient is 0.2693139655720745


For n_clusters=50, The Silhouette Coefficient is 0.2249504521597444
Best k is 4 based on highest Silhouette coefficient score: 0.37863353074850936
Family cluster majority

|    |   kmeans_label | Family          |   kmeans_family |
|---:|---------------:|:----------------|----------------:|
|  0 |              0 | Leptodactylidae |            3467 |
|  2 |              1 | Hylidae         |            1248 |
|  6 |              2 | Dendrobatidae   |             504 |
|  9 |              3 | Hylidae         |             587 |


Genus cluster majority

|    |   kmeans_label | Genus     |   kmeans_genus |
|---:|---------------:|:----------|---------------:|
|  0 |              0 | Adenomera |           3466 |
|  5 |              1 | Hypsiboas |           1040 |
| 13 |              2 | Ameerega  |            504 |
| 19 |              3 | Hypsiboas |            540 |


Species cluster majority

|    |   kmeans_label | Species                |   kmeans_species |
|---:|---------------:|:----

For n_clusters=28, The Silhouette Coefficient is 0.2664619431089983
For n_clusters=29, The Silhouette Coefficient is 0.26100749262951495
For n_clusters=30, The Silhouette Coefficient is 0.2669833629485695
For n_clusters=31, The Silhouette Coefficient is 0.2632364631452654
For n_clusters=32, The Silhouette Coefficient is 0.2659757356597608
For n_clusters=33, The Silhouette Coefficient is 0.2595264394983023
For n_clusters=34, The Silhouette Coefficient is 0.26592036660380886
For n_clusters=35, The Silhouette Coefficient is 0.26111526164054305
For n_clusters=36, The Silhouette Coefficient is 0.25457202987323035
For n_clusters=37, The Silhouette Coefficient is 0.23991766627732436
For n_clusters=38, The Silhouette Coefficient is 0.2468437906463237
For n_clusters=39, The Silhouette Coefficient is 0.23631084359318996
For n_clusters=40, The Silhouette Coefficient is 0.26330249861955024
For n_clusters=41, The Silhouette Coefficient is 0.25388399136384443
For n_clusters=42, The Silhouette Coeffi

For n_clusters=6, The Silhouette Coefficient is 0.26447069759492176
For n_clusters=7, The Silhouette Coefficient is 0.27033697534439777
For n_clusters=8, The Silhouette Coefficient is 0.2656195560746998
For n_clusters=9, The Silhouette Coefficient is 0.261482279849612
For n_clusters=10, The Silhouette Coefficient is 0.27303791106277964
For n_clusters=11, The Silhouette Coefficient is 0.27134340023680104
For n_clusters=12, The Silhouette Coefficient is 0.272073947441997
For n_clusters=13, The Silhouette Coefficient is 0.2591031496653538
For n_clusters=14, The Silhouette Coefficient is 0.2811620300138357
For n_clusters=15, The Silhouette Coefficient is 0.27101758355219546
For n_clusters=16, The Silhouette Coefficient is 0.2535086286197109
For n_clusters=17, The Silhouette Coefficient is 0.27424572526121654
For n_clusters=18, The Silhouette Coefficient is 0.2651095713783316
For n_clusters=19, The Silhouette Coefficient is 0.28121529427360414
For n_clusters=20, The Silhouette Coefficient i

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.264008021116715
For n_clusters=7, The Silhouette Coefficient is 0.2606154509762368
For n_clusters=8, The Silhouette Coefficient is 0.2704325682630952
For n_clusters=9, The Silhouette Coefficient is 0.2639017940989184
For n_clusters=10, The Silhouette Coefficient is 0.2808543684175499
For n_clusters=11, The Silhouette Coefficient is 0.2796967971400179
For n_clusters=12, The Silhouette Coefficient is 0.27316847106149983
For n_clusters=13, The Silhouette Coefficient is 0.2602057062591137
For n_clusters=14, The Silhouette Coefficient is 0.2813911902109416
For n_clusters=15, The Silhouette Coefficient is 0.28733969088927164
For n_clusters=16, The Silhouette Coefficient is 0.252

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.3676927081149251
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.2644568489306402
For n_clusters=7, The Silhouette Coefficient is 0.2605126677525488
For n_clusters=8, The Silhouette Coefficient is 0.2547818432059273
For n_clusters=9, The Silhouette Coefficient is 0.27611839828175927
For n_clusters=10, The Silhouette Coefficient is 0.267122497595442
For n_clusters=11, The Silhouette Coefficient is 0.2715018900731722
For n_clusters=12, The Silhouette Coefficient is 0.2702285123745244
For n_clusters=13, The Silhouette Coefficient is 0.2582764013034829
For n_clusters=14, The Silhouette Coefficient is 0.2833833001373993
For n_clusters=15, The Silhouette Coefficient is 0.28901500148264064
For n_clusters=16, The Silhouette Coefficient is 0.2615

For n_clusters=2, The Silhouette Coefficient is 0.3486778410277152
For n_clusters=3, The Silhouette Coefficient is 0.36768245219926315
For n_clusters=4, The Silhouette Coefficient is 0.3787509343305295
For n_clusters=5, The Silhouette Coefficient is 0.3714828057802203
For n_clusters=6, The Silhouette Coefficient is 0.2644052337316085
For n_clusters=7, The Silhouette Coefficient is 0.26069413132392916
For n_clusters=8, The Silhouette Coefficient is 0.27043178624172043
For n_clusters=9, The Silhouette Coefficient is 0.27618665711269547
For n_clusters=10, The Silhouette Coefficient is 0.26221431590031313
For n_clusters=11, The Silhouette Coefficient is 0.27140714739639976
For n_clusters=12, The Silhouette Coefficient is 0.2737829812676105
For n_clusters=13, The Silhouette Coefficient is 0.27768922966419557
For n_clusters=14, The Silhouette Coefficient is 0.2650604211706711
For n_clusters=15, The Silhouette Coefficient is 0.2705845955103341
For n_clusters=16, The Silhouette Coefficient is 

Family cluster majority

|    |   kmeans_label | Family          |   kmeans_family |
|---:|---------------:|:----------------|----------------:|
|  0 |              0 | Hylidae         |            1245 |
|  4 |              1 | Leptodactylidae |            3467 |
|  6 |              2 | Dendrobatidae   |             500 |
|  9 |              3 | Hylidae         |             590 |


Genus cluster majority

|    |   kmeans_label | Genus     |   kmeans_genus |
|---:|---------------:|:----------|---------------:|
|  0 |              0 | Hypsiboas |           1038 |
|  8 |              1 | Adenomera |           3466 |
| 13 |              2 | Ameerega  |            500 |
| 19 |              3 | Hypsiboas |            542 |


Species cluster majority

|    |   kmeans_label | Species                |   kmeans_species |
|---:|---------------:|:-----------------------|-----------------:|
|  0 |              0 | HypsiboasCordobae      |             1018 |
| 10 |              1 | AdenomeraHylaed

In [40]:
# Banner separating the per-iteration logs from the final summary.
for banner_line in ("\n\n", "************** FINAL DATA AFTER 50 SIMULATIONS **************", "\n\n"):
    print(banner_line)

# One row per simulation, one column per collected metric.
simulation_columns = {
    'Hamming losses': hamming_loss_dict,
    'Hamming Scores': hamming_score_dict,
    'Hamming Distances': hamming_dist_dict,
    'Hamming Distance Stddev': hamming_dist_std_dict,
}
simulation_data = pd.DataFrame(simulation_columns)

print(simulation_data.to_markdown())
print("\n\n")

# Arithmetic mean of each metric across all simulations, each followed by a
# blank separator.
for label, values in (
    ("Average Hamming loss: ", hamming_loss_dict),
    ("Average Hamming Score: ", hamming_score_dict),
    ("Average Hamming Distance: ", hamming_dist_dict),
):
    print(label + str(sum(values) / len(values)))
    print("\n")

# Spread of the per-simulation Hamming distances (pandas Series.std).
distance_spread = simulation_data["Hamming Distances"].std()
print("Standard Deviation of Hamming Distance: " + str(distance_spread))




************** FINAL DATA AFTER 50 SIMULATIONS **************



|    |   Hamming losses |   Hamming Scores |   Hamming Distances |   Hamming Distance Stddev |
|---:|-----------------:|-----------------:|--------------------:|--------------------------:|
|  0 |         0.222423 |         0.777577 |            0.667269 |                   1.20624 |
|  1 |         0.222423 |         0.777577 |            0.667269 |                   1.20624 |
|  2 |         0.222423 |         0.777577 |            0.667269 |                   1.20624 |
|  3 |         0.222469 |         0.777531 |            0.667408 |                   1.20622 |
|  4 |         0.222423 |         0.777577 |            0.667269 |                   1.20624 |
|  5 |         0.222423 |         0.777577 |            0.667269 |                   1.20624 |
|  6 |         0.222284 |         0.777716 |            0.666852 |                   1.20596 |
|  7 |         0.222423 |         0.777577 |            0.667269 |           

# 3. ISLR 12.6.2

![Screen%20Shot%202022-11-08%20at%208.11.00%20PM.png](attachment:Screen%20Shot%202022-11-08%20at%208.11.00%20PM.png)

##### Answer a

![an_1.jpg](attachment:an_1.jpg)

##### Answer b

![bn_1.jpg](attachment:bn_1.jpg)

##### Answer c
Cluster 1 has 1, 2 <br>
Cluster 2 has 3, 4 <br>

##### Answer d
Cluster 1 has 1, 2, 3 <br>
Cluster 2 has 4 <br>

##### Answer e

![en_1.jpg](attachment:en_1.jpg)