In [1]:
import os
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

plt.style.use('ggplot')

# 1. Multi-class and Multi-Label Classification Using Support Vector Machines


## 1.(a) Download the Anuran Calls (MFCCs) Data Set

In [89]:
mfcc_path = "../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv"

mfcc_data = pd.read_csv(mfcc_path)

In [90]:
mfcc_data

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


**See the detail of labels**

In [91]:
mfcc_data['Family'].value_counts()

Leptodactylidae    4420
Hylidae            2165
Dendrobatidae       542
Bufonidae            68
Name: Family, dtype: int64

In [92]:
mfcc_data['Genus'].value_counts()

Adenomera        4150
Hypsiboas        1593
Ameerega          542
Dendropsophus     310
Leptodactylus     270
Scinax            148
Osteocephalus     114
Rhinella           68
Name: Genus, dtype: int64

In [93]:
mfcc_data['Species'].value_counts()

AdenomeraHylaedactylus    3478
HypsiboasCordobae         1121
AdenomeraAndre             672
Ameeregatrivittata         542
HypsiboasCinerascens       472
HylaMinuta                 310
LeptodactylusFuscus        270
ScinaxRuber                148
OsteocephalusOophagus      114
Rhinellagranulosa           68
Name: Species, dtype: int64

**Factorize the labels**

In [94]:
mfcc_data['Family'], family_labels = pd.factorize(mfcc_data['Family'])
mfcc_data['Family'].value_counts()

0    4420
2    2165
1     542
3      68
Name: Family, dtype: int64

In [95]:
mfcc_data['Genus'], genus_labels = pd.factorize(mfcc_data['Genus'])
mfcc_data['Genus'].value_counts()

0    4150
3    1593
1     542
2     310
4     270
7     148
5     114
6      68
Name: Genus, dtype: int64

In [96]:
mfcc_data['Species'], species_labels = pd.factorize(mfcc_data['Species'])
mfcc_data['Species'].value_counts()

2    3478
5    1121
0     672
1     542
4     472
3     310
6     270
9     148
7     114
8      68
Name: Species, dtype: int64

**Split dataset: 70% for training**

In [97]:
import random
num_row = len(mfcc_data)
random.seed(num_row)
train_idx = random.sample(range(num_row), int(num_row * 0.7))
test_idx = [i for i in range(num_row) if i not in train_idx]

# split train and test
mfcc_train = mfcc_data.iloc[train_idx, :].reset_index(drop=True)
mfcc_test = mfcc_data.iloc[test_idx, :].reset_index(drop=True)

# split train features and labels
train_X = mfcc_train.iloc[:, :-4]
train_Family = mfcc_train.iloc[:, -4]
train_Genus = mfcc_train.iloc[:, -3]
train_Species = mfcc_train.iloc[:, -2]

# split test features and labels
test_X = mfcc_test.iloc[:, :-4]
test_Family = mfcc_test.iloc[:, -4]
test_Genus = mfcc_test.iloc[:, -3]
test_Species = mfcc_test.iloc[:, -2]

In [98]:
mfcc_train.shape

(5036, 26)

In [99]:
mfcc_test.shape

(2159, 26)

## 1.(b) Solve a multi-class and multi-label problem

### 1.(b)-i Exact match and hamming score/ loss methods for evaluating multi- label classification

In [51]:
from sklearn.metrics import hamming_loss

def multilabelEval(title, testX, groundTruthY, classifiers):
    predictY = pd.DataFrame(columns=groundTruthY.columns)
    for label in groundTruthY.columns:
        clf = classifiers[label]
        test_pred = clf.predict(testX)
        predictY.loc[:, label] = test_pred
    print("Multilabel evaluation of {}".format(title))
    print("-" * 80)
    hamming, exact_ratio = multilableMetric(groundTruthY, predictY)
    return [hamming, exact_ratio]
    

def multilableMetric(groundTruthY, predictY):
    # hamming loss
    missclf_labels = 0
    for truth, pred in zip(groundTruthY.values, predictY.values):
        miss = (truth != pred)
        missclf_labels += np.sum(miss)
    hamming = missclf_labels / (groundTruthY.shape[0] * groundTruthY.shape[1])
    
    # exact math ratio
    exact_ratio = 0
    for truth, pred in zip(groundTruthY.values, predictY.values):
        match = (truth == pred)
        if sum(match) == groundTruthY.shape[1]:
            exact_ratio += 1;
    exact_ratio /= groundTruthY.shape[0]
    
    hamming = np.round(hamming, 4)
    exact_ratio = np.round(exact_ratio, 4)
    print("Hamming Loss : {}".format(hamming))
    print("Exact Match Ratio : {}".format(exact_ratio))
    return hamming, exact_ratio

In [12]:
# summary for the classifiers in this section
classifier_summary = {}

## 1.(b)-ii Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.svm import SVC

"""
Doc refrence:
https://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV.fit
https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
"""

def paramSearch(classifier, settings, trainX, trainY, testX, testY):
    # build and fit the grid search for given classifier
    clf = GridSearchCV(estimator=classifier, **settings)
    clf.fit(trainX, trainY)
    
    print("Grid scores on development set:\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    # output the best option
    print("\nThe best parameter setting is:")
    print(clf.best_params_, "\n")
    
    # evaluation on single-label task
    test_pred = clf.predict(testX)
    #conf_mat = confusion_matrix(testY, test_pred)
    #fscore = f1_score(testY, test_pred)
    #print("The confusion matrix of test set: \n{}".format(conf_mat))
    #print("F1-score : {.4f}".format(fscore))
    print(classification_report(testY, test_pred))
    return clf

**Gaussian SVC without standardization**

In [14]:
# dict collects classifiers
gaussianSVC_classifiers = {}

# create grid of tuned parameter
tuned_params = {'C' : np.logspace(1, 4, 4), 
                'gamma' : np.logspace(-3, 6, 10)}

# create a data splitter for cross-validation
splitter = StratifiedKFold(10, random_state=5036, shuffle=True)

# some general settings of grid search
settings = {
    'param_grid' : tuned_params, 
    'cv' : splitter,
    'scoring' : 'f1_weighted', 
    'verbose' : 1
}

In [15]:
# Label - Family : no standardization
gaussianSVC_classifiers['Family'] = paramSearch(SVC(kernel='rbf'), settings,
                                                train_X, train_Family, 
                                                test_X, test_Family)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Grid scores on development set:

0.827 (+/-0.026) for {'C': 10.0, 'gamma': 0.001}
0.930 (+/-0.016) for {'C': 10.0, 'gamma': 0.01}
0.971 (+/-0.012) for {'C': 10.0, 'gamma': 0.1}
0.991 (+/-0.007) for {'C': 10.0, 'gamma': 1.0}
0.986 (+/-0.010) for {'C': 10.0, 'gamma': 10.0}
0.791 (+/-0.040) for {'C': 10.0, 'gamma': 100.0}
0.492 (+/-0.021) for {'C': 10.0, 'gamma': 1000.0}
0.472 (+/-0.005) for {'C': 10.0, 'gamma': 10000.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 100000.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 1000000.0}
0.930 (+/-0.016) for {'C': 100.0, 'gamma': 0.001}
0.941 (+/-0.020) for {'C': 100.0, 'gamma': 0.01}
0.983 (+/-0.007) for {'C': 100.0, 'gamma': 0.1}
0.992 (+/-0.007) for {'C': 100.0, 'gamma': 1.0}
0.986 (+/-0.010) for {'C': 100.0, 'gamma': 10.0}
0.791 (+/-0.040) for {'C': 100.0, 'gamma': 100.0}
0.492 (+/-0.021) for {'C': 100.0, 'gamma': 1000.0}
0.472 (+/-0.005) for {'C': 100.0, 'gamma': 10000.0}
0.470 (+/-0.002) for {'C': 100.0, 'gamma': 100000.0}
0.470 (+/-0.002) for {'

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  5.8min finished


In [16]:
# Label - Genus  : no standardization
gaussianSVC_classifiers['Genus'] = paramSearch(SVC(kernel='rbf'), settings,
                                               train_X, train_Genus, 
                                               test_X, test_Genus)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on development set:

0.744 (+/-0.026) for {'C': 10.0, 'gamma': 0.001}
0.920 (+/-0.019) for {'C': 10.0, 'gamma': 0.01}
0.973 (+/-0.014) for {'C': 10.0, 'gamma': 0.1}
0.988 (+/-0.009) for {'C': 10.0, 'gamma': 1.0}
0.980 (+/-0.011) for {'C': 10.0, 'gamma': 10.0}
0.733 (+/-0.042) for {'C': 10.0, 'gamma': 100.0}
0.442 (+/-0.014) for {'C': 10.0, 'gamma': 1000.0}
0.425 (+/-0.004) for {'C': 10.0, 'gamma': 10000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.919 (+/-0.017) for {'C': 100.0, 'gamma': 0.001}
0.965 (+/-0.016) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.009) for {'C': 100.0, 'gamma': 0.1}
0.989 (+/-0.007) for {'C': 100.0, 'gamma': 1.0}
0.980 (+/-0.011) for {'C': 100.0, 'gamma': 10.0}
0.733 (+/-0.042) for {'C': 100.0, 'gamma': 100.0}
0.442 (+/-0.014) for {'C': 100.0, 'gamma': 1000.0}
0.425 (+/-0.004) for {'C': 100.0, 'gamma': 10000.0}
0.423 (+/-0.0

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  8.6min finished


In [17]:
# Label - Species : no standardization
gaussianSVC_classifiers['Species'] = paramSearch(SVC(kernel='rbf'), settings,
                                                 train_X, train_Species, 
                                                 test_X, test_Species)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on development set:

0.800 (+/-0.015) for {'C': 10.0, 'gamma': 0.001}
0.935 (+/-0.021) for {'C': 10.0, 'gamma': 0.01}
0.974 (+/-0.017) for {'C': 10.0, 'gamma': 0.1}
0.988 (+/-0.011) for {'C': 10.0, 'gamma': 1.0}
0.978 (+/-0.010) for {'C': 10.0, 'gamma': 10.0}
0.656 (+/-0.040) for {'C': 10.0, 'gamma': 100.0}
0.330 (+/-0.015) for {'C': 10.0, 'gamma': 1000.0}
0.314 (+/-0.005) for {'C': 10.0, 'gamma': 10000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.935 (+/-0.021) for {'C': 100.0, 'gamma': 0.001}
0.970 (+/-0.018) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.019) for {'C': 100.0, 'gamma': 0.1}
0.988 (+/-0.011) for {'C': 100.0, 'gamma': 1.0}
0.978 (+/-0.010) for {'C': 100.0, 'gamma': 10.0}
0.656 (+/-0.040) for {'C': 100.0, 'gamma': 100.0}
0.330 (+/-0.015) for {'C': 100.0, 'gamma': 1000.0}
0.314 (+/-0.005) for {'C': 100.0, 'gamma': 10000.0}
0.312 (+/-0.0

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  8.6min finished


In [18]:
# evaluation of multilabel problem
classifier_summary['GaussianSVC_wo_std'] = multilabelEval('Gaussian Kernel SVC without Standarization', 
                                                          test_X, mfcc_test.iloc[:, -4:-1], 
                                                          gaussianSVC_classifiers)

Multilabel evaluation of Gaussian Kernel SVC without Standarization
--------------------------------------------------------------------------------
Hamming Loss : 0.0096
Exact Match Ratio : 0.9852


**Gaussian SVC with standardization**

In [19]:
from sklearn.preprocessing import StandardScaler

# standardize data
std_scaler = StandardScaler()
std_train_X = std_scaler.fit_transform(train_X)
std_test_X = std_scaler.fit_transform(test_X)

In [20]:
# Label - Family : with standardization
gaussianSVC_classifiers['Family'] = paramSearch(SVC(kernel='rbf'), settings,
                                                std_train_X, train_Family, 
                                                std_test_X, test_Family)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  9.1min finished


Grid scores on development set:

0.945 (+/-0.021) for {'C': 10.0, 'gamma': 0.001}
0.988 (+/-0.005) for {'C': 10.0, 'gamma': 0.01}
0.990 (+/-0.007) for {'C': 10.0, 'gamma': 0.1}
0.905 (+/-0.023) for {'C': 10.0, 'gamma': 1.0}
0.575 (+/-0.047) for {'C': 10.0, 'gamma': 10.0}
0.473 (+/-0.004) for {'C': 10.0, 'gamma': 100.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 1000.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 10000.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 100000.0}
0.470 (+/-0.002) for {'C': 10.0, 'gamma': 1000000.0}
0.976 (+/-0.011) for {'C': 100.0, 'gamma': 0.001}
0.989 (+/-0.006) for {'C': 100.0, 'gamma': 0.01}
0.990 (+/-0.007) for {'C': 100.0, 'gamma': 0.1}
0.905 (+/-0.023) for {'C': 100.0, 'gamma': 1.0}
0.575 (+/-0.047) for {'C': 100.0, 'gamma': 10.0}
0.473 (+/-0.004) for {'C': 100.0, 'gamma': 100.0}
0.470 (+/-0.002) for {'C': 100.0, 'gamma': 1000.0}
0.470 (+/-0.002) for {'C': 100.0, 'gamma': 10000.0}
0.470 (+/-0.002) for {'C': 100.0, 'gamma': 100000.0}
0.470 (+/-0.002) for {'

In [21]:
# Label - Genus : with standardization
gaussianSVC_classifiers['Genus'] = paramSearch(SVC(kernel='rbf'), settings,
                                               std_train_X, train_Genus, 
                                               std_test_X, test_Genus)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Grid scores on development set:

0.958 (+/-0.018) for {'C': 10.0, 'gamma': 0.001}
0.986 (+/-0.008) for {'C': 10.0, 'gamma': 0.01}
0.987 (+/-0.009) for {'C': 10.0, 'gamma': 0.1}
0.857 (+/-0.020) for {'C': 10.0, 'gamma': 1.0}
0.513 (+/-0.018) for {'C': 10.0, 'gamma': 10.0}
0.426 (+/-0.006) for {'C': 10.0, 'gamma': 100.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 10000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.423 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.977 (+/-0.013) for {'C': 100.0, 'gamma': 0.001}
0.988 (+/-0.008) for {'C': 100.0, 'gamma': 0.01}
0.987 (+/-0.009) for {'C': 100.0, 'gamma': 0.1}
0.857 (+/-0.020) for {'C': 100.0, 'gamma': 1.0}
0.513 (+/-0.018) for {'C': 100.0, 'gamma': 10.0}
0.426 (+/-0.006) for {'C': 100.0, 'gamma': 100.0}
0.423 (+/-0.001) for {'C': 100.0, 'gamma': 1000.0}
0.423 (+/-0.001) for {'C': 100.0, 'gamma': 10000.0}
0.423 (+/-0.001) for {'C': 100.0, 'gamma': 100000.0}
0.423 (+/-0.001) for {'

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed: 11.5min finished


In [22]:
# Label - Species : with standardization
gaussianSVC_classifiers['Species'] = paramSearch(SVC(kernel='rbf'), settings,
                                                 std_train_X, train_Species, 
                                                 std_test_X, test_Species)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 10 folds for each of 40 candidates, totalling 400 fits
Grid scores on development set:

0.967 (+/-0.018) for {'C': 10.0, 'gamma': 0.001}
0.987 (+/-0.013) for {'C': 10.0, 'gamma': 0.01}
0.985 (+/-0.008) for {'C': 10.0, 'gamma': 0.1}
0.835 (+/-0.024) for {'C': 10.0, 'gamma': 1.0}
0.389 (+/-0.026) for {'C': 10.0, 'gamma': 10.0}
0.315 (+/-0.006) for {'C': 10.0, 'gamma': 100.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 10000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 100000.0}
0.312 (+/-0.001) for {'C': 10.0, 'gamma': 1000000.0}
0.980 (+/-0.014) for {'C': 100.0, 'gamma': 0.001}
0.987 (+/-0.012) for {'C': 100.0, 'gamma': 0.01}
0.985 (+/-0.008) for {'C': 100.0, 'gamma': 0.1}
0.835 (+/-0.024) for {'C': 100.0, 'gamma': 1.0}
0.389 (+/-0.026) for {'C': 100.0, 'gamma': 10.0}
0.315 (+/-0.006) for {'C': 100.0, 'gamma': 100.0}
0.312 (+/-0.001) for {'C': 100.0, 'gamma': 1000.0}
0.312 (+/-0.001) for {'C': 100.0, 'gamma': 10000.0}
0.312 (+/-0.0

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed: 11.6min finished


In [23]:
classifier_summary['GaussianSVC_w_std'] = multilabelEval('Gaussian Kernel SVC with Standarization', 
                                                         std_test_X, mfcc_test.iloc[:, -4:-1], 
                                                         gaussianSVC_classifiers)

Multilabel evaluation of Gaussian Kernel SVC with Standarization
--------------------------------------------------------------------------------
Hamming Loss : 0.0117
Exact Match Ratio : 0.9787


## 1.(b)-iii Repeat 1(b)ii with L1-penalized SVMs.

In [24]:
from sklearn.svm import LinearSVC
# create grid of tuned parameter
tuned_params = {'C' : np.logspace(1, 5, 10)}

# some general settings of grid search
settings = {
    'param_grid' : tuned_params, 
    'cv' : splitter,
    'scoring' : 'f1_weighted', 
    'verbose' : 1
}

L1_svm_classifiers = {}

In [25]:
# Label - Family : with standardization
L1_svm_classifiers['Family'] = paramSearch(LinearSVC(penalty='l1', dual=False), settings,
                                           std_train_X, train_Family, std_test_X, test_Family)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  2.0min finished


Grid scores on development set:

0.928 (+/-0.026) for {'C': 10.0}
0.928 (+/-0.027) for {'C': 27.825594022071243}
0.928 (+/-0.027) for {'C': 77.4263682681127}
0.928 (+/-0.027) for {'C': 215.44346900318823}
0.928 (+/-0.027) for {'C': 599.4842503189409}
0.928 (+/-0.027) for {'C': 1668.100537200059}
0.928 (+/-0.027) for {'C': 4641.588833612777}
0.928 (+/-0.027) for {'C': 12915.496650148827}
0.928 (+/-0.027) for {'C': 35938.13663804626}
0.928 (+/-0.027) for {'C': 100000.0}

The best parameter setting is:
{'C': 10.0} 

              precision    recall  f1-score   support

           0       0.95      0.98      0.96      1317
           1       0.88      0.92      0.90       154
           2       0.94      0.90      0.92       673
           3       0.00      0.00      0.00        15

    accuracy                           0.94      2159
   macro avg       0.69      0.70      0.70      2159
weighted avg       0.94      0.94      0.94      2159



In [26]:
# Label - Genus : with standardization
L1_svm_classifiers['Genus'] = paramSearch(LinearSVC(penalty='l1', dual=False), settings,
                                          std_train_X, train_Genus, std_test_X, test_Genus)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.3min finished


Grid scores on development set:

0.948 (+/-0.012) for {'C': 10.0}
0.948 (+/-0.013) for {'C': 27.825594022071243}
0.948 (+/-0.012) for {'C': 77.4263682681127}
0.947 (+/-0.012) for {'C': 215.44346900318823}
0.947 (+/-0.012) for {'C': 599.4842503189409}
0.947 (+/-0.012) for {'C': 1668.100537200059}
0.947 (+/-0.012) for {'C': 4641.588833612777}
0.947 (+/-0.012) for {'C': 12915.496650148827}
0.947 (+/-0.012) for {'C': 35938.13663804626}
0.947 (+/-0.012) for {'C': 100000.0}

The best parameter setting is:
{'C': 77.4263682681127} 

              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1240
           1       0.92      0.95      0.93       154
           2       0.92      0.67      0.78        91
           3       0.92      0.98      0.95       479
           4       0.97      0.90      0.93        77
           5       1.00      0.36      0.53        47
           6       0.91      0.67      0.77        15
           7       0.95      0.93   

In [27]:
# Label - Species : with standardization
L1_svm_classifiers['Species'] = paramSearch(LinearSVC(penalty='l1', dual=False), settings,
                                            std_train_X, train_Species, std_test_X, test_Species)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.4min finished


Grid scores on development set:

0.958 (+/-0.021) for {'C': 10.0}
0.959 (+/-0.021) for {'C': 27.825594022071243}
0.959 (+/-0.021) for {'C': 77.4263682681127}
0.959 (+/-0.022) for {'C': 215.44346900318823}
0.959 (+/-0.022) for {'C': 599.4842503189409}
0.959 (+/-0.022) for {'C': 1668.100537200059}
0.959 (+/-0.022) for {'C': 4641.588833612777}
0.959 (+/-0.022) for {'C': 12915.496650148827}
0.959 (+/-0.022) for {'C': 35938.13663804626}
0.959 (+/-0.022) for {'C': 100000.0}

The best parameter setting is:
{'C': 215.44346900318823} 

              precision    recall  f1-score   support

           0       0.88      0.94      0.91       182
           1       0.94      0.94      0.94       154
           2       0.99      1.00      0.99      1058
           3       0.92      0.71      0.80        91
           4       0.92      0.94      0.93       153
           5       0.92      0.96      0.94       326
           6       0.97      0.91      0.94        77
           7       1.00      0.49 

In [28]:
classifier_summary['SVC_L1'] = multilabelEval('SVM with L1 penalty', 
                                              std_test_X, mfcc_test.iloc[:, -4:-1], 
                                              L1_svm_classifiers)

Multilabel evaluation of SVM with L1 penalty
--------------------------------------------------------------------------------
Hamming Loss : 0.0522
Exact Match Ratio : 0.9143


## 1.(b)-iv Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance.

In [29]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

"""
Doc references:
https://imbalanced-learn.org/stable/generated/imblearn.pipeline.Pipeline.html
https://stackoverflow.com/questions/58815016/cross-validating-with-imblearn-pipeline-and-gridsearchcv
"""

# function to preform gridsearch with SMOTEin training
def smoteParamSearch(classifier, settings, trainX, trainY, testX, testY):
    naive_model = Pipeline([
        ('sampling', SMOTE()),
        ('classification', classifier)
    ])
    selected_model = paramSearch(naive_model, settings,
                                 trainX, trainY, 
                                 testX, testY)
    return selected_model

In [30]:
# create grid of tuned parameter
tuned_params = {'classification__C' : np.logspace(1, 5, 10)}

# some general settings of grid search
settings = {
    'param_grid' : tuned_params, 
    'cv' : splitter,
    'scoring' : 'f1_weighted', 
    'verbose' : 1
}

smote_svc_classifiers = {}

In [31]:
# Label - Family : with standardization, L1 penalty and SMOTE
smote_svc_classifiers['Family'] = smoteParamSearch(LinearSVC(penalty='l1', dual=False), settings,
                                                   std_train_X, train_Family, std_test_X, test_Family)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  7.8min finished


Grid scores on development set:

0.920 (+/-0.022) for {'classification__C': 10.0}
0.919 (+/-0.027) for {'classification__C': 27.825594022071243}
0.920 (+/-0.026) for {'classification__C': 77.4263682681127}
0.921 (+/-0.027) for {'classification__C': 215.44346900318823}
0.920 (+/-0.028) for {'classification__C': 599.4842503189409}
0.921 (+/-0.024) for {'classification__C': 1668.100537200059}
0.921 (+/-0.025) for {'classification__C': 4641.588833612777}
0.921 (+/-0.024) for {'classification__C': 12915.496650148827}
0.920 (+/-0.025) for {'classification__C': 35938.13663804626}
0.921 (+/-0.027) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 100000.0} 

              precision    recall  f1-score   support

           0       0.96      0.95      0.96      1317
           1       0.79      0.98      0.88       154
           2       0.95      0.87      0.91       673
           3       0.24      1.00      0.38        15

    accuracy                 

In [36]:
# Label - Genus : with standardization, L1 penalty and SMOTE
smote_svc_classifiers['Genus'] = smoteParamSearch(LinearSVC(penalty='l1', dual=False), settings,
                                                  std_train_X, train_Genus, std_test_X, test_Genus)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 23.6min finished


Grid scores on development set:

0.918 (+/-0.023) for {'classification__C': 10.0}
0.918 (+/-0.023) for {'classification__C': 27.825594022071243}
0.918 (+/-0.023) for {'classification__C': 77.4263682681127}
0.917 (+/-0.023) for {'classification__C': 215.44346900318823}
0.917 (+/-0.024) for {'classification__C': 599.4842503189409}
0.917 (+/-0.026) for {'classification__C': 1668.100537200059}
0.920 (+/-0.023) for {'classification__C': 4641.588833612777}
0.917 (+/-0.027) for {'classification__C': 12915.496650148827}
0.918 (+/-0.024) for {'classification__C': 35938.13663804626}
0.919 (+/-0.025) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 4641.588833612777} 

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1240
           1       0.82      0.92      0.87       154
           2       0.65      0.95      0.77        91
           3       0.97      0.89      0.93       479
           4       0.

In [37]:
# Label - Species : with standardization, L1 penalty and SMOTE
smote_svc_classifiers['Species'] = smoteParamSearch(LinearSVC(penalty='l1', dual=False), settings,
                                                  std_train_X, train_Species, std_test_X, test_Species)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 511.4min finished


Grid scores on development set:

0.957 (+/-0.014) for {'classification__C': 10.0}
0.957 (+/-0.018) for {'classification__C': 27.825594022071243}
0.956 (+/-0.019) for {'classification__C': 77.4263682681127}
0.958 (+/-0.014) for {'classification__C': 215.44346900318823}
0.956 (+/-0.018) for {'classification__C': 599.4842503189409}
0.958 (+/-0.016) for {'classification__C': 1668.100537200059}
0.956 (+/-0.016) for {'classification__C': 4641.588833612777}
0.958 (+/-0.017) for {'classification__C': 12915.496650148827}
0.958 (+/-0.015) for {'classification__C': 35938.13663804626}
0.955 (+/-0.019) for {'classification__C': 100000.0}

The best parameter setting is:
{'classification__C': 35938.13663804626} 

              precision    recall  f1-score   support

           0       0.96      0.92      0.94       182
           1       0.94      0.89      0.92       154
           2       0.99      1.00      0.99      1058
           3       0.83      0.91      0.87        91
           4       0.

In [52]:
classifier_summary['SVC_L1_SMOTE'] = multilabelEval('SVM with L1 penalty and SMOTE', 
                                                    std_test_X, mfcc_test.iloc[:, -4:-1], 
                                                    smote_svc_classifiers)

Multilabel evaluation of SVM with L1 penalty and SMOTE
--------------------------------------------------------------------------------
Hamming Loss : 0.0715
Exact Match Ratio : 0.8578


In [46]:
classifier_summary['metrics'] = ['hamming loss', 'exact match ratio']
summary = pd.DataFrame.from_dict(classifier_summary)
summary.set_index('metrics', inplace=True)
summary

Unnamed: 0_level_0,GaussianSVC_wo_std,GaussianSVC_w_std,SVC_L1,SVC_L1_SMOTE
metrics,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hamming loss,0.0096,0.0117,0.0522,0.0715
exact match ratio,0.9852,0.9787,0.9143,0.8578


# 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set

In [178]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

"""
Doc references:
https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
"""

def getOptimalK(num_cluster, X, rand):
    '''
    funciton to find optimal K
    @params
        num_cluster : integer to set the upper bound of possible number of cluster
        X : feature dataframe
    @return
        the optimal K with best silhouette score
    '''
    optimalK, max_score = 2, 0
    for n in range(2, num_cluster + 1):
        clusterer = KMeans(n_clusters=n, random_state=rand)
        cluster_labels = clusterer.fit_predict(X)
        silhouette_avg = silhouette_score(X, cluster_labels)
        #print("For n_clusters =", n, "The average silhouette_score is :", silhouette_avg)
        if silhouette_avg > max_score:
            optimalK = n
            max_score = silhouette_avg
    print("\nThe optimal K is: {}".format(optimalK))
    return optimalK

def getMajorityLabels(optimalK, cluster_labels, Y):
    '''
    function to get majority labels for each cluster
    @params
        optimalK : integer represents optimal K
        cluster_labels : array of cluster indecies of samples
        Y : (n_samples, n_labels) dataframe
    @return
        a (n_clusters, n_labels) dataframe
    '''
    cluster_major = pd.DataFrame(columns=Y.columns)
    for c in range(optimalK):
        idx, = np.where(cluster_labels == c)
        cluster_samples = Y.iloc[idx, :]
        row = []
        for label in Y.columns:
            # get the value counts of a type of label and get the first which is the most
            cur_major = cluster_samples.loc[:, label].value_counts().index[0]
            #print(label, cur_major)
            row.append(cur_major)
        cluster_major.loc[c] = row
    return cluster_major

def evaluation(cluster_major, cluster_labels, Y):
    '''
    function of evaluation: Hamming Distance/Loss
    @params
        cluster_major : (n_clusters, n_labels) dataframe
        cluster_labels : array of cluster indecies of samples
        Y : (n_samples, n_labels) dataframe
    @return
        hamming distance and hamming loss
    '''
    # variable to count miss-classified label
    missclf_labels = 0
    # for each (1, 3) vector, compare it with cluster major
    for c in range(len(cluster_major)):
        idx, = np.where(cluster_labels == c)
        for label in Y.loc[idx].values:
            miss = (label != cluster_major.loc[c].values)
            missclf_labels += np.sum(miss)
    # note that the hamming distance should be get average ove samples rather than labels
    hamming_dist = missclf_labels / Y.shape[0]
    hamming_loss = missclf_labels / (Y.shape[0] * Y.shape[1])
    return hamming_dist, hamming_loss
    
def monteCarlo(times, X, Y):
    '''
    function of evaluation: Hamming Distance/Loss
    @params
        cluster_major : (n_clusters, n_labels) dataframe
        cluster_labels : array of cluster indecies of samples
        Y : (n_samples, n_labels) dataframe
    @return
        hamming distance and hamming loss
    '''
    hamming_dist = []
    hamming_loss = []
    for i in range(times):
        # get optimal K for clustering
        optimalK = getOptimalK(50, X, i)
        
        # build kmeans clusterer
        clusterer = KMeans(n_clusters=optimalK, random_state=i)
        cluster_labels = clusterer.fit_predict(X)
        
        # get majority label
        cluster_major = getMajorityLabels(optimalK, cluster_labels, Y)
        cluster_major = cluster_major.astype('int64')
        
        # get Hamming distance and Hamming loss
        cur_dist, cur_loss = evaluation(cluster_major, cluster_labels, Y)
        hamming_dist.append(cur_dist)
        hamming_loss.append(cur_loss)
        print("Attempt {} ~ Hamming Distance : {}, Hamming Loss {}".format(i+1, cur_dist, cur_loss))
    return hamming_dist, hamming_loss

In [179]:
hamming_dist, hamming_loss = monteCarlo(50, mfcc_data.iloc[:, :-4], mfcc_data.iloc[:, -4:-1])


The optimal K is: 4
Attempt 1 ~ Hamming Distance : 0.6653231410701876, Hamming Loss 0.22177438035672922

The optimal K is: 4
Attempt 2 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 3 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 4 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 5 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 6 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 7 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 8 ~ Hamming Distance : 0.66726893676164, Hamming Loss 0.2224229789205467

The optimal K is: 4
Attempt 9 ~ Hamming Distance : 0.7357887421820709, Hamming Loss 0.24526291406069028

The optimal K is: 4
Attempt 10 ~ Hamming Distance : 0.66726893676164, Hammi

In [181]:
hamming_dist, hamming_loss = np.array(hamming_dist), np.array(hamming_loss)

In [183]:
print("The average Hamming Distance is {}".format(np.mean(hamming_dist)))
print("The standard deviation of simulation result is {}".format(np.std(hamming_dist)))

The average Hamming Distance is 0.6723613620569838
The standard deviation of simulation result is 0.01723837650073444


In [184]:
print("The average Hamming Loss is {}".format(np.mean(hamming_loss)))
print("The standard deviation of simulation results is {}".format(np.std(hamming_loss)))

The average Hamming Loss is 0.2241204540189947
The standard deviation of simulation results is 0.005746125500244805


# 3. ISLR 10.7.2


(a) Denrogram with complete linkage

![dendrogram-a](dendrogram-a.png)


(b) Denrogram with single linkage
![dendrogram-b](dendrogram-b.png)

(c) We get two clusters: {1, 2} and {3, 4}.

(d) We get two clusters: {1, 2, 3} and {4}

(e) Denrogram equivalent to the one in (a)
![dendrogram-e](dendrogram-e.png)