In [1]:
import pandas as pd
import numpy as np
import operator
import statistics
import matplotlib.pyplot as plt
from beautifultable import BeautifulTable
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import hamming_loss, silhouette_score, accuracy_score
from imblearn.over_sampling import SMOTE
from scipy.cluster import hierarchy
import warnings
warnings.filterwarnings("ignore")

In [2]:
def printDatainTable(tableHeaders, tableData):
    table = BeautifulTable(max_width=100)
    table.set_style(BeautifulTable.STYLE_SEPARATED)
    if len(tableHeaders)!=0:
        table.column_headers = tableHeaders
    for i in range(0, len(tableData)):
        table.append_row(tableData[i])
    print(table)

In [3]:
datasetURL = 'https://raw.githubusercontent.com/SurbhiJainUSC/Multiclass-and-Multilabel-Classification/master/Frogs_MFCCs.csv'

featureNames = []
for i in range(1, 23):
    if i<10:
        featureName = 'MFCCs_ ' + str(i)
    else:
        featureName = 'MFCCs_' + str(i)
    featureNames.append(featureName)
        
labels = ['Family', 'Genus', 'Species']
familyClasses = set()
genusClasses = set()
speciesClasses = set()

# Read data and randomly choose 70% data for training.

In [4]:
print('----------------------------------------------------------------------------')
data = pd.read_csv(datasetURL)
dataX = data[featureNames]
dataY = data[labels]
print('Total observations:', data.shape[0])
print('Total features:', len(featureNames))
print('----------------------------------------------------------------------------')

trainData, testData = train_test_split(data, test_size=0.3)

trainDataX = trainData[featureNames]
trainDataY = trainData[labels]
print('Total observations in training data:', trainDataX.shape[0])
print('First 5 observations in training data:')
print(trainData.head(5))
print('----------------------------------------------------------------------------')

testDataX = testData[featureNames]
testDataY = testData[labels]
print('Total observations in testing data:', testDataX.shape[0])
print('First 5 observations in testing data:')
print(testData.head(5))
print('----------------------------------------------------------------------------')

----------------------------------------------------------------------------
Total observations: 7195
Total features: 22
----------------------------------------------------------------------------
Total observations in training data: 5036
First 5 observations in training data:
      MFCCs_ 1  MFCCs_ 2  MFCCs_ 3  MFCCs_ 4  MFCCs_ 5  MFCCs_ 6  MFCCs_ 7  \
5162  1.000000  0.602752  0.599761  0.337133  0.023465  0.174081  0.029252   
7065  1.000000  0.021356  0.712023  0.540959  0.006561 -0.012396 -0.039906   
1636  0.447688  1.000000  0.025604  0.275647 -0.094225 -0.003478 -0.016021   
5311  1.000000  0.816068  0.747207  0.300618 -0.032464  0.254683  0.017812   
4536  1.000000  0.226668  0.328344  0.705906  0.125741  0.013133 -0.100290   

      MFCCs_ 8  MFCCs_ 9  MFCCs_10  ...  MFCCs_17  MFCCs_18  MFCCs_19  \
5162  0.165887  0.112901 -0.341206  ... -0.118965  0.037100  0.036765   
7065  0.206433  0.224516 -0.132646  ...  0.037481  0.024981 -0.023636   
1636  0.091354 -0.001657 -0.08402

In [5]:
for i in range(0, len(data)):
    familyClass = data['Family'][i]
    familyClasses.add(familyClass)
    
    genusClass = data['Genus'][i]
    genusClasses.add(genusClass)
    
    speciesClass = data['Species'][i]
    speciesClasses.add(speciesClass)

print('------------------------------------------------------------------------------------------------------------')
print('Classes in Family:', ', '.join(familyClasses))
print('------------------------------------------------------------------------------------------------------------')
print('Classes in Genus:', ', '.join(genusClasses))
print('------------------------------------------------------------------------------------------------------------')
print('Classes in Species:', ', '.join(speciesClasses))
print('------------------------------------------------------------------------------------------------------------')

------------------------------------------------------------------------------------------------------------
Classes in Family: Bufonidae, Dendrobatidae, Hylidae, Leptodactylidae
------------------------------------------------------------------------------------------------------------
Classes in Genus: Dendropsophus, Scinax, Rhinella, Leptodactylus, Adenomera, Osteocephalus, Ameerega, Hypsiboas
------------------------------------------------------------------------------------------------------------
Classes in Species: AdenomeraAndre, Ameeregatrivittata, LeptodactylusFuscus, HypsiboasCinerascens, Rhinellagranulosa, OsteocephalusOophagus, AdenomeraHylaedactylus, HylaMinuta, HypsiboasCordobae, ScinaxRuber
------------------------------------------------------------------------------------------------------------


# Exact Match Loss and Hamming Loss.
**Exact Match Score:** In multilabel classification, Exact Match Score defines the subset accuracy, i.e. the number of observations for which the actual set of labels correspond to the predicted set of labels for that obervation. In other words, if the entire set of predicted labels for a sample strictly match the actual set of labels, then the subset accuracy will be 1.0, otherwise it is 0.0. <br/>
**Hamming Loss:** Hamming Loss computes the average hamming distance between two set of samples. It is the fraction of observations for which labels are not predicted properly. In other words, if an observation has 3 labels, and 2 out of these 3 labels are not predicted correctly, then the hamming distance is 2. <br/>

# SVM Classifier for each label without standardized features.

In [6]:
CRange = np.logspace(-1, 4, 20)
gammaRange = np.linspace(0.1, 2, 20)

params = {'estimator__gamma':gammaRange, 'estimator__C':CRange}
svmModel = OneVsRestClassifier(SVC(kernel='rbf', tol=0.1))
gridModel = GridSearchCV(svmModel, param_grid=params, cv=KFold(10))

In [7]:
print('Label: Family')
print('---------------------------------------')
familySVMModel = gridModel.fit(trainDataX, trainDataY['Family'])

familyBestParams = familySVMModel.best_params_
familyBestC = familyBestParams['estimator__C']
familyBestC = round(familyBestC, 3)
familyBestGamma = familyBestParams['estimator__gamma']
familyBestGamma = round(familyBestGamma, 3)
familyBestWidth = round(1/np.sqrt(2*familyBestGamma), 3)
print('Best SVM Penalty:', familyBestC)
print('Best Width of Gaussian Kernel:', familyBestWidth)

predictedTestFamily = familySVMModel.predict(testDataX)
exactFamilyScore = accuracy_score(predictedTestFamily, testDataY['Family'])
exactFamilyLoss = round(1-exactFamilyScore, 3)

familyHammingLoss = hamming_loss(predictedTestFamily, testDataY['Family'])
familyHammingLoss = round(familyHammingLoss, 3)

print('Exact Match Loss:', exactFamilyLoss)
print('Hamming Loss:', familyHammingLoss)
print('---------------------------------------')

Label: Family
---------------------------------------
Best SVM Penalty: 42.813
Best Width of Gaussian Kernel: 0.513
Exact Match Loss: 0.009
Hamming Loss: 0.009
---------------------------------------


In [8]:
print('Label: Genus')
print('---------------------------------------')
genusSVMModel = gridModel.fit(trainDataX, trainDataY['Genus'])

genusBestParams = genusSVMModel.best_params_
genusBestC = genusBestParams['estimator__C']
genusBestC = round(genusBestC, 3)
genusBestGamma = genusBestParams['estimator__gamma']
genusBestGamma = round(genusBestGamma, 3)
genusBestWidth = round(1/np.sqrt(2*genusBestGamma), 3)
print('Best SVM Penalty:', genusBestC)
print('Best Width of Gaussian Kernel:', genusBestWidth)

predictedTestGenus = genusSVMModel.predict(testDataX)
exactGenusScore = accuracy_score(predictedTestGenus, testDataY['Genus'])
exactGenusLoss = round(1-exactGenusScore, 3)

genusHammingLoss = hamming_loss(predictedTestGenus, testDataY['Genus'])
genusHammingLoss = round(genusHammingLoss, 3)

print('Exact Match Loss:', exactGenusLoss)
print('Hamming Loss:', genusHammingLoss)
print('---------------------------------------')

Label: Genus
---------------------------------------
Best SVM Penalty: 6.952
Best Width of Gaussian Kernel: 0.5
Exact Match Loss: 0.009
Hamming Loss: 0.009
---------------------------------------


In [9]:
print('Label: Species')
print('---------------------------------------')
speciesSVMModel = gridModel.fit(trainDataX, trainDataY['Species'])

speciesBestParams = speciesSVMModel.best_params_
speciesBestC = speciesBestParams['estimator__C']
speciesBestC = round(speciesBestC, 3)
speciesBestGamma = speciesBestParams['estimator__gamma']
speciesBestGamma = round(speciesBestGamma, 3)
speciesBestWidth = round(1/np.sqrt(2*speciesBestGamma), 3)
print('Best SVM Penalty:', speciesBestC)
print('Best Width of Gaussian Kernel:', speciesBestWidth)

predictedTestSpecies = speciesSVMModel.predict(testDataX)
exactSpeciesScore = accuracy_score(predictedTestSpecies, testDataY['Species'])
exactSpeciesLoss = round(1-exactSpeciesScore, 3)

speciesHammingLoss = hamming_loss(predictedTestSpecies, testDataY['Species'])
speciesHammingLoss = round(speciesHammingLoss, 3)

print('Exact Match Loss:', exactSpeciesLoss)
print('Hamming Loss:', speciesHammingLoss)
print('---------------------------------------')

Label: Species
---------------------------------------
Best SVM Penalty: 12.743
Best Width of Gaussian Kernel: 0.5
Exact Match Loss: 0.009
Hamming Loss: 0.009
---------------------------------------


In [10]:
svmAvgHammingLoss = (familyHammingLoss + genusHammingLoss + speciesHammingLoss)/3
svmAvgHammingLoss = round(svmAvgHammingLoss, 3)

cnt = 0
for i in range(0, len(testDataY)):
    if (testDataY['Family'].ravel()[i]==predictedTestFamily[i] and 
        testDataY['Genus'].ravel()[i]==predictedTestGenus[i] and 
        testDataY['Species'].ravel()[i]==predictedTestSpecies[i]):
        cnt += 1
svmAvgExactScore = float(cnt)/len(testDataY)
svmAvgExactLoss = round(1-svmAvgExactScore, 3)

print('---------------------------------------')
print('Average Exact Match Loss:', svmAvgExactLoss)
print('Average Hamming Loss:', svmAvgHammingLoss)
print('---------------------------------------')

---------------------------------------
Average Exact Match Loss: 0.014
Average Hamming Loss: 0.009
---------------------------------------


# SVM Classifier for each label with standardized features.

In [11]:
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(dataX)
scaledTrainDataX = scaler.transform(trainDataX)
scaledTestDataX = scaler.transform(testDataX)

In [12]:
print('Label: Family')
print('---------------------------------------')
scaledFamilySVMModel = gridModel.fit(scaledTrainDataX, trainDataY['Family'])

scaledFamilyBestParams = scaledFamilySVMModel.best_params_
scaledFamilyBestC = scaledFamilyBestParams['estimator__C']
scaledFamilyBestC = round(scaledFamilyBestC, 3)
scaledFamilyBestGamma = scaledFamilyBestParams['estimator__gamma']
scaledFamilyBestGamma = round(scaledFamilyBestGamma, 3)
scaledFamilyBestWidth = round(1/np.sqrt(2*scaledFamilyBestGamma), 3)
print('Best SVM Penalty:', scaledFamilyBestC)
print('Best Width of Gaussian Kernel:', scaledFamilyBestWidth)

scaledPredictedTestFamily = scaledFamilySVMModel.predict(scaledTestDataX)
scaledExactFamilyScore = accuracy_score(scaledPredictedTestFamily, testDataY['Family'])
scaledExactFamilyLoss = round(1-scaledExactFamilyScore, 3)

scaledFamilyHammingLoss = hamming_loss(scaledPredictedTestFamily, testDataY['Family'])
scaledFamilyHammingLoss = round(scaledFamilyHammingLoss, 3)

print('Exact Match Loss:', scaledExactFamilyLoss)
print('Hamming Loss:', scaledFamilyHammingLoss)
print('---------------------------------------')

Label: Family
---------------------------------------
Best SVM Penalty: 3.793
Best Width of Gaussian Kernel: 2.236
Exact Match Loss: 0.008
Hamming Loss: 0.008
---------------------------------------


In [13]:
print('Label: Genus')
print('---------------------------------------')
scaledGenusSVMModel = gridModel.fit(scaledTrainDataX, trainDataY['Genus'])

scaledGenusBestParams = scaledGenusSVMModel.best_params_
scaledGenusBestC = scaledGenusBestParams['estimator__C']
scaledGenusBestC = round(scaledGenusBestC, 3)
scaledGenusBestGamma = scaledGenusBestParams['estimator__gamma']
scaledGenusBestGamma = round(scaledGenusBestGamma, 3)
scaledGenusBestWidth = round(1/np.sqrt(2*scaledGenusBestGamma), 3)
print('Best SVM Penalty:', scaledGenusBestC)
print('Best Width of Gaussian Kernel:', scaledGenusBestWidth)

scaledPredictedTestGenus = scaledGenusSVMModel.predict(scaledTestDataX)
scaledExactGenusScore = accuracy_score(scaledPredictedTestGenus, testDataY['Genus'])
scaledExactGenusLoss = round(1-scaledExactGenusScore, 3)

scaledGenusHammingLoss = hamming_loss(scaledPredictedTestGenus, testDataY['Genus'])
scaledGenusHammingLoss = round(scaledGenusHammingLoss, 3)

print('Exact Match Loss:', scaledExactGenusLoss)
print('Hamming Loss:', scaledGenusHammingLoss)
print('---------------------------------------')

Label: Genus
---------------------------------------
Best SVM Penalty: 2.069
Best Width of Gaussian Kernel: 2.236
Exact Match Loss: 0.009
Hamming Loss: 0.009
---------------------------------------


In [14]:
print('Label: Species')
print('---------------------------------------')
scaledSpeciesSVMModel = gridModel.fit(scaledTrainDataX, trainDataY['Species'])

scaledSpeciesBestParams = scaledSpeciesSVMModel.best_params_
scaledSpeciesBestC = scaledSpeciesBestParams['estimator__C']
scaledSpeciesBestC = round(scaledSpeciesBestC, 3)
scaledSpeciesBestGamma = scaledSpeciesBestParams['estimator__gamma']
scaledSpeciesBestGamma = round(scaledSpeciesBestGamma, 3)
scaledSpeciesBestWidth = round(1/np.sqrt(2*scaledSpeciesBestGamma), 3)
print('Best SVM Penalty:', scaledSpeciesBestC)
print('Best Width of Gaussian Kernel:', scaledSpeciesBestWidth)

scaledPredictedTestSpecies = scaledSpeciesSVMModel.predict(scaledTestDataX)
scaledExactSpeciesScore = accuracy_score(scaledPredictedTestSpecies, testDataY['Species'])
scaledExactSpeciesLoss = round(1-scaledExactSpeciesScore, 3)

scaledSpeciesHammingLoss = hamming_loss(scaledPredictedTestSpecies, testDataY['Species'])
scaledSpeciesHammingLoss = round(scaledSpeciesHammingLoss, 3)

print('Exact Match Loss:', scaledExactSpeciesLoss)
print('Hamming Loss:', scaledSpeciesHammingLoss)
print('---------------------------------------')

Label: Species
---------------------------------------
Best SVM Penalty: 3.793
Best Width of Gaussian Kernel: 2.236
Exact Match Loss: 0.011
Hamming Loss: 0.011
---------------------------------------


In [15]:
scaledSVMAvgHammingLoss = (scaledFamilyHammingLoss + scaledGenusHammingLoss + scaledSpeciesHammingLoss)/3
scaledSVMAvgHammingLoss = round(scaledSVMAvgHammingLoss, 3)

cnt = 0
for i in range(0, len(testDataY)):
    if (testDataY['Family'].ravel()[i]==scaledPredictedTestFamily[i] and 
        testDataY['Genus'].ravel()[i]==scaledPredictedTestGenus[i] and 
        testDataY['Species'].ravel()[i]==scaledPredictedTestSpecies[i]):
        cnt += 1
scaledSVMAvgExactScore = float(cnt)/len(testDataY)
scaledSVMAvgExactLoss = round(1-scaledSVMAvgExactScore, 3)

print('Average Exact Match Loss:', scaledSVMAvgExactLoss)
print('Average Hamming Loss:', scaledSVMAvgHammingLoss)

Average Exact Match Loss: 0.013
Average Hamming Loss: 0.009


# L1-Penalized SVM for each label.

In [16]:
l1SVMModel = LinearSVC(penalty='l1', dual=False)
l1Parameters = {'C':CRange}
l1GridModel = GridSearchCV(l1SVMModel, l1Parameters, cv=KFold(10))

In [17]:
print('Label: Family')
print('---------------------------------------')
familyL1SVMModel = l1GridModel.fit(trainDataX, trainDataY['Family'])

familyL1BestParams = familyL1SVMModel.best_params_
familyL1BestC = familyL1BestParams['C']
familyL1BestC = round(familyL1BestC, 3)
print('Best SVM Penalty:', familyL1BestC)

L1PredictedTestFamily = familyL1SVMModel.predict(testDataX)
L1ExactFamilyScore = accuracy_score(L1PredictedTestFamily, testDataY['Family'])
L1ExactFamilyLoss = round(1-L1ExactFamilyScore, 3)

L1FamilyHammingLoss = hamming_loss(L1PredictedTestFamily, testDataY['Family'])
L1FamilyHammingLoss = round(L1FamilyHammingLoss, 3)

print('Exact Match Loss:', L1ExactFamilyLoss)
print('Hamming Loss:', L1FamilyHammingLoss)
print('---------------------------------------')

Label: Family
---------------------------------------
Best SVM Penalty: 143.845
Exact Match Loss: 0.065
Hamming Loss: 0.065
---------------------------------------


In [18]:
print('Label: Genus')
print('---------------------------------------')
genusL1SVMModel = l1GridModel.fit(trainDataX, trainDataY['Genus'])

genusL1BestParams = genusL1SVMModel.best_params_
genusL1BestC = genusL1BestParams['C']
genusL1BestC = round(genusL1BestC, 3)
print('Best SVM Penalty:', genusL1BestC)

L1PredictedTestGenus = genusL1SVMModel.predict(testDataX)
L1ExactGenusScore = accuracy_score(L1PredictedTestGenus, testDataY['Genus'])
L1ExactGenusLoss = round(1-L1ExactGenusScore, 3)

L1GenusHammingLoss = hamming_loss(L1PredictedTestGenus, testDataY['Genus'])
L1GenusHammingLoss = round(L1GenusHammingLoss, 3)

print('Exact Match Loss:', L1ExactGenusLoss)
print('Hamming Loss:', L1GenusHammingLoss)
print('---------------------------------------')

Label: Genus
---------------------------------------
Best SVM Penalty: 1623.777
Exact Match Loss: 0.047
Hamming Loss: 0.047
---------------------------------------


In [19]:
print('Label: Species')
print('---------------------------------------')
speciesL1SVMModel = l1GridModel.fit(trainDataX, trainDataY['Species'])

speciesL1BestParams = speciesL1SVMModel.best_params_
speciesL1BestC = speciesL1BestParams['C']
speciesL1BestC = round(speciesL1BestC, 3)
print('Best SVM Penalty:', speciesL1BestC)

L1PredictedTestSpecies = speciesL1SVMModel.predict(testDataX)
L1ExactSpeciesScore = accuracy_score(L1PredictedTestSpecies, testDataY['Species'])
L1ExactSpeciesLoss = round(1-L1ExactSpeciesScore, 3)

L1SpeciesHammingLoss = hamming_loss(L1PredictedTestSpecies, testDataY['Species'])
L1SpeciesHammingLoss = round(L1SpeciesHammingLoss, 3)

print('Exact Match Loss:', L1ExactSpeciesLoss)
print('Hamming Loss:', L1SpeciesHammingLoss)
print('---------------------------------------')

Label: Species
---------------------------------------
Best SVM Penalty: 42.813
Exact Match Loss: 0.038
Hamming Loss: 0.038
---------------------------------------


In [20]:
L1AvgHammingLoss = (L1FamilyHammingLoss + L1GenusHammingLoss + L1SpeciesHammingLoss)/3
L1AvgHammingLoss = round(L1AvgHammingLoss, 3)

cnt = 0
for i in range(0, len(testDataY)):
    if (testDataY['Family'].ravel()[i]==L1PredictedTestFamily[i] and 
        testDataY['Genus'].ravel()[i]==L1PredictedTestGenus[i] and 
        testDataY['Species'].ravel()[i]==L1PredictedTestSpecies[i]):
        cnt += 1
L1AvgExactScore = float(cnt)/len(testDataY)
L1AvgExactLoss = round(1-L1AvgExactScore, 3)

print('Average Exact Match Loss:', L1AvgExactLoss)
print('Average Hamming Loss:', L1AvgHammingLoss)

Average Exact Match Loss: 0.087
Average Hamming Loss: 0.05


# L1-penalized SVM with SMOTE for class imbalance.

In [21]:
smote = SMOTE()
familyTrainDataX, familyTrainDataY = smote.fit_sample(trainDataX, trainDataY['Family'])
genusTrainDataX, genusTrainDataY = smote.fit_sample(trainDataX, trainDataY['Genus'])
speciesTrainDataX, speciesTrainDataY = smote.fit_sample(trainDataX, trainDataY['Species'])

print('---------------------------------------------------------')
print('Before over-sampling, number of training samples:', trainDataX.shape[0])
print('---------------------------------------------------------')
print('After over-sampling, number of training samples:', familyTrainDataX.shape[0])
print('---------------------------------------------------------')

---------------------------------------------------------
Before over-sampling, number of training samples: 5036
---------------------------------------------------------
After over-sampling, number of training samples: 12544
---------------------------------------------------------


In [22]:
l1SmoteSVMModel = LinearSVC(penalty='l1', dual=False)
l1SmoteParameters = {'C':CRange}
l1SmoteGridModel = GridSearchCV(l1SmoteSVMModel, l1SmoteParameters, cv=KFold(10))

In [23]:
print('Label: Family')
print('---------------------------------------')
familyL1SmoteSVMModel = l1SmoteGridModel.fit(familyTrainDataX, familyTrainDataY)

familyL1SmoteBestParams = familyL1SmoteSVMModel.best_params_
familyL1SmoteBestC = familyL1SmoteBestParams['C']
familyL1SmoteBestC = round(familyL1SmoteBestC, 3)
print('Best SVM Penalty:', familyL1SmoteBestC)

L1SmotePredictedTestFamily = familyL1SmoteSVMModel.predict(testDataX)
L1SmoteExactFamilyScore = accuracy_score(L1SmotePredictedTestFamily, testDataY['Family'])
L1SmoteExactFamilyLoss = round(1-L1SmoteExactFamilyScore, 3)

L1SmoteFamilyHammingLoss = hamming_loss(L1SmotePredictedTestFamily, testDataY['Family'])
L1SmoteFamilyHammingLoss = round(L1SmoteFamilyHammingLoss, 3)

print('Exact Match Loss:', L1SmoteExactFamilyLoss)
print('Hamming Loss:', L1SmoteFamilyHammingLoss)
print('---------------------------------------')

Label: Family
---------------------------------------
Best SVM Penalty: 143.845
Exact Match Loss: 0.089
Hamming Loss: 0.089
---------------------------------------


In [24]:
print('Label: Genus')
print('---------------------------------------')
genusL1SmoteSVMModel = l1SmoteGridModel.fit(genusTrainDataX, genusTrainDataY)

genusL1SmoteBestParams = genusL1SmoteSVMModel.best_params_
genusL1SmoteBestC = genusL1SmoteBestParams['C']
genusL1SmoteBestC = round(genusL1SmoteBestC, 3)
print('Best SVM Penalty:', genusL1SmoteBestC)

L1SmotePredictedTestGenus = genusL1SmoteSVMModel.predict(testDataX)
L1SmoteExactGenusScore = accuracy_score(L1SmotePredictedTestGenus, testDataY['Genus'])
L1SmoteExactGenusLoss = round(1-L1SmoteExactGenusScore, 3)

L1SmoteGenusHammingLoss = hamming_loss(L1SmotePredictedTestGenus, testDataY['Genus'])
L1SmoteGenusHammingLoss = round(L1SmoteGenusHammingLoss, 3)

print('Exact Match Loss:', L1SmoteExactGenusLoss)
print('Hamming Loss:', L1SmoteGenusHammingLoss)
print('---------------------------------------')

Label: Genus
---------------------------------------
Best SVM Penalty: 263.665
Exact Match Loss: 0.082
Hamming Loss: 0.082
---------------------------------------


In [25]:
print('Label: Species')
print('---------------------------------------')
speciesL1SmoteSVMModel = l1SmoteGridModel.fit(speciesTrainDataX, speciesTrainDataY)

speciesL1SmoteBestParams = speciesL1SmoteSVMModel.best_params_
speciesL1SmoteBestC = speciesL1SmoteBestParams['C']
speciesL1SmoteBestC = round(speciesL1SmoteBestC, 3)
print('Best SVM Penalty:', speciesL1SmoteBestC)

L1SmotePredictedTestSpecies = speciesL1SmoteSVMModel.predict(testDataX)
L1SmoteExactSpeciesScore = accuracy_score(L1SmotePredictedTestSpecies, testDataY['Species'])
L1SmoteExactSpeciesLoss = round(1-L1SmoteExactSpeciesScore, 3)

L1SmoteSpeciesHammingLoss = hamming_loss(L1SmotePredictedTestSpecies, testDataY['Species'])
L1SmoteSpeciesHammingLoss = round(L1SmoteSpeciesHammingLoss, 3)

print('Exact Match Loss:', L1SmoteExactSpeciesLoss)
print('Hamming Loss:', L1SmoteSpeciesHammingLoss)
print('---------------------------------------')

Label: Species
---------------------------------------
Best SVM Penalty: 1623.777
Exact Match Loss: 0.044
Hamming Loss: 0.044
---------------------------------------


In [26]:
L1SmoteAvgHammingLoss = (L1SmoteFamilyHammingLoss + L1SmoteGenusHammingLoss + L1SmoteSpeciesHammingLoss)/3
L1SmoteAvgHammingLoss = round(L1SmoteAvgHammingLoss, 3)

cnt = 0
for i in range(0, len(testDataY)):
    if (testDataY['Family'].ravel()[i]==L1SmotePredictedTestFamily[i] and 
        testDataY['Genus'].ravel()[i]==L1SmotePredictedTestGenus[i] and 
        testDataY['Species'].ravel()[i]==L1SmotePredictedTestSpecies[i]):
        cnt += 1
L1SmoteAvgExactScore = float(cnt)/len(testDataY)
L1SmoteAvgExactLoss = round(1-L1SmoteAvgExactScore, 3)

print('Average Exact Match Loss:', L1SmoteAvgExactLoss)
print('Average Hamming Loss:', L1SmoteAvgHammingLoss)

Average Exact Match Loss: 0.145
Average Hamming Loss: 0.072


In [27]:
tableData = []
row = []
row.append('Gaussian Kernel SVM')
row.append(svmAvgExactLoss)
row.append(svmAvgHammingLoss)
tableData.append(row)
row = []
row.append('Gaussian Kernel SVM (Standardized Features)')
row.append(scaledSVMAvgExactLoss)
row.append(scaledSVMAvgHammingLoss)
tableData.append(row)
row = []
row.append('L1-Penalized SVM')
row.append(L1AvgExactLoss)
row.append(L1AvgHammingLoss)
tableData.append(row)
row = []
row.append('L1-Penalized SVM with SMOTE')
row.append(L1SmoteAvgExactLoss)
row.append(L1SmoteAvgHammingLoss)
tableData.append(row)
tableHeaders = ['Model', 'Exact Match Loss', 'Hamming Loss']
printDatainTable(tableHeaders, tableData)

|                    Model                    | Exact Match Loss | Hamming Loss |
|             Gaussian Kernel SVM             |      0.014       |    0.009     |
+---------------------------------------------+------------------+--------------+
| Gaussian Kernel SVM (Standardized Features) |      0.013       |    0.009     |
+---------------------------------------------+------------------+--------------+
|              L1-Penalized SVM               |      0.087       |     0.05     |
+---------------------------------------------+------------------+--------------+
|         L1-Penalized SVM with SMOTE         |      0.145       |    0.072     |
+---------------------------------------------+------------------+--------------+


### Gaussian Kernel SVM
Average Exact Match Loss is higher than Average Hamming Loss for Gaussian Kernel SVM.

### L1-Penalized SVM
In L1-Penalized SVM, the Average Exact Match Loss and Average Hamming Loss have increased as compared to Gaussian Kernel SVM.

### L1-Penalized SVM with SMOTE
There is a significant reduction in Average Exact Match Loss for L1-Penalized SVM after balancing the classes using SMOTE.

### Conclusion 
Comparing all the three models  based on Average Exact Match Loss and Average Hamming Loss, we can conclude that Gaussian Kernel SVM is performing the best.

# Determine optimal K for K-Means Clustering.

In [28]:
iterativeBestCluster = {}
for iteration in range(1, 51):
    silhouetteScoreDict = {}
    for k in range(2, 51):
        kMeansModel = KMeans(n_clusters=k)
        kMeansModel = kMeansModel.fit(dataX)
        clusterIndex = kMeansModel.labels_
        silhouetteScore = silhouette_score(dataX, clusterIndex)
        silhouetteScoreDict[k] = silhouetteScore

    sortedScores = sorted(silhouetteScoreDict.items(), key=operator.itemgetter(1), reverse=True)
    bestCluster = sortedScores[0][0]
    iterativeBestCluster[iteration] = bestCluster

In [29]:
print('---------------------------------------------')
print('Optimal Number of Clusters for 50 Iterations:')
print('---------------------------------------------')
tableData = []
for iteration in range(1, 51):
    row = []
    row.append('Iteration ' + str(iteration))
    row.append(iterativeBestCluster[iteration])
    tableData.append(row)
tableHeaders = ['Iteration', 'Optimal Number of Clusters']
printDatainTable(tableHeaders, tableData)

---------------------------------------------
Optimal Number of Clusters for 50 Iterations:
---------------------------------------------
|  Iteration   | Optimal Number of Clusters |
| Iteration 1  |             4              |
+--------------+----------------------------+
| Iteration 2  |             4              |
+--------------+----------------------------+
| Iteration 3  |             4              |
+--------------+----------------------------+
| Iteration 4  |             4              |
+--------------+----------------------------+
| Iteration 5  |             4              |
+--------------+----------------------------+
| Iteration 6  |             4              |
+--------------+----------------------------+
| Iteration 7  |             4              |
+--------------+----------------------------+
| Iteration 8  |             4              |
+--------------+----------------------------+
| Iteration 9  |             4              |
+--------------+------------------

In [30]:
iterationClusterIndex = {}
for iteration in range(1, 51):
    finalKMeansModel = KMeans(n_clusters=bestCluster)
    finalKMeansModel = finalKMeansModel.fit(dataX)
    clusterIndex = finalKMeansModel.labels_
    iterationClusterIndex[iteration] = clusterIndex

# For each cluster, determine its family, genus and species.

In [31]:
tableData = []
for iteration in range(1, 51):
    bestCluster = iterativeBestCluster[iteration]
    clusterIndex = iterationClusterIndex[iteration]
    clusterLabel = []
    for i in range(0, bestCluster):
        family = {}
        genus = {}
        species = {}
        for j in range(0, len(dataY)):
            if clusterIndex[j]==i:
                familyClass = dataY['Family'][j]
                if familyClass in family.keys():
                    family[familyClass] += 1
                else:
                    family[familyClass] = 1

                genusClass = dataY['Genus'][j]
                if genusClass in genus.keys():
                    genus[genusClass] += 1
                else:
                    genus[genusClass] = 1

                speciesClass = dataY['Species'][j]
                if speciesClass in species.keys():
                    species[speciesClass] += 1
                else:
                    species[speciesClass] = 1

        sortedFamily = sorted(family.items(), key=operator.itemgetter(1), reverse=True)
        sortedGenus = sorted(genus.items(), key=operator.itemgetter(1), reverse=True)
        sortedSpecies = sorted(species.items(), key=operator.itemgetter(1), reverse=True)

        familyLabel = sortedFamily[0][0]
        genusLabel = sortedGenus[0][0]
        speciesLabel = sortedSpecies[0][0]

        label = {}
        label['Family'] = familyLabel
        label['Genus'] = genusLabel
        label['Species'] = speciesLabel
        clusterLabel.append(label)
  
    for i in range(0, bestCluster):
        row = []
        row.append('Iteration ' + str(iteration))
        row.append('Cluster ' + str(i+1))
        row.append(clusterLabel[i]['Family'])
        row.append(clusterLabel[i]['Genus'])
        row.append(clusterLabel[i]['Species'])
        tableData.append(row)
    
print('----------------------------------------------------')
print('Cluster Labels for each cluster in 50 Iterations:')
print('----------------------------------------------------')
tableHeaders = ['Iteration', 'Cluster Name', 'Family Label', 'Genus Label', 'Species Label']
printDatainTable(tableHeaders, tableData)

----------------------------------------------------
Cluster Labels for each cluster in 50 Iterations:
----------------------------------------------------
|  Iteration   | Cluster Name |  Family Label   | Genus Label |     Species Label      |
| Iteration 1  |  Cluster 1   |     Hylidae     |  Hypsiboas  |  HypsiboasCinerascens  |
+--------------+--------------+-----------------+-------------+------------------------+
| Iteration 1  |  Cluster 2   |     Hylidae     |  Hypsiboas  |   HypsiboasCordobae    |
+--------------+--------------+-----------------+-------------+------------------------+
| Iteration 1  |  Cluster 3   | Leptodactylidae |  Adenomera  | AdenomeraHylaedactylus |
+--------------+--------------+-----------------+-------------+------------------------+
| Iteration 1  |  Cluster 4   | Leptodactylidae |  Adenomera  |     AdenomeraAndre     |
+--------------+--------------+-----------------+-------------+------------------------+
| Iteration 2  |  Cluster 1   |  Dendrobati

# Calculate average Hamming Score and Hamming Loss.

In [32]:
avgHammingLoss = []
avgHammingScore = []
for iteration in range(1, 51):
    clusterIndex = iterationClusterIndex[iteration]
    predictedFamilyY = []
    predictedGenusY = []
    predictedSpeciesY = []
    for i in range(0, len(dataX)):
        clusterName = clusterIndex[i]
        label = clusterLabel[clusterName]
        predFamilyLabel = label['Family']
        predGenusLabel = label['Genus']
        predSpeciesLabel = label['Species']

        predictedFamilyY.append(predFamilyLabel)
        predictedGenusY.append(predGenusLabel)
        predictedSpeciesY.append(predSpeciesLabel)

    actualFamilyLabel = np.array(dataY['Family'])
    actualGenusLabel = np.array(dataY['Genus'])
    actualSpeciesLabel = np.array(dataY['Species'])

    predictedFamilyY = np.array(predictedFamilyY)
    predictedGenusY = np.array(predictedGenusY)
    predictedSpeciesY = np.array(predictedSpeciesY)

    familyHammingLoss = hamming_loss(predictedFamilyY, actualFamilyLabel)
    familyHammingLoss = round(familyHammingLoss, 3)
    familyHammingScore = accuracy_score(predictedFamilyY, actualFamilyLabel)
    familyHammingScore = round(familyHammingScore, 3)

    genusHammingLoss = hamming_loss(predictedGenusY, actualGenusLabel)
    genusHammingLoss = round(genusHammingLoss, 3)
    genusHammingScore = accuracy_score(predictedGenusY, actualGenusLabel)
    genusHammingScore = round(genusHammingScore, 3)

    speciesHammingLoss = hamming_loss(predictedSpeciesY, actualSpeciesLabel)
    speciesHammingLoss = round(speciesHammingLoss, 3)
    speciesHammingScore = accuracy_score(predictedSpeciesY, actualSpeciesLabel)
    speciesHammingScore = round(speciesHammingScore, 3)
    
    avgHammingLossIteration = (familyHammingLoss + genusHammingLoss + speciesHammingLoss)/3
    avgHammingLossIteration = round(avgHammingLossIteration, 3)
    avgHammingScoreIteration = (familyHammingScore + genusHammingScore + speciesHammingScore)/3
    avgHammingScoreIteration = round(avgHammingScoreIteration, 3)
    
    avgHammingLoss.append(avgHammingLossIteration)
    avgHammingScore.append(avgHammingScoreIteration)

In [33]:
tableData = []
for iteration in range(1, 51):
    row = []
    row.append('Iteration ' + str(iteration))
    row.append(avgHammingLoss[iteration-1])
    row.append(avgHammingScore[iteration-1])
    tableData.append(row)
print('----------------------------------------------------------------')
print('Hamming Loss/Hamming Distance & Hamming Score for 50 Iterations:')
print('----------------------------------------------------------------')
tableHeaders = ['Iteration', 'Hamming Loss/Hamming Distance', 'Hamming Score']
printDatainTable(tableHeaders, tableData)

----------------------------------------------------------------
Hamming Loss/Hamming Distance & Hamming Score for 50 Iterations:
----------------------------------------------------------------
|  Iteration   | Hamming Loss/Hamming Distance | Hamming Score |
| Iteration 1  |             0.884             |     0.116     |
+--------------+-------------------------------+---------------+
| Iteration 2  |             0.905             |     0.095     |
+--------------+-------------------------------+---------------+
| Iteration 3  |             0.937             |     0.063     |
+--------------+-------------------------------+---------------+
| Iteration 4  |             0.428             |     0.572     |
+--------------+-------------------------------+---------------+
| Iteration 5  |             0.428             |     0.572     |
+--------------+-------------------------------+---------------+
| Iteration 6  |             0.822             |     0.178     |
+--------------+---------

In [34]:
meanHammingLoss = statistics.mean(avgHammingLoss)
meanHammingLoss = round(meanHammingLoss, 3)

stdHammingLoss = statistics.stdev(avgHammingLoss)
stdHammingLoss = round(stdHammingLoss, 3)

meanHammingScore = statistics.mean(avgHammingScore)
meanHammingScore = round(meanHammingScore, 3)

stdHammingScore = statistics.stdev(avgHammingScore)
stdHammingScore = round(stdHammingScore, 3)

print('---------------------------------------------------------------------')
print('Average & Standard Deviation for 50 Iterations:')
print('---------------------------------------------------------------------')
tableData = []
row = []
row.append('Average')
row.append(meanHammingLoss)
row.append(meanHammingScore)
tableData.append(row)
row = []
row.append('Standard Deviation')
row.append(stdHammingLoss)
row.append(stdHammingScore)
tableData.append(row)
tableHeaders = ['Measure', 'Hamming Loss/Hamming Distance', 'Hamming Score']
printDatainTable(tableHeaders, tableData)

---------------------------------------------------------------------
Average & Standard Deviation for 50 Iterations:
---------------------------------------------------------------------
|      Measure       | Hamming Loss/Hamming Distance | Hamming Score |
|      Average       |             0.616             |     0.384     |
+--------------------+-------------------------------+---------------+
| Standard Deviation |             0.279             |     0.279     |
+--------------------+-------------------------------+---------------+
