In [58]:
import numpy as np
import pandas as pa
import zipfile 
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix
from sklearn import metrics, preprocessing
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import normalize
from imblearn.over_sampling import SMOTE
from sklearn.svm import LinearSVC
from sklearn.preprocessing import Normalizer
from sklearn.multiclass import OneVsRestClassifier

In [75]:
#Exact Match: Exact Match loss is a very strict parameter which considers the 
#sample correctly classified only if all the three labels are classified correctly. 
#Any partial correct answer is considered as wrong classification.

#Hamming Loss: Hamming loss is a less strict parameter which takes into account partial correct answers.
#Hence, hamming loss usually gives a lower value error value compared to exact match loss as partial correct 
#answers are taken into account and not completely classified as wrong classification.

In [37]:
cd Downloads

[WinError 2] The system cannot find the file specified: 'Downloads'
C:\Users\Shilpa\Downloads


In [38]:
db = zipfile.ZipFile("Anuran_Calls(MFCCs).zip")
data = pa.read_csv(db.open("Frogs_MFCCs.csv"))

In [39]:
data_train, data_test = train_test_split(data, test_size=0.30, random_state=42, shuffle = True)
data_train.reset_index(drop=True,inplace=True)
data_test.reset_index(drop = True, inplace=True)

In [40]:
X = data.iloc[:, :22]
y = data.iloc[:, 22:-1]
y_family = data['Family']
y_genus = data['Genus']
y_species = data['Species']

In [41]:
#Splitting the data into test and train data in the ratio of 30:70
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.3, random_state=42, shuffle = True)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5036, 22), (5036, 3), (2159, 22), (2159, 3))

In [42]:
#Normalizing the Training data and Testing data
norm_x_train = Normalizer().fit(X_train)
norm_x_train = norm_x_train.transform(X_train)
norm_x_test = Normalizer().fit(X_test)
norm_x_test = norm_x_test.transform(X_test)

In [44]:
#Classification using Gaussian SVM for each label
def svm_gaussian(norm_x_train, y_train):
    params = {'C': [1, 10, 100, 1000], 'gamma': [0.1, 0.01, 1, 2, 3], 'kernel': ['rbf']}
    search = GridSearchCV(estimator=SVC(), param_grid=params, cv=10)
    search.fit(norm_x_train, y_train)
    print("Best parameter values:", search.best_params_)
    print("CV Score with best parameter values:", search.best_score_)
    return search.best_params_, search.best_score_, search

In [45]:
#Best parameter values for Family, Genus, Species
family_param, family_score, family_model = svm_gaussian(norm_x_train, y_train['Family'])
genus_param, genus_score, genus_model = svm_gaussian(norm_x_train, y_train['Genus'])
species_param, species_score, species_model = svm_gaussian(norm_x_train, y_train['Species'])

Best parameter values: {'C': 10, 'gamma': 3, 'kernel': 'rbf'}
CV Score with best parameter values: 0.9926528991262907
Best parameter values: {'C': 100, 'gamma': 3, 'kernel': 'rbf'}
CV Score with best parameter values: 0.9914614773629865
Best parameter values: {'C': 10, 'gamma': 3, 'kernel': 'rbf'}
CV Score with best parameter values: 0.9912629070691025


In [46]:
#Classification using best parameters obtained from train data
def svm_gaussian_test(norm_x_train, y_train, C_, gamma_, norm_x_test):
    svm = SVC(kernel="rbf", C=C_, gamma=gamma_)
    svm.fit(norm_x_train, y_train)   
    y_predict = svm.predict(norm_x_test)
    return y_predict

In [47]:
# Family
y_pred_fam = svm_gaussian_test(norm_x_train, y_train['Family'], family_param['C'], family_param['gamma'], norm_x_test)
y_pred_fam = pa.DataFrame(y_pred_fam)
# Genus
y_pred_gen = svm_gaussian_test(norm_x_train, y_train['Genus'], genus_param['C'], genus_param['gamma'], norm_x_test)
y_pred_gen = pa.DataFrame(y_pred_gen)
# Species
y_pred_spe = svm_gaussian_test(norm_x_train, y_train['Species'], species_param['C'], species_param['gamma'], norm_x_test)
y_pred_spe = pa.DataFrame(y_pred_spe)
# combine
y_predict = pa.concat([y_pred_fam, y_pred_gen, y_pred_spe], axis=1, sort=False)

In [48]:
#Function for calculating Hamming Loss
def hammingLoss(y_predict, y_test):
    loss_list = list()
    for i in range(len(y_test)):
        loss_list.append(hamming_loss(y_predict.iloc[i,:], y_test.iloc[i,:]))
    return sum(loss_list) / len(loss_list)

In [49]:
#Function for calculating Exact Match
def exactMatch(y_predict, y_test):
    loss_list = list()
    for i in range(len(y_test)):
        if set(y_predict.values[i,:]) == set(y_test.iloc[i,:]):
            loss_list.append(1)
        else:
            loss_list.append(0)
    return sum(loss_list) / len(loss_list)

In [50]:
#Hamming loss and Exact math for the Gaussian kernel
hammingloss_gauss = hammingLoss(y_predict, y_test)
print('Hamming loss for Gaussian Kernel: ', hammingloss_gauss)       
exactscore_gauss = exactMatch(y_predict, y_test)
print('Exact Match Score for Gaussian Kernel: ', exactscore_gauss)

Hamming loss for Gaussian Kernel:  0.007565230816736144
Exact Match Score for Gaussian Kernel:  0.9874942102825383


In [51]:
#Cross validation of linear SVM for Family
clf = svm.SVC(kernel='linear', C=10)
scores = cross_val_score(clf, norm_x_train, y_train['Family'], cv=10)
scores.mean()

0.9529444637535137

In [52]:
#Cross validation of linear SVM for Genus
clf = svm.SVC(kernel='linear', C=10)
scores = cross_val_score(clf, norm_x_train, y_train['Genus'], cv=10)
scores.mean()

0.972600167232301

In [53]:
#Cross validation of linear SVM for Species
clf = svm.SVC(kernel='linear', C=10)
scores = cross_val_score(clf, norm_x_train, y_train['Species'], cv=10)
scores.mean()

0.9789516254497059

In [59]:
def OneVsRest_linear(trainDataLinearOvR, trainLabelLinearOvR,classEstimated):
    params = {'estimator__C':[1, 10, 100, 1000]}
    print(classEstimated)
    svc = LinearSVC(penalty='l1', dual = False)
    classifier = GridSearchCV(OneVsRestClassifier(svc, n_jobs=-1), params, cv=10)
    classifier.fit(trainDataLinearOvR, trainLabelLinearOvR)
    print("Best Weight of SVM penalty = ", classifier.best_params_['estimator__C'])
    return classifier,classifier.best_params_['estimator__C']

In [60]:
svc_family, C_fam = OneVsRest_linear(norm_x_train, y_train['Family'],'Family')
svc_genus, C_gen = OneVsRest_linear(norm_x_train, y_train['Genus'],'Genus')
svc_species, C_spe= OneVsRest_linear(norm_x_train, y_train['Species'],'Species') 

Family
Best Weight of SVM penalty =  1000
Genus
Best Weight of SVM penalty =  100
Species
Best Weight of SVM penalty =  100


In [61]:
#Classification using L1 penalty in SVM
def svm_l1_best(norm_x_train, y_train, C, norm_x_test):
    svm = LinearSVC(penalty='l1', dual=False, C=C)
    svm.fit(norm_x_train, y_train)   
    y_predict = svm.predict(norm_x_test)
    return y_predict

In [62]:
# Family
y_p_fam_l1 = svm_l1_best(norm_x_train, y_train['Family'], C_fam, norm_x_test)
y_p_fam_l1 = pa.DataFrame(y_p_fam_l1)
# Genus
y_p_gen_l1 = svm_l1_best(norm_x_train, y_train['Genus'], C_gen, norm_x_test)
y_p_gen_l1 = pa.DataFrame(y_p_gen_l1)
# Species
y_p_spe_l1 = svm_l1_best(norm_x_train, y_train['Species'], C_spe, norm_x_test)
y_p_spe_l1 = pa.DataFrame(y_p_spe_l1)
# combine
y_predict_l1 = pa.concat([y_p_fam_l1, y_p_gen_l1, y_p_spe_l1], axis=1, sort=False)

In [64]:
#Hamming loss and Exact score for linear SVM
hammingloss_svm_l1 = hammingLoss(y_predict_l1, y_test)
print('Hamming loss for Gaussian Kernel: ', hammingloss_svm_l1)       
exactscore_svm_l1 = exactMatch(y_predict_l1, y_test)
print('Exact Match Score for Gaussian Kernel: ', exactscore_svm_l1)

Hamming loss for Gaussian Kernel:  0.04662652462559824
Exact Match Score for Gaussian Kernel:  0.9231125521074571


In [65]:
#Using SMOTE and SVM
sm = SMOTE(random_state=42)
trainDataSMOTE_family, trainLabelSMOTE_family = sm.fit_sample(norm_x_train, y_train["Family"])
trainDataSMOTE_genus, trainLabelSMOTE_genus = sm.fit_sample(norm_x_train, y_train["Genus"])
trainDataSMOTE_species, trainLabelSMOTE_species = sm.fit_sample(norm_x_train, y_train["Species"])

In [66]:
res_family, C_res_fam = OneVsRest_linear(trainDataSMOTE_family, trainLabelSMOTE_family, "Family")
res_genus, C_res_gen = OneVsRest_linear(trainDataSMOTE_genus, trainLabelSMOTE_genus, "Genus")
res_species, C_res_spe = OneVsRest_linear(trainDataSMOTE_species, trainLabelSMOTE_species, "Species")

Family
Best Weight of SVM penalty =  1000
Genus
Best Weight of SVM penalty =  100
Species
Best Weight of SVM penalty =  10


In [67]:
#Using the best parameter in SMOTE with SVM
def res_SVM_best(norm_x_train, y_train, C_, norm_x_test):
    sm = SMOTE(random_state=42)
    X_res, y_res = sm.fit_sample(norm_x_train, y_train)   
    svm = LinearSVC(penalty='l1', dual=False, C=C_)
    svm.fit(norm_x_train, y_train)   
    y_predict = svm.predict(X_test)
    return y_predict

In [69]:
# Family
y_p_fam_l1_rs = res_SVM_best(norm_x_train, y_train['Family'], C_res_fam, norm_x_test)
y_p_fam_l1_rs = pa.DataFrame(y_p_fam_l1_rs)
# Genus
y_p_gen_l1_rs = res_SVM_best(norm_x_train, y_train['Genus'], C_res_gen, norm_x_test)
y_p_gen_l1_rs = pa.DataFrame(y_p_gen_l1_rs)
# Species
y_p_spe_l1_rs = res_SVM_best(norm_x_train, y_train['Species'], C_res_spe, norm_x_test)
y_p_spe_l1_rs = pa.DataFrame(y_p_spe_l1_rs)
# combine
y_predict_l1_rs = pa.concat([y_p_fam_l1_rs, y_p_gen_l1_rs, y_p_spe_l1_rs], axis=1, sort=False)

In [71]:
#Hamming loss and Exact score for SMOTE with SVM
hammingloss_rs = hammingLoss(y_predict_l1_rs, y_test)
print('Hamming loss for Gaussian Kernel: ', hammingloss_rs)       
exactscore_rs = exactMatch(y_predict_l1_rs, y_test)
print('Exact Match Score for Gaussian Kernel: ', exactscore_rs)

Hamming loss for Gaussian Kernel:  0.20518758684576135
Exact Match Score for Gaussian Kernel:  0.6586382584529875


In [72]:
score_report = pa.DataFrame()
score_report['Hamming Loss'] = [hammingloss_gauss, hammingloss_svm_l1, hammingloss_rs]
score_report['Exact Match Score'] = [exactscore_gauss, exactscore_svm_l1, exactscore_rs]
score_report.index = ['Gaussian', "l1 penalty", 'resampled l1']
score_report

Unnamed: 0,Hamming Loss,Exact Match Score
Gaussian,0.007565,0.987494
l1 penalty,0.046627,0.923113
resampled l1,0.205188,0.658638


In [74]:
'''We can observe from the above table that the Hamming loss and Exact match loss of 
SMOTE with SVM is less compare to other classifications we implemented. 
This proves that data was imbalanced and using SMOTE was useful in classification.'''

'We can observe from the above table that the Hamming loss and Exact match loss of \nSMOTE with SVM is less compare to other classifications we implemented. \nThis proves that data was imbalanced and using SMOTE was useful in classification.'