In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import hamming_loss
import warnings
warnings.filterwarnings("ignore")

## 1. Multi-class and Multi-Label Classification Using Support Vector Machines

### a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [2]:
df = pd.read_csv('../data/Frogs_MFCCs.csv')

In [3]:
train, test = train_test_split(df, test_size = 0.3)

In [4]:
train.reset_index(drop=True, inplace = True)
train_x = train.drop(columns = ['Family','Genus','Species','RecordID'])
train_y_family = train['Family']
train_y_genus = train['Genus']
train_y_species = train['Species']
train_x

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.158646,0.534379,0.276438,-0.113383,0.294784,0.157811,-0.213827,0.052966,0.175000,...,0.217513,0.115549,-0.142948,-0.043739,-0.027222,-0.028655,0.082999,0.146133,0.049442,-0.084164
1,1.0,0.157730,0.129206,0.591634,0.275526,-0.006178,-0.153495,0.082747,0.206555,-0.110400,...,0.324661,-0.203326,-0.268547,0.113816,0.179313,-0.037457,-0.141004,-0.094309,0.106879,0.186066
2,1.0,0.828310,0.243861,0.135409,0.166489,0.386477,0.435395,0.219620,0.026417,-0.152293,...,-0.275076,-0.034042,0.042109,-0.171347,-0.301549,-0.475986,-0.471597,-0.208141,-0.007926,0.030605
3,1.0,0.015939,0.121945,0.479598,0.053737,-0.005948,-0.078343,0.044297,0.289690,0.084880,...,0.199034,-0.174799,-0.165752,0.126657,0.127599,0.045503,0.004640,-0.071200,0.067910,0.152645
4,1.0,0.145449,0.215023,0.688683,0.192779,0.067867,-0.044627,0.082023,0.207315,0.004216,...,0.152739,-0.223397,-0.067073,0.299766,0.159250,-0.076507,-0.077418,-0.077509,0.118048,0.254046
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5031,1.0,0.164155,0.240342,0.603025,0.163849,0.006236,-0.101984,0.094291,0.311779,0.033328,...,0.281446,-0.175565,-0.220382,0.123637,0.195625,0.021879,-0.069427,-0.089395,0.067826,0.112058
5032,1.0,0.408306,0.264014,0.435050,0.171421,0.127738,0.191343,-0.022585,-0.131846,0.047509,...,-0.276959,0.153144,0.202171,-0.176625,-0.234048,-0.022756,-0.055081,-0.095792,0.009612,0.039744
5033,1.0,0.492655,0.276040,0.407506,0.192651,0.048210,-0.055457,0.008024,0.187479,0.138029,...,0.163116,0.157449,-0.101399,-0.096676,0.011191,0.042344,0.008098,-0.037977,-0.103140,-0.022722
5034,1.0,0.017325,0.164911,0.625717,0.289707,0.109264,-0.113049,-0.052196,0.215732,0.049555,...,0.200085,-0.326034,-0.185656,0.327710,0.255466,-0.103170,-0.182513,-0.040778,0.222275,0.213486


In [5]:
test.reset_index(drop=True, inplace = True)
test_x = test.drop(columns = ['Family','Genus','Species','RecordID'])
#test_y_family = test[['Family']]
#test_y_genus = test['Genus'].tolist()
#test_y_species = test['Species'].tolist()
#type(test_y_family)

### b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:

#### i) Research exact match and hamming score/ loss methods for evaluating multilabel classification and use them in evaluating the classifiers in this problem

Exact match

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html

Hamming loss is the fraction of wrong labels to the total number of labels. In multi-class classification, hamming loss is calculated as the hamming distance between y_true and y_pred. 

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html


#### ii) Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized 2 and raw attributes and report the results.

In [6]:
from sklearn import svm
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss

In [7]:
def svc_gaussian (column_name):
    train_y = train[column_name]
    test_y = test[[column_name]]
    clf = svm.SVC(kernel = 'rbf')
    C_list = []
    k = range(-3,6)#-3,6
    for i in k:
        C_list.append(10**(i))
    params = {'C':C_list,'gamma': np.linspace(0.1,2,20)}#(0.1,2,20)
    grid = GridSearchCV(clf, param_grid = params, cv = 10)
    grid.fit(train_x, train_y)
    pred_y = grid.predict(test_x)
    optimal_estimator = grid.best_estimator_
    optimal_params = grid.best_params_
    hamming_los = hamming_loss(test_y,pred_y)
    #print ('hamming_loss:', hamming_los)
    correct = 0
    test_y_ = test[column_name].tolist()
    for i in range(len(test_y_)):
        if test_y_[i] == pred_y[i]:
            correct+=1
    exact_match = correct/len(test_y_)
    #print ('exact_match:', exact_match)
    return ('optimal_params:',optimal_params,'hamming_loss:', hamming_los,'exact_match:', exact_match)

In [8]:
svc_gaussian('Family')

('optimal_params:',
 {'C': 100, 'gamma': 1.5999999999999999},
 'hamming_loss:',
 0.006021306160259379,
 'exact_match:',
 0.9939786938397406)

In [None]:
svc_gaussian('Genus')

In [None]:
svc_gaussian('Species')

#### iii) Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize4 the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [None]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(train_x)
#mean = scaler.mean_
#train_x = scaler.transform(train_x)
#test_x = scaler.transform(test_x)

In [None]:
def svc_L1_penalized (column_name):
    train_y = train[column_name]
    test_y = test[[column_name]]
    clf = svm.LinearSVC(penalty = 'l1', dual=False)
    C_list = []
    k = range(-3,6)#-3,6
    for i in k:
        C_list.append(10**(i))
    params = {'C':C_list}
    grid = GridSearchCV(svm.LinearSVC(penalty = 'l1', dual=False), param_grid = params, cv = 10)
    grid.fit(train_x, train_y)
    pred_y = grid.predict(test_x)
    optimal_estimator = grid.best_estimator_
    optimal_params = grid.best_params_
    hamming_los = hamming_loss(test_y,pred_y)
    #print ('hamming_loss:', hamming_los)
    correct = 0
    test_y_ = test[column_name].tolist()
    for i in range(len(test_y_)):
        if test_y_[i] == pred_y[i]:
            correct+=1
    exact_match = correct/len(test_y_)
    #print ('exact_match:', exact_match)
    return ('optimal_params:',optimal_params,'hamming_loss:', hamming_los,'exact_match:', exact_match)

In [None]:
svc_L1_penalized('Family')

In [None]:
svc_L1_penalized('Genus')

In [None]:
svc_L1_penalized('Species')

#### iv) Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [None]:
from imblearn.over_sampling import SMOTE


In [None]:
def svc_L1_SMOTE (column_name):
    smt = SMOTE(random_state=42)
    train_x_smote, train_y_smote = smt.fit_resample(train_x,train[column_name])
    test_y = test[[column_name]]
    
    clf = svm.LinearSVC(penalty = 'l1', dual=False)
    C_list = []
    k = range(-3,6)#-3,6
    for i in k:
        C_list.append(10**(i))
    params = {'C':C_list}
    grid = GridSearchCV(svm.LinearSVC(penalty = 'l1', dual=False), param_grid = params, cv = 10)
    grid.fit(train_x_smote, train_y_smote)
    pred_y = grid.predict(test_x)
    optimal_estimator = grid.best_estimator_
    optimal_params = grid.best_params_
    hamming_los = hamming_loss(test_y,pred_y)
    #print ('hamming_loss:', hamming_los)
    correct = 0
    test_y_ = test[column_name].tolist()
    for i in range(len(test_y_)):
        if test_y_[i] == pred_y[i]:
            correct+=1
    exact_match = correct/len(test_y_)
    #print ('exact_match:', exact_match)
    return ('optimal_params:',optimal_params,'hamming_loss:', hamming_los,'exact_match:', exact_match)

In [None]:
svc_L1_SMOTE('Family')

In [None]:
svc_L1_SMOTE('Genus')

In [None]:
svc_L1_SMOTE('Species')