# Lab 9: Support Vector Machines

The objective of this lab is 

In [1]:
import warnings
warnings.filterwarnings('ignore')

from time import time
from datetime import timedelta

import pandas as pd
import seaborn as sns
import numpy as np

from dython.nominal import associations
from dython.nominal import correlation_ratio
from dython.nominal import cramers_v

from scipy.stats import chi2_contingency 
from scipy.stats import pearsonr 

from sklearn.metrics import f1_score, accuracy_score
from sklearn.preprocessing import minmax_scale

from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import fetch_lfw_people
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

from matplotlib import pyplot as plt 

In [2]:
data_learn = pd.read_csv("log_ILDS_train_X.csv", delimiter = ",", header = None)
X_test = pd.read_csv("log_ILDS_test_X.csv", delimiter = ",", header = None)

data_learn.columns = ['Age', 'TP', 'ALB', 'AR', 'DBratio',
                      'logTB', 'logDB', 'logAlkphos', 'logSgpt', 'logSgot',
                      'Female', 'Target']
X_test.columns = ['Age', 'TP', 'ALB', 'AR', 'DBratio',
                  'logTB', 'logDB', 'logAlkphos', 'logSgpt', 'logSgot',
                  'Female']

X_learn = data_learn.drop(columns = ["Target"])
y_learn = data_learn["Target"]

In [3]:
print(X_learn.shape, y_learn.shape, X_test.shape)

(620, 11) (620,) (116, 11)


In [4]:
def compute_metrics(y_pred, y_real):
    f1_score_macro = f1_score(y_pred, y_real, average = "macro")
    f1_score_0 = f1_score(y_pred, y_real, pos_label = 0)
    f1_score_1 = f1_score(y_pred, y_real, pos_label = 1)
    accuracy = accuracy_score(y_pred, y_real)
    return [f1_score_macro, f1_score_0, f1_score_1, accuracy]

results = pd.DataFrame(columns=['Kernel', 'C', 'F1-Macro', 'F1-Class0', 'F1-Class1', 'Accuracy'])

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X_learn, y_learn, train_size = 0.80, random_state = 42)

knn = KNeighborsClassifier()

knn.fit(X_train, y_train)
y_pred = knn.predict(X_val)

results.loc['KNN', :] = ['-', '-'] + compute_metrics(y_pred,y_val)
results

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
KNN,-,-,0.716895,0.666667,0.767123,0.725806


## Linear SVM

We have to remove the Female variable because it is categorical.

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_learn, y_learn, train_size = 0.80, random_state = 42)
X_train = X_train.drop(columns = ["Female"])
X_val = X_val.drop(columns = ["Female"])

In [7]:
linear_svc = LinearSVC()

linear_svc.fit(X_train, y_train)
y_pred = linear_svc.predict(X_val)
results.loc['LinearSVC-default', :] = ['Linear', 1] + compute_metrics(y_pred, y_val)

results.sort_values(by='F1-Macro', ascending=False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484


In [8]:
cv_results = pd.DataFrame(columns=['Kernel', 'C', 'F1-Macro', 'F1-Class0', 'F1-Class1', 'Accuracy'])

Cs = range(1, 51)
for c in Cs:
    svc = SVC(kernel='linear', C = c)
    svc.fit(X_train,y_train)
    y_pred = svc.predict(X_val)
    cv_results.loc['LinearSVC-{}'.format(c), :] = ['Linear', c] + compute_metrics(y_pred,y_val)
        
best = cv_results.sort_values(by='F1-Macro',ascending=False).iloc[0,:]
results.loc['LinearSVC-best',:] = best

In [9]:
cv_results.sort_values(by = 'F1-Macro', ascending = False).head()

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
LinearSVC-2,Linear,2,0.706933,0.678571,0.735294,0.709677
LinearSVC-26,Linear,26,0.705929,0.672727,0.73913,0.709677
LinearSVC-38,Linear,38,0.705929,0.672727,0.73913,0.709677
LinearSVC-28,Linear,28,0.705929,0.672727,0.73913,0.709677
LinearSVC-29,Linear,29,0.705929,0.672727,0.73913,0.709677


In [10]:
results.sort_values(by = "F1-Macro", ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484


## Other kernels

### Radial-basis function

In [11]:
svr = SVC(kernel = "rbf")
svr.fit(X_train, y_train)
y_pred = svr.predict(X_val)

results.loc['RBF-SVC-default', :] = ['RBF', 1]+ compute_metrics(y_pred,y_val)

In [12]:
results.sort_values(by = "F1-Macro", ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484


In [13]:
cv_results_rbf = pd.DataFrame(columns=['Kernel', 'C', 'F1-Macro', 'F1-Class0', 'F1-Class1', 'Accuracy'])

Cs = range(1, 51)
for c in Cs:
    svr = SVC(kernel = 'rbf', C = c)
    svr.fit(X_train, y_train)
    y_pred = svr.predict(X_val)
    cv_results_rbf.loc['RBF-SVC-{}'.format(c), :] = ['RBF', c] + compute_metrics(y_pred,y_val)
        
best = cv_results_rbf.sort_values(by='F1-Macro',ascending=False).iloc[0,:]
results.loc['RBF-SVC-best',:] = best

In [14]:
cv_results_rbf.sort_values(by = 'F1-Macro', ascending = False).head()

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-50,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-1,RBF,1,0.752857,0.716981,0.788732,0.758065
RBF-SVC-31,RBF,31,0.751603,0.711538,0.791667,0.758065
RBF-SVC-33,RBF,33,0.751603,0.711538,0.791667,0.758065
RBF-SVC-34,RBF,34,0.751603,0.711538,0.791667,0.758065


In [15]:
results.sort_values(by = "F1-Macro", ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484


In [16]:
svs = SVC(kernel = 'sigmoid')

svs.fit(X_train,y_train)
y_pred = svs.predict(X_val)
results.loc['SigmoidSVC-default', :] = ['Sigmoid', 1]+ compute_metrics(y_pred,y_val)

results.sort_values(by='F1-Macro', ascending=False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484
SigmoidSVC-default,Sigmoid,1,0.628646,0.616667,0.640625,0.629032


In [17]:
cv_results_sigmoid = pd.DataFrame(columns=['Kernel', 'C', 'F1-Macro', 'F1-Class0', 'F1-Class1', 'Accuracy'])

Cs = range(1, 51)
for c in Cs:
    svs = SVC(kernel = 'sigmoid', C = c)
    svs.fit(X_train, y_train)
    y_pred = svs.predict(X_val)
    cv_results_sigmoid.loc['Sigmoid-SVC-{}'.format(c), :] = ['Sigmoid', c] + compute_metrics(y_pred, y_val)
        
best = cv_results_sigmoid.sort_values(by = 'F1-Macro',ascending=False).iloc[0,:]
results.loc['Sigmoid-SVC-best',:] = best

In [18]:
cv_results_sigmoid.sort_values(by = 'F1-Macro', ascending = False).head()

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
Sigmoid-SVC-50,Sigmoid,50,0.693229,0.683333,0.703125,0.693548
Sigmoid-SVC-48,Sigmoid,48,0.685463,0.682927,0.688,0.685484
Sigmoid-SVC-11,Sigmoid,11,0.684972,0.672269,0.697674,0.685484
Sigmoid-SVC-43,Sigmoid,43,0.677083,0.666667,0.6875,0.677419
Sigmoid-SVC-47,Sigmoid,47,0.677083,0.666667,0.6875,0.677419


In [19]:
results.sort_values(by = 'F1-Macro', ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
Sigmoid-SVC-best,Sigmoid,50,0.693229,0.683333,0.703125,0.693548
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484
SigmoidSVC-default,Sigmoid,1,0.628646,0.616667,0.640625,0.629032


In [20]:
svp = SVC(kernel = 'poly')

svp.fit(X_train, y_train)
y_pred = svp.predict(X_val)
results.loc['Poly-SVC-default', :] = ['Poly-2', 1] + compute_metrics(y_pred, y_val)

results.sort_values(by = 'F1-Macro', ascending=False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
Poly-SVC-default,Poly-2,1,0.698378,0.64,0.756757,0.709677
Sigmoid-SVC-best,Sigmoid,50,0.693229,0.683333,0.703125,0.693548
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484
SigmoidSVC-default,Sigmoid,1,0.628646,0.616667,0.640625,0.629032


In [21]:
cv_results_poly = pd.DataFrame(columns = ['Kernel', 'C', 'Degree', 'F1-Macro', 'F1-Class0', 'F1-Class1', 'Accuracy'])

Cs = range(1, 51)
degrees = [2, 3, 4, 5]
for c in Cs:
    for degree in degrees:
        svs = SVC(kernel = 'poly', C = c, degree = degree)
        svs.fit(X_train, y_train)
        y_pred = svs.predict(X_val)
        cv_results_poly.loc['Poly-SVC-{}-{}'.format(c, degree), :] = ['Poly', c, degree] + compute_metrics(y_pred, y_val)
        
best = cv_results_poly.sort_values(by = 'F1-Macro', ascending = False).iloc[0,:]
results.loc['Poly-SVC-best', :] = best

In [22]:
cv_results_poly.sort_values(by = 'F1-Macro', ascending = False).head()

Unnamed: 0,Kernel,C,Degree,F1-Macro,F1-Class0,F1-Class1,Accuracy
Poly-SVC-43-3,Poly,43,3,0.777023,0.742857,0.811189,0.782258
Poly-SVC-42-3,Poly,42,3,0.777023,0.742857,0.811189,0.782258
Poly-SVC-41-3,Poly,41,3,0.777023,0.742857,0.811189,0.782258
Poly-SVC-44-3,Poly,44,3,0.769333,0.735849,0.802817,0.774194
Poly-SVC-40-3,Poly,40,3,0.769333,0.735849,0.802817,0.774194


In [23]:
results.sort_values(by = 'F1-Macro', ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
Poly-SVC-best,Poly,43,0.777023,0.742857,0.811189,0.782258
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
Poly-SVC-default,Poly-2,1,0.698378,0.64,0.756757,0.709677
Sigmoid-SVC-best,Sigmoid,50,0.693229,0.683333,0.703125,0.693548
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484
SigmoidSVC-default,Sigmoid,1,0.628646,0.616667,0.640625,0.629032


In [24]:
X_train, X_val, y_train, y_val = train_test_split(X_learn, y_learn, train_size = 0.80, random_state = 42)

def gaussian_kernel (x1, x2, female_weight):
    index_numerical = [True, True, True, True, True, True, True, True, True, True, False]
    female_similarity = female_weight if x1[-1] == x2[-1] else 1 - female_weight
    gamma = 1.0 / len(x1)
    sim = np.exp(-gamma * np.sum(np.power((x1 - x2), 2))) * female_similarity
    return sim

def kernel_matrix (X1, X2, female_weight):
    matrix = np.zeros((X1.shape[0], X2.shape[0]))
    for i, x1 in enumerate(X1):
        for j, x2 in enumerate(X2):
            matrix[i,j] = gaussian_kernel(x1, x2, female_weight)
    return matrix
            
cv_results_custom = pd.DataFrame(columns = ['Kernel','C','Female','F1-Macro','F1-Class0','F1-Class1','Accuracy'])

Cs = [1, 10, 20, 30, 40, 50, 60]
Fs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

for c in Cs:
    for f in Fs:
        clf = SVC(C = c, kernel = "precomputed")
        clf.fit(kernel_matrix(X_train.values, X_train.values, f), y_train)
        y_pred = clf.predict(kernel_matrix(X_val.values, X_train.values, f))
        cv_results_custom.loc['Custom-{}-{}'.format(c, f), :] = ['Custom', c, f] + compute_metrics(y_pred, y_val)

best = cv_results_custom.sort_values(by = 'F1-Macro', ascending = False).iloc[0,:]
results.loc['Custom-SVC-best', :] = best

In [25]:
results.sort_values(by = 'F1-Macro', ascending = False)

Unnamed: 0,Kernel,C,F1-Macro,F1-Class0,F1-Class1,Accuracy
Custom-SVC-best,Custom,40,0.78895,0.77193,0.80597,0.790323
Poly-SVC-best,Poly,43,0.777023,0.742857,0.811189,0.782258
RBF-SVC-best,RBF,50,0.754941,0.727273,0.782609,0.758065
RBF-SVC-default,RBF,1,0.752857,0.716981,0.788732,0.758065
KNN,-,-,0.716895,0.666667,0.767123,0.725806
LinearSVC-best,Linear,2,0.706933,0.678571,0.735294,0.709677
Poly-SVC-default,Poly-2,1,0.698378,0.64,0.756757,0.709677
Sigmoid-SVC-best,Sigmoid,50,0.693229,0.683333,0.703125,0.693548
LinearSVC-default,Linear,1,0.683818,0.66087,0.706767,0.685484
SigmoidSVC-default,Sigmoid,1,0.628646,0.616667,0.640625,0.629032


In [26]:
cv_results_custom.sort_values(by = 'F1-Macro', ascending = False).head()

Unnamed: 0,Kernel,C,Female,F1-Macro,F1-Class0,F1-Class1,Accuracy
Custom-40-0.6,Custom,40,0.6,0.78895,0.77193,0.80597,0.790323
Custom-60-0.7,Custom,60,0.7,0.781562,0.769231,0.793893,0.782258
Custom-60-0.8,Custom,60,0.8,0.77325,0.758621,0.787879,0.774194
Custom-60-0.6,Custom,60,0.6,0.77325,0.758621,0.787879,0.774194
Custom-30-0.8,Custom,30,0.8,0.772715,0.754386,0.791045,0.774194


## Trying with test dataset

In [27]:
X_test

Unnamed: 0,Age,TP,ALB,AR,DBratio,logTB,logDB,logAlkphos,logSgpt,logSgot,Female
0,-2.078832,0.648619,1.404934,1.590982,2.681419,-0.794539,-1.236238,2.031194,-0.520624,-0.571370,0
1,1.034421,2.089511,1.147577,-0.461597,-0.988557,0.173733,0.473897,-0.093123,0.784347,1.434449,0
2,0.912333,1.321035,1.404934,0.564692,0.112435,-0.794539,-0.696750,-0.645242,-0.243001,-1.442851,0
3,0.912333,0.840738,0.118150,-0.530016,-0.962343,1.355472,1.357269,-0.192949,3.173539,3.471001,0
4,0.179803,-0.888331,-0.525242,-0.119501,0.846431,-0.536889,-0.696750,-0.632716,-0.627645,0.172602,1
...,...,...,...,...,...,...,...,...,...,...,...
111,1.461731,1.897392,2.305683,1.590982,0.479433,-0.657641,-0.696750,-1.028742,0.120587,0.483158,1
112,-0.613771,2.665868,-1.425990,-2.172079,-0.861170,2.923124,2.496401,-1.687714,1.974128,3.112570,0
113,-0.186462,0.552560,-0.139207,-0.803693,0.479433,-0.657641,-0.696750,-1.569533,-0.806183,-0.848766,0
114,-1.651522,1.032857,0.118150,-0.803693,-0.254562,-0.952576,-0.696750,1.822124,-0.201858,-0.381018,0


In [48]:
X_learn_num = X_learn.drop(columns = ["Female"])
X_test_num = X_test.drop(columns = ["Female"])

### Linear

In [49]:
linear_svc = LinearSVC(C = 2)
linear_svc.fit(X_learn_num, y_learn)
y_pred = pd.DataFrame(linear_svc.predict(X_test_num))

labels = pd.DataFrame(columns = ["Id", "Label"])
labels.Id = y_pred.index
labels.Label = y_pred
labels.to_csv("svm/linear.csv", header = ["Id", "Label"])

In [50]:
labels

Unnamed: 0,Id,Label
0,0,1
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
111,111,1
112,112,0
113,113,1
114,114,0


### Sigmoid

In [51]:
sigmoid_svc = SVC(kernel = "sigmoid", C = 50)
sigmoid_svc.fit(X_learn_num, y_learn)
y_pred = pd.DataFrame(sigmoid_svc.predict(X_test_num))

labels = pd.DataFrame(columns = ["Id", "Label"])
labels.Id = y_pred.index
labels.Label = y_pred
labels.to_csv("svm/sigmoid.csv", header = ["Id", "Label"])

In [52]:
labels

Unnamed: 0,Id,Label
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1
...,...,...
111,111,1
112,112,1
113,113,1
114,114,0


### Polynomial

In [53]:
poly_svc = SVC(kernel = "poly", degree = 3, C = 43)
poly_svc.fit(X_learn_num, y_learn)
y_pred = pd.DataFrame(poly_svc.predict(X_test_num))

labels = pd.DataFrame(columns = ["Id", "Label"])
labels.Id = y_pred.index
labels.Label = y_pred
labels.to_csv("svm/poly.csv", header = ["Id", "Label"])

In [54]:
labels

Unnamed: 0,Id,Label
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
111,111,1
112,112,0
113,113,1
114,114,0


### Radial-basis function

In [55]:
rbf_svc = SVC(kernel = "rbf", C = 50)
rbf_svc.fit(X_learn_num, y_learn)
y_pred = pd.DataFrame(rbf_svc.predict(X_test_num))

labels = pd.DataFrame(columns = ["Id", "Label"])
labels.Id = y_pred.index
labels.Label = y_pred
labels.to_csv("svm/rbf.csv", header = ["Id", "Label"])

In [56]:
labels

Unnamed: 0,Id,Label
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1
...,...,...
111,111,0
112,112,0
113,113,0
114,114,0


### Gaussian kernel

In [61]:
custom_svc = SVC(kernel = "precomputed", C = 50)
custom_svc.fit(kernel_matrix(X_learn.values, X_learn.values, 0.6), y_learn)
y_pred = pd.DataFrame(custom_svc.predict(kernel_matrix(X_test.values, X_learn.values, 0.6)))

labels = pd.DataFrame(columns = ["Id", "Label"])
labels.Id = y_pred.index
labels.Label = y_pred
labels.to_csv("svm/gaussian.csv", header = ["Id", "Label"])

In [62]:
labels

Unnamed: 0,Id,Label
0,0,0
1,1,1
2,2,1
3,3,0
4,4,1
...,...,...
111,111,0
112,112,0
113,113,1
114,114,0
