In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Loading results of SVMs with different C and Kernels

In [27]:
polySVCResults= np.load('unscaledPolynomial.npy', allow_pickle= True)
polySVCResults = pd.DataFrame (polySVCResults,columns = ['C','test_score','train_score','f1_score'])

linearSVCResults= np.load('linearSVC.npy', allow_pickle= True)
linearSVCResults = pd.DataFrame (linearSVCResults,columns = ['C','test_score','train_score','f1_score'])

sigmoidSVCResults = np.load('sigmoid.npy', allow_pickle= True)
sigmoidSVCResults = pd.DataFrame (sigmoidSVCResults,columns = ['C','test_score','train_score','f1_score'])

rbfSVCResults = np.load('rbf.npy', allow_pickle= True)
rbfSVCResults = pd.DataFrame (rbfSVCResults,columns = ['C','test_score','train_score','f1_score'])

In [28]:
polySVCResults #they are all equal. 

Unnamed: 0,C,test_score,train_score,f1_score
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.795282,0.791738,0.704595
4,1000.0,0.795282,0.791738,0.704595


In [16]:
linearSVCResults #once again all equal

Unnamed: 0,C,test_score,train_score,f1_score
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.795282,0.791738,0.704595
4,1000.0,0.795282,0.791738,0.704595


In [18]:
rbfSVCResults

Unnamed: 0,C,test_score,train_score,f1_score
0,0.1,0.795282,0.791738,0.704595
1,1.0,0.795282,0.791738,0.704595
2,10.0,0.795282,0.791738,0.704595
3,100.0,0.792237,0.791928,0.70309
4,1000.0,0.792237,0.791928,0.70309


In [32]:
rbfSVCResults.iloc[rbfSVCResults['f1_score'].idxmax()]

C              0.100000
test_score     0.795282
train_score    0.791738
f1_score       0.704595
Name: 0, dtype: float64

In [30]:
sigmoidSVCResults

Unnamed: 0,C,test_score,train_score,f1_score
0,0.1,0.745814,0.731772,0.705217
1,1.0,0.723744,0.70988,0.696193
2,10.0,0.670472,0.66305,0.671251
3,100.0,0.667428,0.662288,0.669336
4,1000.0,0.667428,0.662288,0.669336


In [31]:
sigmoidSVCResults.iloc[sigmoidSVCResults['f1_score'].idxmax()]

C              0.100000
test_score     0.745814
train_score    0.731772
f1_score       0.705217
Name: 0, dtype: float64

In [36]:
#based on the f1 score, C of 0.1 and sigmoid kernel is best.
#for sigmoid kernel, gamma and coef0 hyperparams are relevant. Lets find the best value

# Load Dataset

In [38]:
arr2= np.load('data_array_non_formatted.npy', allow_pickle= True)
X= arr2[:,0:-1]
Y= arr2[:,-1]

X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.20, random_state=101)

# Finding best gamma

In [39]:
#if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma
#var returns the variance of the array elements, a measure of the spread of a distribution
#if ‘auto’, uses 1 / n_features.
gammaValues = ['scale','auto',0.1,1,5,10]
# the range of gamma values selected represent how they are utilized in the sigmoid function

In [43]:
%%time
test_score = []
train_score = []
f1_scores = []

for currentGamma in gammaValues: 
    sigmoid_model = SVC(C=0.1, kernel='sigmoid', gamma=currentGamma)        
    sigmoid_model.fit(X_train, y_train) 
    test_score.append(sigmoid_model.score (X_test, y_test))
    train_score.append(sigmoid_model.score (X_train, y_train))
    y_pred =sigmoid_model.predict(X_test)
    f1_scores.append(f1_score(y_test,y_pred,average='weighted'))  
df = pd.DataFrame(list(zip(gammaValues,test_score,train_score,f1_scores)),columns = ['Gamma','test_score','train_score','f1_scores'])

CPU times: user 3.92 s, sys: 128 ms, total: 4.04 s
Wall time: 4.04 s


In [45]:
df

Unnamed: 0,Gamma,test_score,train_score,f1_scores
0,scale,0.745814,0.731772,0.705217
1,auto,0.795282,0.791738,0.704595
2,0.1,0.795282,0.791738,0.704595
3,1,0.795282,0.791738,0.704595
4,5,0.795282,0.791738,0.704595
5,10,0.795282,0.791738,0.704595


In [46]:
#best gamma value is scale

In [47]:
X_test.var()

141221476288556.0

# Finding best Coef0

In [51]:
coef0Values = range(-1000,1000,50)
# the range of coef0 values selected represent how they are utilized in the sigmoid function

In [54]:
%%time
test_score = []
train_score = []
f1_scores = []

for currentCoef0Value in coef0Values: 
    sigmoid_model = SVC(C=0.1, kernel='sigmoid', gamma=currentGamma, coef0=currentCoef0Value)        
    sigmoid_model.fit(X_train, y_train) 
    test_score.append(sigmoid_model.score (X_test, y_test))
    train_score.append(sigmoid_model.score (X_train, y_train))
    y_pred =sigmoid_model.predict(X_test)
    f1_scores.append(f1_score(y_test,y_pred,average='weighted'))  
df = pd.DataFrame(list(zip(coef0Values,test_score,train_score,f1_scores)),columns = ['Coef0','test_score','train_score','f1_scores'])

CPU times: user 20.6 s, sys: 760 ms, total: 21.4 s
Wall time: 21.4 s


In [58]:
df # they are all the same wtf

Unnamed: 0,Coef0,test_score,train_score,f1_scores
0,-1000,0.795282,0.791738,0.704595
1,-950,0.795282,0.791738,0.704595
2,-900,0.795282,0.791738,0.704595
3,-850,0.795282,0.791738,0.704595
4,-800,0.795282,0.791738,0.704595
5,-750,0.795282,0.791738,0.704595
6,-700,0.795282,0.791738,0.704595
7,-650,0.795282,0.791738,0.704595
8,-600,0.795282,0.791738,0.704595
9,-550,0.795282,0.791738,0.704595


In [57]:
df.iloc[df['f1_scores'].idxmax()]

Coef0         -1000.000000
test_score        0.795282
train_score       0.791738
f1_scores         0.704595
Name: 0, dtype: float64