In [4]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import metrics

In [5]:
def text_to_numpy(path):
    """Parse a whitespace-delimited numeric text file into a NumPy array.

    Each non-blank line becomes one row; every whitespace-separated token on
    the line is converted with ``float``.

    Parameters
    ----------
    path : str or path-like
        File to read.

    Returns
    -------
    numpy.ndarray
        Float array with one row per non-blank line (2-D when every row has
        the same number of tokens). An empty file yields an empty 1-D array.

    Raises
    ------
    ValueError
        If a token cannot be parsed as a float.
    """
    with open(path) as file:
        # Iterate the handle directly instead of materialising readlines().
        # str.split() with no argument already discards the newline and any
        # surrounding whitespace. Blank lines are skipped so that a trailing
        # empty line does not add a zero-length row and make the array ragged.
        rows = [
            list(map(float, tokens))
            for tokens in map(str.split, file)
            if tokens
        ]

    return np.array(rows)

In [6]:
def load_data(data_dir=""):
    """Load the Gisette training and validation splits as DataFrames.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory containing the four ``gisette_*`` files. The default (empty
        string) resolves them relative to the current working directory,
        which is what the no-argument call has always done.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_x, train_Y, test_x, test_Y)``. Missing values in the two
        feature frames are replaced with 0; the label frames are returned as
        parsed.
    """
    import os  # local import: only needed here to build the file paths

    def _load(name, fill_missing=False):
        # Read one file into a DataFrame, optionally replacing NaNs with 0.
        frame = pd.DataFrame(text_to_numpy(os.path.join(data_dir, name)))
        return frame.fillna(0) if fill_missing else frame

    train_x = _load("gisette_train.data", fill_missing=True)
    train_Y = _load("gisette_train.labels")

    test_x = _load("gisette_valid.data", fill_missing=True)
    test_Y = _load("gisette_valid.labels")

    return train_x, train_Y, test_x, test_Y

In [1]:
def classify(train_x, train_Y, test_x, test_Y, k, d = 3, coef = 0.0, g = 'scale'):
    """Build an SVC with the requested kernel settings and evaluate it.

    ``k`` is passed as the kernel, ``d`` as the degree, ``coef`` as ``coef0``
    and ``g`` as gamma. The classifier is handed to ``getMetrics`` together
    with both data splits.

    Returns the ``(train_err, test_err, no_suppVec, acc)`` tuple produced by
    ``getMetrics``.
    """
    model = svm.SVC(
        kernel=k,
        degree=d,
        coef0=coef,
        gamma=g,
        decision_function_shape='ovo',
    )
    # getMetrics fits the model and returns the four-element result tuple.
    return getMetrics(model, train_x, train_Y, test_x, test_Y)

In [7]:
def getMetrics(clf, train_x, train_Y, test_x, test_Y):
    """Fit ``clf`` on the training split and report error/accuracy figures.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``predict``, ``score`` and, after fitting,
        a ``support_`` attribute (e.g. ``svm.SVC``).
    train_x, test_x : array-like
        Feature matrices for the two splits.
    train_Y, test_Y : array-like
        Labels for the two splits: a single-column DataFrame, a Series or an
        array. They are flattened to 1-D before use.

    Returns
    -------
    tuple
        ``(train_err, test_err, no_suppVec, acc)``. ``no_suppVec`` is the
        shape tuple of ``clf.support_`` (callers index ``[0]`` for the count)
        and ``test_err`` equals ``1 - acc``.
    """
    # np.asarray accepts DataFrames as well as plain arrays or lists, so
    # callers are not forced to wrap their labels in a DataFrame.
    train_Y = np.asarray(train_Y).reshape(-1)
    test_Y = np.asarray(test_Y).reshape(-1)

    clf.fit(train_x, train_Y)

    train_err = 1 - clf.score(train_x, train_Y)

    # Predict the test split once and derive both test figures from that
    # single prediction; additionally calling clf.score(test_x, ...) would
    # run the (expensive) prediction a second time for the same number.
    pred = clf.predict(test_x)
    acc = metrics.accuracy_score(test_Y, pred)
    test_err = 1 - acc

    no_suppVec = np.shape(clf.support_)

    return train_err, test_err, no_suppVec, acc

In [8]:
# Load the Gisette training and validation splits once; the experiment
# cells below reuse these four notebook-level variables.
train_x, train_Y, test_x, test_Y = load_data()

In [9]:
def printMetrics(k, train_err, test_err, no_suppVec, acc):
    """Print a one-row summary table for a trained classifier.

    Parameters
    ----------
    k : str
        Kernel name shown in the heading line.
    train_err, test_err, acc : float
        Training error, test error and accuracy, printed with ``%f``.
    no_suppVec : int
        Number of support vectors, printed with ``%d`` (pass the count
        itself, not a shape tuple).
    """
    # Every column is 10 characters wide in both the header and the value
    # row so the '|' separators line up; the rules are sized to match.
    header = "%10s|%10s|%10s|%10s|" % ("Train_Err", "Test_Err", "#Supp_Vec", "Accuracy")
    row = "%10f|%10f|%10d|%10f|" % (train_err, test_err, no_suppVec, acc)
    width = len(header)

    print("Kernel = ", k)
    print("=" * width)
    print(header)
    print("-" * width)
    print(row)
    print("=" * width)

In [10]:
# Linear-kernel SVM; degree, coef0 and gamma keep classify()'s defaults.
train_err, test_err, no_suppVec, acc = classify(train_x, train_Y, test_x, test_Y, 'linear')
# no_suppVec is a shape tuple (from np.shape in getMetrics), so take element 0
# to get the integer count that printMetrics formats with %d.
printMetrics('linear', train_err, test_err, no_suppVec[0], acc)

Kernel =  linear
 Train_Err|  Test_Err| #Supp_Vec| Accuracy|
----------------------------------------------
  0.000000|  0.024000|      1084|  0.976000|


In [11]:
# Polynomial-kernel SVM with degree 2 and coef0 = 1; gamma keeps classify()'s default.
train_err, test_err, no_suppVec, acc = classify(train_x, train_Y, test_x, test_Y, 'poly', d = 2, coef = 1)
# no_suppVec is a shape tuple (from np.shape in getMetrics), so take element 0
# to get the integer count that printMetrics formats with %d.
printMetrics('poly', train_err, test_err, no_suppVec[0], acc)

Kernel =  poly
 Train_Err|  Test_Err| #Supp_Vec| Accuracy|
----------------------------------------------
  0.000500|  0.020000|      1332|  0.980000|


In [12]:
# RBF-kernel SVM with gamma fixed at 0.001 instead of classify()'s default 'scale'.
# NOTE(review): the recorded run for this cell reports 6000 support vectors and
# a test error of 0.5 — presumably gamma is too large for the unscaled
# features; confirm, and consider scaling the inputs or using gamma='scale'.
train_err, test_err, no_suppVec, acc = classify(train_x, train_Y, test_x, test_Y, 'rbf', g = 0.001)
# no_suppVec is a shape tuple (from np.shape in getMetrics), so take element 0
# to get the integer count that printMetrics formats with %d.
printMetrics('rbf', train_err, test_err, no_suppVec[0], acc)

Kernel =  rbf
 Train_Err|  Test_Err| #Supp_Vec| Accuracy|
----------------------------------------------
  0.000000|  0.500000|      6000|  0.500000|


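The RBF kernel yields a lower training error (0.0000) than the polynomial kernel (0.0005). However, with gamma = 0.001 it does so by overfitting: it uses 6000 support vectors (versus 1332) and its test error is 0.5000 (versus 0.0200), so the lower training error does not translate into better generalization.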