In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import metrics

In [2]:
def text_to_numpy(path):
    """Read a whitespace-delimited numeric text file into a NumPy array.

    Each non-blank line becomes one row; every whitespace-separated token on
    the line is converted with ``float``.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    numpy.ndarray
        Array built from the parsed rows (2-D when every row has the same
        number of values; an empty 1-D array for a file with no data lines).

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token cannot be parsed as a float.
    """
    rows = []
    with open(path) as file:
        # Iterate the handle directly so the whole file is not held in memory.
        for line in file:
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines (e.g. a trailing newline at the
                # end of the file) would otherwise add an empty row and make
                # the resulting array ragged.
                continue
            rows.append([float(token) for token in tokens])

    return np.array(rows)

In [3]:
def split_label_features(data):
    """Split a 2-D array into its label column and its feature columns.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose first column holds the labels and whose remaining
        columns hold the features.

    Returns
    -------
    tuple
        ``(labels, features)`` where ``labels`` is column 0 (1-D) and
        ``features`` is every column after it (2-D).
    """
    return data[:, 0], data[:, 1:]

In [4]:
def load_data():
    """Load the training and test sets from their text files.

    Reads ``features.train.txt`` and ``features.test.txt`` from the current
    working directory and splits each into labels and features.

    Returns
    -------
    tuple
        ``(train_Y, train_x, test_Y, test_x)`` - note labels come before
        features for each split.
    """
    train_labels, train_features = split_label_features(text_to_numpy("features.train.txt"))
    test_labels, test_features = split_label_features(text_to_numpy("features.test.txt"))
    return train_labels, train_features, test_labels, test_features

In [5]:
def get_data_labels_1_5(Y, x, cl1, cl2):
    """Keep only the samples of two classes and relabel them as +1 / -1.

    Samples whose label equals ``cl1`` are relabelled ``1`` and samples whose
    label equals ``cl2`` are relabelled ``-1``; every other sample is dropped.
    The original sample order is preserved and the inputs are not modified.

    Parameters
    ----------
    Y : array-like
        Class label per sample (flattened to 1-D before use).
    x : array-like
        Feature matrix with one row per sample; any number of feature
        columns is supported.
    cl1 : scalar
        Label of the class mapped to ``+1``.
    cl2 : scalar
        Label of the class mapped to ``-1``.

    Returns
    -------
    tuple
        ``(new_Y, new_x)`` - the relabelled 1-D label array (same dtype as
        ``Y``) and the matching feature rows.
    """
    labels = np.asarray(Y).reshape(-1)
    features = np.asarray(x)

    # Boolean mask of the rows to keep; masking preserves the original order.
    keep = (labels == cl1) | (labels == cl2)
    kept_labels = labels[keep]

    # Decide every new label from the ORIGINAL labels in one step. Relabelling
    # in place, one class after the other, goes wrong when cl2 == 1 or
    # cl1 == -1: the freshly written +1s would be matched again and flipped.
    new_Y = np.where(kept_labels == cl1, 1, -1).astype(labels.dtype)

    # Row selection keeps the column count as-is (no hard-coded feature width).
    new_x = features[keep]

    return new_Y, new_x
    

In [6]:
def get_train_test_features():
    """Load the data and reduce it to the binary 1-versus-5 problem.

    Both splits are filtered to the samples labelled 1 or 5, with class 1
    relabelled ``+1`` and class 5 relabelled ``-1``.

    Returns
    -------
    tuple
        ``(train_x, train_Y, test_x, test_Y)`` - note that, unlike
        ``load_data``, features come before labels here.
    """
    raw_train_Y, raw_train_x, raw_test_Y, raw_test_x = load_data()

    train_labels, train_features = get_data_labels_1_5(raw_train_Y, raw_train_x, 1, 5)
    test_labels, test_features = get_data_labels_1_5(raw_test_Y, raw_test_x, 1, 5)

    return train_features, train_labels, test_features, test_labels

In [7]:
def NuSVM_on_limited_samples(samples, train_x, train_Y, test_x, test_Y):
    """Train a linear Nu-SVM on the first ``samples`` training rows.

    Parameters
    ----------
    samples : int
        Number of leading training rows to use; ``0`` means use the whole
        training set.
    train_x, train_Y : numpy.ndarray
        Training features and labels.
    test_x, test_Y : numpy.ndarray
        Test features and labels used for evaluation.

    Returns
    -------
    tuple
        ``(no_suppVec, acc)`` as returned by ``classify_SVM_OvO_linear``.

    Notes
    -----
    ``nu`` is 0.5 unless ``samples`` is one of the preset sizes
    (50 -> 0.1, 100 -> 0.2, 200 -> 0.3, 800 -> 0.4).
    """
    default_nu = 0.5

    # Guard clause: zero requests the complete training set with the default nu.
    if samples == 0:
        support_shape, accuracy = classify_SVM_OvO_linear(
            train_x, train_Y, test_x, test_Y, default_nu
        )
        return support_shape, accuracy

    subset_x = train_x[:samples]
    subset_Y = train_Y[:samples]

    # Preset nu per sample count; any other count falls back to the default.
    nu_by_sample_count = {50: 0.1, 100: 0.2, 200: 0.3, 800: 0.4}
    chosen_nu = nu_by_sample_count.get(samples, default_nu)

    support_shape, accuracy = classify_SVM_OvO_linear(
        subset_x, subset_Y, test_x, test_Y, chosen_nu
    )
    return support_shape, accuracy

In [8]:
def classify_SVM_OvO_linear(train_x, train_Y, test_x, test_Y, nu_):
    """Fit and evaluate a linear-kernel ``NuSVC`` (one-vs-one decision shape).

    Parameters
    ----------
    train_x, train_Y : numpy.ndarray
        Training features and labels.
    test_x, test_Y : numpy.ndarray
        Test features and labels.
    nu_ : float
        Value passed to ``NuSVC`` as ``nu``.

    Returns
    -------
    tuple
        ``(no_suppVec, acc)`` - the support-vector shape tuple and the test
        accuracy reported by ``classify``; the two error rates are discarded.
    """
    nu_svc = svm.NuSVC(kernel='linear', nu=nu_, decision_function_shape='ovo')
    _train_err, _test_err, support_shape, accuracy = classify(
        nu_svc, train_x, train_Y, test_x, test_Y
    )
    return support_shape, accuracy

In [9]:
def classify_SVM_OvO_polynomial(train_x, train_Y, test_x, test_Y, q, c):
    """Fit and evaluate a polynomial-kernel ``SVC`` (one-vs-one decision shape).

    Parameters
    ----------
    train_x, train_Y : numpy.ndarray
        Training features and labels.
    test_x, test_Y : numpy.ndarray
        Test features and labels.
    q : int
        Polynomial degree passed to ``SVC`` as ``degree``.
    c : float
        Regularisation parameter passed to ``SVC`` as ``C``.

    Returns
    -------
    tuple
        ``(train_err, test_err, no_suppVec, acc)`` exactly as produced by
        ``classify``. The kernel coefficient is fixed at ``gamma='scale'``.
    """
    poly_svc = svm.SVC(
        kernel='poly',
        degree=q,
        C=c,
        gamma='scale',
        decision_function_shape='ovo',
    )
    train_error, test_error, support_shape, accuracy = classify(
        poly_svc, train_x, train_Y, test_x, test_Y
    )
    return train_error, test_error, support_shape, accuracy

In [10]:
def classify_SVM_rbf(train_X, train_Y, test_X, test_Y, c):
    """Fit and evaluate an RBF-kernel ``SVC``.

    Parameters
    ----------
    train_X, train_Y : numpy.ndarray
        Training features and labels.
    test_X, test_Y : numpy.ndarray
        Test features and labels.
    c : float
        Regularisation parameter passed to ``SVC`` as ``C``.

    Returns
    -------
    tuple
        ``(train_err, test_err)`` as computed by ``classify``. The kernel
        coefficient is fixed at ``gamma='scale'``.
    """
    classifier = svm.SVC(C=c, kernel='rbf', gamma='scale')
    # Use the feature matrices passed in (train_X / test_X). Referring to the
    # lower-case names here would silently pick up notebook globals and ignore
    # the arguments.
    train_err, test_err, _, _ = classify(classifier, train_X, train_Y, test_X, test_Y)
    return train_err, test_err

In [11]:
def classify(clf, train_x, train_Y, test_x, test_Y):
    """Fit ``clf`` on the training data and report errors on both splits.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn style classifier exposing ``fit``, ``predict``,
        ``score`` and, after fitting, a ``support_`` attribute.
    train_x, train_Y : numpy.ndarray
        Training features and labels (labels are flattened to 1-D).
    test_x, test_Y : numpy.ndarray
        Test features and labels (labels are flattened to 1-D).

    Returns
    -------
    tuple
        ``(train_err, test_err, no_suppVec, acc)`` where

        * ``train_err`` is ``1 - clf.score`` on the training data,
        * ``test_err`` is ``1 - acc``,
        * ``no_suppVec`` is the *shape tuple* of ``clf.support_`` - callers
          read the count with ``no_suppVec[0]``,
        * ``acc`` is the accuracy of the test predictions.
    """
    # Labels may arrive as column vectors; flatten them before fitting/scoring.
    train_Y = train_Y.reshape(-1)
    test_Y = test_Y.reshape(-1)

    clf.fit(train_x, train_Y)

    train_err = 1 - clf.score(train_x, train_Y)

    # Predict the test set once and derive both metrics from that single pass.
    # For a scikit-learn classifier, score() is the mean accuracy of predict(),
    # so calling score(test_x, test_Y) would just repeat the same inference.
    pred = clf.predict(test_x)
    acc = metrics.accuracy_score(test_Y, pred)
    test_err = 1 - acc

    # Kept as a shape tuple (not an int) for backward compatibility.
    no_suppVec = np.shape(clf.support_)

    return train_err, test_err, no_suppVec, acc

In [12]:
"""Question 4.a"""
train_x, train_Y, test_x, test_Y = get_train_test_features()
# samples=0 selects the complete 1-vs-5 training set (nu stays at its default).
no_suppVec, acc = NuSVM_on_limited_samples(0, train_x, train_Y, test_x, test_Y)
# Emit the whole report in one call; sep="\n" puts each piece on its own line.
print(
    "===========================================================",
    "Solution - Question 4.a",
    "-----------------------------------------------------------",
    "#Support Vectors - %-4d Accuracy - %-7f" % (no_suppVec[0], acc),
    "===========================================================",
    sep="\n",
)

Solution - Question 4.a
-----------------------------------------------------------
#Support Vectors - 782  Accuracy - 0.983491


In [13]:
"""Question 4.b"""
samples_list = [50, 100, 200, 800]

train_x, train_Y, test_x, test_Y = get_train_test_features()
# Labels are stored as column vectors from here on; classify() flattens them
# again before fitting.
train_Y, test_Y = train_Y.reshape(-1, 1), test_Y.reshape(-1, 1)
print(
    "===========================================================",
    "Solution - Question 4.b",
    "-----------------------------------------------------------",
    sep="\n",
)
# One report line per training-subset size.
for n in samples_list:
    no_suppVec, acc = NuSVM_on_limited_samples(n, train_x, train_Y, test_x, test_Y)
    report_values = (n, no_suppVec[0], acc)
    print("#Samples - %4d #Support Vectors - %4d Accuracy - %7f" % report_values)
print("===========================================================")

Solution - Question 4.b
-----------------------------------------------------------
#Samples -   50 #Support Vectors -    6 Accuracy - 0.981132
#Samples -  100 #Support Vectors -   20 Accuracy - 0.981132
#Samples -  200 #Support Vectors -   60 Accuracy - 0.981132
#Samples -  800 #Support Vectors -  320 Accuracy - 0.981132


In [14]:
"""Question 4.c"""
Q = [2, 5]
C = [0.0001, 0.001, 0.01, 1]
print(
    "===========================================================",
    "Solution - Question 4.c",
    "-----------------------------------------------------------",
    sep="\n",
)
# One table per polynomial degree, one row per regularisation value.
for q in Q:
    print(
        "%11s|%-10s|%-10s|%-10s|%-9s|" % (" ", "Train_Err", "Test_Err", "#Supp_Vec", "Accuracy"),
        "-----------------------------------------------------------",
        "Q = %-2d" % (q),
        "-----------------------------------------------------------",
        sep="\n",
    )
    for c in C:
        train_err, test_err, no_suppVec, acc = classify_SVM_OvO_polynomial(
            train_x, train_Y, test_x, test_Y, q, c
        )
        row_values = (c, train_err, test_err, no_suppVec[0], acc)
        print("C = %-3.4f | %8f | %8f | %8d | %8f|" % row_values)
    print("-----------------------------------------------------------")

print("===========================================================")

Solution - Question 4.c
-----------------------------------------------------------
           |Train_Err |Test_Err  |#Supp_Vec |Accuracy |
-----------------------------------------------------------
Q = 2 
-----------------------------------------------------------
C = 0.0001 | 0.340807 | 0.346698 |     1112 | 0.653302|
C = 0.0010 | 0.024984 | 0.035377 |      558 | 0.964623|
C = 0.0100 | 0.008328 | 0.021226 |      164 | 0.978774|
C = 1.0000 | 0.004484 | 0.018868 |       30 | 0.981132|
-----------------------------------------------------------
           |Train_Err |Test_Err  |#Supp_Vec |Accuracy |
-----------------------------------------------------------
Q = 5 
-----------------------------------------------------------
C = 0.0001 | 0.051890 | 0.075472 |      374 | 0.924528|
C = 0.0010 | 0.021140 | 0.030660 |      158 | 0.969340|
C = 0.0100 | 0.008328 | 0.021226 |       68 | 0.978774|
C = 1.0000 | 0.004484 | 0.016509 |       26 | 0.983491|
------------------------------------------

Comparing the results in the table above, we mark each statement as TRUE or FALSE.
NOTE: These results assume the kernel coefficient gamma = 'scale'.

4.c) 

i. When C = 0.0001, training error is higher at Q = 5

Ans: FALSE

ii. When C = 0.001, the number of support vectors is lower at Q = 5

Ans: TRUE

iii. When C = 0.01, training error is higher at Q = 5

Ans: FALSE

iv. When C = 1, test error is lower at Q = 5

Ans: TRUE

In [15]:
"""Question 4.d"""
print(
    "=====================================",
    "Solution - Question 4.d",
    "-------------------------------------",
    "%14s|%-10s|%-10s|" % (" ", "Train_Err", "Test_Err"),
    "-------------------------------------",
    sep="\n",
)
# Sweep C over 10^-2, 10^0, 10^2, 10^4 and 10^6.
for i in range(-2, 8, 2):
    c = 10 ** i
    train_err, test_err = classify_SVM_rbf(train_x, train_Y, test_x, test_Y, c)
    print("C = %-10.2f|%10f|%10f|" % (c, train_err, test_err))
print("======================================")

Solution - Question 4.d
-------------------------------------
              |Train_Err |Test_Err  |
-------------------------------------
C = 0.01      |  0.005125|  0.016509|
C = 1.00      |  0.004484|  0.021226|
C = 100.00    |  0.003203|  0.018868|
C = 10000.00  |  0.002562|  0.018868|
C = 1000000.00|  0.002562|  0.023585|


We can see from the results above that:

C = 10000.00 or 1000000.00 results in lowest training error value of 0.002562.

C = 0.01 results in lowest test error value of 0.016509.

The training and test errors for every value of C are shown in the table above.
