In [15]:
from sklearn.pipeline import make_pipeline
#importing necessary libraries
import numpy as np
import pandas as pd
import os
from sklearn import svm
from sklearn.metrics import accuracy_score
import functools
from itertools import product
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

### Set Input Path

In [16]:
# data_path := the path where the test -train datasets are placed
data_path = r'C:\Users\saura\OneDrive\Documents\IIT Hyderabad\Assignments\ML\Assignment 2\Dataset'
os.chdir(data_path)
os.listdir()

['features.test.txt',
 'features.train.txt',
 'gisette.param',
 'gisette_test.data',
 'gisette_train.data',
 'gisette_train.labels',
 'gisette_valid.data',
 'gisette_valid.labels',
 'Linear_all_dataresult.txt',
 'Q5__minmax__result.txt',
 'Q5__None__result.txt']

### Load test train data

In [17]:
#load train data
svm_train = pd.read_csv('features.train.txt', delimiter = " ", header=None)
# there wore some null columns loaded. dropping them in place
svm_train.dropna(axis = 1, inplace = True) 
svm_train.columns = ['label', 'feature1' , 'feature2']# assign column names
svm_train['label']= svm_train['label'].astype(int)
# only keep the records with label 1,5
svm_train = svm_train[svm_train['label'].isin([1,5])] 
# if the original label is 1, class is 1, if  5--> class -1
svm_train['class'] = np.where(svm_train['label'] ==1, 1, -1) 
print(svm_train['class'].value_counts()) # get class wise count
svm_train.head()

 1    1005
-1     556
Name: class, dtype: int64


Unnamed: 0,label,feature1,feature2,class
1,5,0.444131,-5.496812,-1
7,1,0.123043,-0.707875,1
9,1,0.113859,-0.931375,1
12,1,0.115371,-0.386,1
13,1,0.102281,-0.378812,1


In [18]:
# Train data class distribution ->  1 : 1005, -1 : 556

In [19]:
#Load test data
svm_test = pd.read_csv('features.test.txt', delimiter = " ", header=None)
# there wore some null columns loaded. dropping them in place
svm_test.dropna(axis = 1, inplace = True)
svm_test.columns = ['label', 'feature1' , 'feature2']# assign column names
svm_test['label']= svm_test['label'].astype(int)
# only keep the records with label 1,5
svm_test = svm_test[svm_test['label'].isin([1,5])]
 # if the original label is 1, class is 1, if  5--> class -1
svm_test['class'] = np.where(svm_test['label'] ==1, 1, -1)
print(svm_test['class'].value_counts())# get class wise count
svm_test.head()

 1    264
-1    160
Name: class, dtype: int64


Unnamed: 0,label,feature1,feature2,class
16,1,0.159934,-1.742,1
32,5,0.32484,-6.360563,-1
36,5,0.25118,-4.389375,-1
42,1,0.124609,-0.474438,1
45,1,0.219117,-0.9045,1


In [20]:
# Test data class distribution ->  1 : 264, -1 : 160

In [21]:
# drop the original labels , keep only +1, -1 derived classes
svm_test.drop(['label'], axis =1,  inplace = True)
svm_train.drop(['label'], axis =1, inplace = True)
svm_train.head()

Unnamed: 0,feature1,feature2,class
1,0.444131,-5.496812,-1
7,0.123043,-0.707875,1
9,0.113859,-0.931375,1
12,0.115371,-0.386,1
13,0.102281,-0.378812,1


In [22]:
svm_train['class'].values

array([-1,  1,  1, ...,  1,  1,  1])

In [23]:
svm_train.shape#[:-1].shape

(1561, 3)

# Modelling 

### Linear SVM

### Without Scaling

In [41]:
classifier = svm.SVC(kernel='linear')# create SVM classifier object with linear kernel
classifier.fit(svm_train[['feature1','feature2']].values, svm_train['class'].values)
print('the number of support vectors : '+str(classifier.n_support_))# printing the number of support vectors
# predicting for test set
predicted_class = classifier.predict(svm_test[['feature1','feature2']].values) 
# get the accuracy on the test data
print('test accuracy : '+str(accuracy_score(svm_test['class'].values, predicted_class)))
myname = "Q4_Linear_whole_dataset"
f = open(myname+"result.txt", "a")
f.write('Without Scaling the number of support vectors : '+str(classifier.n_support_))
f.write('\n')
f.write('test accuracy : '+str(accuracy_score(svm_test['class'].values, predicted_class)))
f.write('\n')
f.close()

the number of support vectors : [14 14]
test accuracy : 0.9787735849056604


### With Min Max Scaling

In [25]:
scaler = MinMaxScaler()
train = scaler.fit_transform(svm_train[['feature1','feature2']].values)
test = scaler.transform(svm_test[['feature1','feature2']].values)
#NOTE - only need transform the test data, not fit transform - https://stackoverflow.com/questions/43675665/when-scale-the-data-why-the-train-dataset-use-fit-and-transform-but-the-te

In [26]:
classifier = svm.SVC(kernel='linear')# create SVM classifier object with linear kernel
#classifier = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
classifier.fit(train, svm_train['class'].values)
print('the number of support vectors : '+str(classifier.n_support_))# printing the number of support vectors
# predicting for test set
predicted_class = classifier.predict(test) 
# get the accuracy on the test data
print(accuracy_score(svm_test['class'].values, predicted_class))
myname = "Linear_all_data"
f = open(myname+"result.txt", "a")
f.write('With MinMax Scaling the number of support vectors : '+str(classifier.n_support_))
f.write('\n')
f.write('With MinMax test accuracy : '+str(accuracy_score(svm_test['class'].values, predicted_class)))
f.write('\n')
f.close()

the number of support vectors : [44 45]
0.9787735849056604


In [27]:
scaler = StandardScaler()
train_std = scaler.fit_transform(svm_train[['feature1','feature2']].values)
test_std = scaler.transform(svm_test[['feature1','feature2']].values)

In [28]:
classifier = svm.SVC(kernel='linear')# create SVM classifier object with linear kernel
#classifier = make_pipeline(StandardScaler(), svm.SVC(kernel='linear'))
classifier.fit(train_std, svm_train['class'].values)
print('the number of support vectors : '+str(classifier.n_support_))# printing the number of support vectors
# predicting for test set
predicted_class = classifier.predict(test_std) 
# get the accuracy on the test data
print(accuracy_score(svm_test['class'].values, predicted_class))
myname = "Linear_all_data"
f = open(myname+"result.txt", "a")
f.write('With Std Scaling the number of support vectors : '+str(classifier.n_support_))
f.write('\n')
f.write('With Std test accuracy : '+str(accuracy_score(svm_test['class'].values, predicted_class)))
f.write('\n')
f.close()

the number of support vectors : [18 18]
0.9811320754716981


In [29]:
svm_train.shape

(1561, 3)

In [30]:
svm_train[['feature1','feature2']].head(50).shape

(50, 2)

In [31]:
svm_train.shape

(1561, 3)

### Linear SVM different train data size

### Linear SVM with Scaling 

In [32]:
#accuracy_list =[]
def check_accuracy_for_given_traindata(train_size, scaling =None):
    """
    Checks for the accuracy given a train data size
    ---
    Input Parameters :
    ---
    train_size : int
        the no of rows of train data to consider (from the start i.e. first n rows)
    Returns :
    ------
    accuracy : float
        accuracy of the classifier on the test data
    """
    # create SVM classifier object with linear kernel
    classifier = svm.SVC(kernel='linear')
    #next line -- only take the first "train_size" count of rows from the train data for training the SVM
    train_Feature_curr = svm_train[['feature1','feature2']].head(train_size).values
    train_label_curr = svm_train['class'].head(train_size).values
    test_features_curr = svm_test[['feature1','feature2']].values
    #print('scaling' + str(scaling) )
    if scaling =='standard': # 'minmax', 'standard'
        print('using standard scaling')
        scaler = StandardScaler()
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
        
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
        print('using minmax scaling')
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
    classifier.fit(train_Feature_curr, train_label_curr)
    print ("for train data set size : "+ str(train_label_curr.shape))
    print('Total no of support vectors : '+str(len(classifier.support_vectors_)) +' i.e. for each class '+str(classifier.n_support_))
    #predict on the test set
    predicted_class = classifier.predict(test_features_curr)
    accuracy = accuracy_score(svm_test['class'].values, predicted_class)
    print('accuracy for training data size '+str(train_size)+" is :"+str(accuracy))
    #write to text file
    myname = "Q4__Linear_"
    f = open(myname+"result.txt", "a")
    f.write("Linear_Scaling_is : "+str(scaling)+ '_Size_ is : '+str(train_size))
    f.write('\n')
    f.write('the number of support vectors : '+str(classifier.n_support_))
    f.write('\n')
    f.write('test accuracy : '+str(accuracy_score(svm_test['class'].values, predicted_class)))
    f.write ('----------------------------------------------------------------------------')
    f.write('\n')
    f.write('\n')
    
    f.close()
    return accuracy

train_size_list =[50,100,200,800]

accuracy_list = list(map(functools.partial(check_accuracy_for_given_traindata),train_size_list))

for train data set size : (50,)
Total no of support vectors : 2 i.e. for each class [1 1]
accuracy for training data size 50 is :0.9811320754716981
for train data set size : (100,)
Total no of support vectors : 4 i.e. for each class [2 2]
accuracy for training data size 100 is :0.9811320754716981
for train data set size : (200,)
Total no of support vectors : 8 i.e. for each class [4 4]
accuracy for training data size 200 is :0.9811320754716981
for train data set size : (800,)
Total no of support vectors : 14 i.e. for each class [7 7]
accuracy for training data size 800 is :0.9811320754716981


In [33]:
#scaling : # 'minmax', 'standard'
scaling = 'minmax'
accuracy_list = list(map(functools.partial(check_accuracy_for_given_traindata, scaling = scaling),train_size_list))

using minmax scaling
for train data set size : (50,)
Total no of support vectors : 12 i.e. for each class [6 6]
accuracy for training data size 50 is :0.9764150943396226
using minmax scaling
for train data set size : (100,)
Total no of support vectors : 20 i.e. for each class [10 10]
accuracy for training data size 100 is :0.9811320754716981
using minmax scaling
for train data set size : (200,)
Total no of support vectors : 33 i.e. for each class [16 17]
accuracy for training data size 200 is :0.9811320754716981
using minmax scaling
for train data set size : (800,)
Total no of support vectors : 61 i.e. for each class [30 31]
accuracy for training data size 800 is :0.9811320754716981


In [34]:
scaling = 'standard'
accuracy_list = list(map(functools.partial(check_accuracy_for_given_traindata,  scaling = scaling),train_size_list))

using standard scaling
for train data set size : (50,)
Total no of support vectors : 4 i.e. for each class [2 2]
accuracy for training data size 50 is :0.9716981132075472
using standard scaling
for train data set size : (100,)
Total no of support vectors : 6 i.e. for each class [3 3]
accuracy for training data size 100 is :0.9811320754716981
using standard scaling
for train data set size : (200,)
Total no of support vectors : 11 i.e. for each class [5 6]
accuracy for training data size 200 is :0.9811320754716981
using standard scaling
for train data set size : (800,)
Total no of support vectors : 20 i.e. for each class [10 10]
accuracy for training data size 800 is :0.9811320754716981


### polynomial kernel

In [51]:
def check_accuracy_for_poly(params, scaling = None):
    """
    Checks for the accuracy from an SVM with Polynomial kernel
    ---
    Input Parameters :
    ---
    params : list
        a list containing the regulaization parameter & the degree of the polynomial i.e. [C, degree]
    Returns :
    ----
    accuracy : float
        accuracy of the classifier on the test data
    """
    C, degree = params
    # create SVM classifier object with polynomial kernel with degree = 'degree' parameter & regularization param C
    classifier = svm.SVC(kernel='poly', gamma =1, C =C, coef0 = 1, degree = degree)
    #fit the classifier on train data
    train_Feature_curr = svm_train[['feature1','feature2']].values
    test_features_curr = svm_test[['feature1','feature2']].values
    if scaling =='standard': # 'minmax', 'standard'
        print('using standard scaling')
        scaler = StandardScaler()
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
        
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
        print('using minmax scaling')
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
    classifier.fit(train_Feature_curr, svm_train['class'].values)
    print('Total no of support vectors : '+str(len(classifier.support_vectors_)) +' i.e. for each class '+str(classifier.n_support_))
    #predict on the test data
    predicted_class = classifier.predict(test_features_curr)
    train_predicted_class = classifier.predict(train_Feature_curr)
    # get the accuracy on the test data
    train_accuracy = accuracy_score(svm_train['class'].values, train_predicted_class) 
    print('Train accuracy for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(train_accuracy))
    accuracy = accuracy_score(svm_test['class'].values, predicted_class) 
    print('Test accuracy for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(accuracy))
    print('Training error for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(1-train_accuracy))
    print('Test error for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(1-accuracy))
    #write to text file
    myname = "Q4__Polynomial"
    f = open(myname+"result.txt", "a")
    f.write("Polynomial_Scaling_"+str(scaling)+ '_C_'+str(C)+ '_degree_'+str(degree))
    f.write('\n')
    f.write('the number of support vectors : '+str(classifier.n_support_))
    f.write('\n')
    f.write('Training error for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(1-train_accuracy))
    f.write('\n')
    f.write('Test error for C ='+str(C)+" and degree ="+str(degree)+" is :"+str(1-accuracy))
    f.write ('----------------------------------------------------------------------------')
    f.write('\n')
    f.write('\n')
    
    f.close()
    return accuracy
C_list = [1,.01,.001,.0001] # regularization parameter list 
degree_list = [2,5] # degree of the polynomial kernel
# iterate through the C_list  & degree list in a vectorized way, iterating through each of the cartesian products of C & degree
poly_accuracy_list = list(map(functools.partial(check_accuracy_for_poly),list(product(C_list, degree_list))))

Total no of support vectors : 24 i.e. for each class [12 12]
Train accuracy for C =1 and degree =2 is :0.9967969250480462
Test accuracy for C =1 and degree =2 is :0.9811320754716981
Training error for C =1 and degree =2 is :0.0032030749519538215
Test error for C =1 and degree =2 is :0.018867924528301883
Total no of support vectors : 21 i.e. for each class [11 10]
Train accuracy for C =1 and degree =5 is :0.9967969250480462
Test accuracy for C =1 and degree =5 is :0.9787735849056604
Training error for C =1 and degree =5 is :0.0032030749519538215
Test error for C =1 and degree =5 is :0.021226415094339646
Total no of support vectors : 34 i.e. for each class [17 17]
Train accuracy for C =0.01 and degree =2 is :0.9955156950672646
Test accuracy for C =0.01 and degree =2 is :0.9811320754716981
Training error for C =0.01 and degree =2 is :0.004484304932735439
Test error for C =0.01 and degree =2 is :0.018867924528301883
Total no of support vectors : 23 i.e. for each class [12 11]
Train accurac

In [52]:
scaling = 'standard'
poly_accuracy_list = list(map(functools.partial(check_accuracy_for_poly, scaling = scaling),list(product(C_list, degree_list))))

using standard scaling
Total no of support vectors : 29 i.e. for each class [14 15]
Train accuracy for C =1 and degree =2 is :0.9967969250480462
Test accuracy for C =1 and degree =2 is :0.9787735849056604
Training error for C =1 and degree =2 is :0.0032030749519538215
Test error for C =1 and degree =2 is :0.021226415094339646
using standard scaling
Total no of support vectors : 22 i.e. for each class [11 11]
Train accuracy for C =1 and degree =5 is :0.9974375400384369
Test accuracy for C =1 and degree =5 is :0.9740566037735849
Training error for C =1 and degree =5 is :0.002562459961563124
Test error for C =1 and degree =5 is :0.02594339622641506
using standard scaling
Total no of support vectors : 166 i.e. for each class [83 83]
Train accuracy for C =0.01 and degree =2 is :0.9935938500960922
Test accuracy for C =0.01 and degree =2 is :0.9811320754716981
Training error for C =0.01 and degree =2 is :0.006406149903907754
Test error for C =0.01 and degree =2 is :0.018867924528301883
using 

In [53]:
scaling = 'minmax'
poly_accuracy_list = list(map(functools.partial(check_accuracy_for_poly, scaling = scaling),list(product(C_list, degree_list))))

using minmax scaling
Total no of support vectors : 50 i.e. for each class [25 25]
Train accuracy for C =1 and degree =2 is :0.9961563100576554
Test accuracy for C =1 and degree =2 is :0.9787735849056604
Training error for C =1 and degree =2 is :0.0038436899423446302
Test error for C =1 and degree =2 is :0.021226415094339646
using minmax scaling
Total no of support vectors : 26 i.e. for each class [13 13]
Train accuracy for C =1 and degree =5 is :0.9967969250480462
Test accuracy for C =1 and degree =5 is :0.9764150943396226
Training error for C =1 and degree =5 is :0.0032030749519538215
Test error for C =1 and degree =5 is :0.02358490566037741
using minmax scaling
Total no of support vectors : 498 i.e. for each class [249 249]
Train accuracy for C =0.01 and degree =2 is :0.9929532351057014
Test accuracy for C =0.01 and degree =2 is :0.9811320754716981
Training error for C =0.01 and degree =2 is :0.007046764894298563
Test error for C =0.01 and degree =2 is :0.018867924528301883
using min

### rbf kernel

In [48]:
def check_accuracy_for_rbf(C, scaling =None):
    """
    Checks for the accuracy from an SVM with RBF kernel
    ---
    Input Parameters :
    ---
    C : float
        the regularization parameter
    Returns :
    ----
    accuracy : float
        accuracy of the classifier on the test data
    """
    # create a SVM classifier object with RBF kernel & regularization param =C
    classifier = svm.SVC(kernel='rbf', C =C, gamma = 1)# setting gamma = 1 as per the formula in the assignment 
    #fit the classifier on train data
    train_Feature_curr = svm_train[['feature1','feature2']].values
    test_features_curr = svm_test[['feature1','feature2']].values
    if scaling =='standard': # 'minmax', 'standard'
        print('using standard scaling')
        scaler = StandardScaler()
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
        
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
        print('using minmax scaling')
        train_Feature_curr = scaler.fit_transform(train_Feature_curr)
        test_features_curr = scaler.transform(test_features_curr)
    classifier.fit(svm_train[['feature1','feature2']].values, svm_train['class'].values)
    print('Total no of support vectors : '+str(len(classifier.support_vectors_)) +' i.e. for each class '+str(classifier.n_support_))
    #predict on the test data
    predicted_class = classifier.predict(svm_test[['feature1','feature2']].values)
    train_predicted_class = classifier.predict(svm_train[['feature1','feature2']].values)
    # get the accuracy on the test data
    train_accuracy = accuracy_score(svm_train['class'].values, train_predicted_class)
    print('accuracy for C ='+str(C)+" is :"+str(train_accuracy))
    accuracy = accuracy_score(svm_test['class'].values, predicted_class)
    print('accuracy for C ='+str(C)+" is :"+str(accuracy))
    print('Training error for C ='+str(C)+" is :"+str(1-train_accuracy))
    print('Test error for C ='+str(C)+" is :"+str(1-accuracy))
    #write to text file
    myname = "Q4__RBF__"
    f = open(myname+"result.txt", "a")
    f.write("RBF_Scaling_is : "+str(scaling)+ '_C_is : '+str(C))
    f.write('\n')

    f.write('Training error for C ='+str(C)+" is :"+str(1-train_accuracy))
    f.write('\n')
    f.write('Test error for C ='+str(C)+'is : '+str(1-accuracy))
    f.write ('----------------------------------------------------------------------------')
    f.write('\n')
    f.write('\n')
    return accuracy
C_list = [.001,1, 100, 10**4, 10**6] # regularization parameter list 
# iterate through the C_list  & degree list in a vectorized way
rbf_accuracy_list = list(map(functools.partial(check_accuracy_for_rbf),C_list))

Total no of support vectors : 1112 i.e. for each class [556 556]
accuracy for C =0.001 is :0.643818065342729
accuracy for C =0.001 is :0.6226415094339622
Training error for C =0.001 is :0.356181934657271
Test error for C =0.001 is :0.37735849056603776
Total no of support vectors : 31 i.e. for each class [17 14]
accuracy for C =1 is :0.9955156950672646
accuracy for C =1 is :0.9787735849056604
Training error for C =1 is :0.004484304932735439
Test error for C =1 is :0.021226415094339646
Total no of support vectors : 22 i.e. for each class [14  8]
accuracy for C =100 is :0.9967969250480462
accuracy for C =100 is :0.9811320754716981
Training error for C =100 is :0.0032030749519538215
Test error for C =100 is :0.018867924528301883
Total no of support vectors : 19 i.e. for each class [12  7]
accuracy for C =10000 is :0.9974375400384369
accuracy for C =10000 is :0.9764150943396226
Training error for C =10000 is :0.002562459961563124
Test error for C =10000 is :0.02358490566037741
Total no of s

In [49]:
scaling = 'standard'
rbf_accuracy_list = list(map(functools.partial(check_accuracy_for_rbf, scaling = scaling),C_list))

using standard scaling
Total no of support vectors : 1112 i.e. for each class [556 556]
accuracy for C =0.001 is :0.643818065342729
accuracy for C =0.001 is :0.6226415094339622
Training error for C =0.001 is :0.356181934657271
Test error for C =0.001 is :0.37735849056603776
using standard scaling
Total no of support vectors : 31 i.e. for each class [17 14]
accuracy for C =1 is :0.9955156950672646
accuracy for C =1 is :0.9787735849056604
Training error for C =1 is :0.004484304932735439
Test error for C =1 is :0.021226415094339646
using standard scaling
Total no of support vectors : 22 i.e. for each class [14  8]
accuracy for C =100 is :0.9967969250480462
accuracy for C =100 is :0.9811320754716981
Training error for C =100 is :0.0032030749519538215
Test error for C =100 is :0.018867924528301883
using standard scaling
Total no of support vectors : 19 i.e. for each class [12  7]
accuracy for C =10000 is :0.9974375400384369
accuracy for C =10000 is :0.9764150943396226
Training error for C =

In [50]:
scaling = 'minmax'
rbf_accuracy_list = list(map(functools.partial(check_accuracy_for_rbf, scaling= scaling),C_list))

using minmax scaling
Total no of support vectors : 1112 i.e. for each class [556 556]
accuracy for C =0.001 is :0.643818065342729
accuracy for C =0.001 is :0.6226415094339622
Training error for C =0.001 is :0.356181934657271
Test error for C =0.001 is :0.37735849056603776
using minmax scaling
Total no of support vectors : 31 i.e. for each class [17 14]
accuracy for C =1 is :0.9955156950672646
accuracy for C =1 is :0.9787735849056604
Training error for C =1 is :0.004484304932735439
Test error for C =1 is :0.021226415094339646
using minmax scaling
Total no of support vectors : 22 i.e. for each class [14  8]
accuracy for C =100 is :0.9967969250480462
accuracy for C =100 is :0.9811320754716981
Training error for C =100 is :0.0032030749519538215
Test error for C =100 is :0.018867924528301883
using minmax scaling
Total no of support vectors : 19 i.e. for each class [12  7]
accuracy for C =10000 is :0.9974375400384369
accuracy for C =10000 is :0.9764150943396226
Training error for C =10000 is