### Importing necessary libraries

In [1]:
from sklearn.pipeline import make_pipeline
#importing necessary libraries
import numpy as np
import pandas as pd
import os
from sklearn import svm
from sklearn.metrics import accuracy_score
import functools
from itertools import product
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
from io import StringIO

### Set the input path

In [3]:
data_path = r'C:\Users\saura\OneDrive\Documents\IIT Hyderabad\Assignments\ML\Assignment 2\Dataset'
os.chdir(data_path)
os.listdir()

['features.test.txt',
 'features.train.txt',
 'gisette.param',
 'gisette_test.data',
 'gisette_train.data',
 'gisette_train.labels',
 'gisette_valid.data',
 'gisette_valid.labels',
 'Q4__Polynomialresult.txt',
 'Q4__RBF__result.txt']

### load the datasets

In [4]:
train_data = pd.read_csv('gisette_train.data', sep =' ', header =None)
train_data.dropna(axis =1, how ='all', inplace =True)
test_data = pd.read_csv('gisette_valid.data', sep =' ', header =None)
test_data.dropna(axis =1,how ='all', inplace =True)
train_labels = pd.read_csv('gisette_train.labels', sep =' ', header =None)
train_labels.dropna(axis =1,how ='all', inplace =True)
test_labels = pd.read_csv('gisette_valid.labels', sep =' ', header =None)
test_labels.dropna(axis =1,how ='all', inplace =True)
test_labels.head()

Unnamed: 0,0
0,1
1,1
2,-1
3,1
4,1


In [5]:
train_labels.value_counts()

-1    3000
 1    3000
dtype: int64

In [6]:
test_labels.value_counts()

-1    500
 1    500
dtype: int64

In [7]:
train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,550,0,495,0,0,0,0,976,0,0,...,0,0,0,991,991,0,0,0,0,983
1,0,0,0,0,0,0,0,976,0,0,...,0,475,0,991,0,0,991,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,742,0,0,0,0,684,0,956,...,0,0,0,0,0,0,674,0,0,838
4,0,0,0,0,0,0,0,608,0,979,...,991,0,0,828,0,0,0,0,0,0


In [8]:
#checking for null values
train_data.isna().sum().sum()

0

### Scaling 

In [5]:
scaler = MinMaxScaler()
train_data_mm = scaler.fit_transform(train_data.values)
test_data_mm = scaler.transform(test_data.values)
#NOTE - only need transform the test data, not fit transform 
# Source - https://stackoverflow.com/questions/43675665/when-scale-the-data-why-the-train-dataset-use-fit-and-transform-but-the-te

In [6]:
scaler = StandardScaler()
train_data_std = scaler.fit_transform(train_data.values)
test_data_std = scaler.transform(test_data.values)

In [7]:
def gisette_fit_SVM(train_data_mm,  test_data_mm, kernel='linear' , scaling = "None"):
    '''
    for a given test train data & kernel type -- fits the model & stores the error metrics in a text file.
    -----
    Input Parameters:
    -----
    train_data_mm , test_data_mm: train & test dataframes
    kernel : string 
        can hold the below values - 'poly', 'linear'(default), 'rbf'
    scaling : string 
        denotes whether no scaling is done , or standard or min -max. This string is used in outputting suitable text 
        in the text results files
    -----
    Returns 
    -----
    None 
    '''
    if kernel=='poly' :
        classifier = svm.SVC(kernel='poly', degree = 2, coef0 = 1, gamma =1)# create SVM classifier object with poly kernel
    if kernel=='linear' :
        classifier = svm.SVC(kernel='linear')
    if kernel =='rbf':
        classifier = svm.SVC(kernel='rbf', gamma = 0.001)
 
    classifier.fit(train_data_mm, train_labels)
    print('the number of support vectors : '+str(classifier.n_support_))# printing the number of support vectors
    # predicting for test set
    predicted_class = classifier.predict(test_data_mm)
    train_predicted_class = classifier.predict(train_data_mm)
    train_accuracy = accuracy_score(train_labels, train_predicted_class)
    test_accuracy = accuracy_score(test_labels, predicted_class)
    # get the accuracy on the test data
    print('poly train error with '+str(scaling)+' scaling : '+str(1-train_accuracy))
    print('poly test error with '+str(scaling)+' scaling : '+str(1-test_accuracy))
    #write output /results to text file
    myname = "Q5__"+kernel+"__"
    f = open(myname+"result.txt", "a")
    f.write('\n')
    f.write('the number of support vectors for '+str(kernel)+" & scaling: "+scaling+" is :"+str(classifier.n_support_))
    f.write('\n')

    f.write('Training error for kernel ='+str(kernel)+"scaling: "+scaling+" is :"+str(1-train_accuracy))
    f.write('\n')
    f.write('Test error for kernel ='+str(kernel)+"scaling: "+scaling+" is :"+str(1-test_accuracy))
    f.write ('----------------------------------------------------------------------------')
    f.write('\n')
    f.write('\n')
    f.close()

In [32]:
os.getcwd()

'C:\\Users\\saura\\OneDrive\\Documents\\IIT Hyderabad\\Assignments\\ML\\Assignment 2\\Dataset'

In [8]:
gisette_fit_SVM(train_data,  test_data, kernel='linear')

  return f(*args, **kwargs)


the number of support vectors : [542 542]
poly train error with None scaling : 0.0
poly test error with None scaling : 0.02400000000000002


In [9]:
gisette_fit_SVM(train_data_mm,  test_data_mm, kernel='linear' , scaling = 'minmax' )

  return f(*args, **kwargs)


the number of support vectors : [542 542]
poly train error with minmax scaling : 0.0
poly test error with minmax scaling : 0.02400000000000002


In [10]:
gisette_fit_SVM(train_data_std,  test_data_std, kernel='linear' , scaling = 'standard' )

  return f(*args, **kwargs)


the number of support vectors : [628 608]
poly train error with standard scaling : 0.0
poly test error with standard scaling : 0.019000000000000017


In [11]:
gisette_fit_SVM(train_data,  test_data, kernel='poly'  )
gisette_fit_SVM(train_data_std,  test_data_std, kernel='poly' , scaling = 'standard' )
gisette_fit_SVM(train_data_mm,  test_data_mm, kernel='poly' , scaling = 'minmax' )

  return f(*args, **kwargs)


the number of support vectors : [817 938]
poly train error with None scaling : 0.0
poly test error with None scaling : 0.02100000000000002


  return f(*args, **kwargs)


the number of support vectors : [2287 2372]
poly train error with standard scaling : 0.0
poly test error with standard scaling : 0.019000000000000017


  return f(*args, **kwargs)


the number of support vectors : [814 938]
poly train error with minmax scaling : 0.0
poly test error with minmax scaling : 0.020000000000000018


In [12]:
gisette_fit_SVM(train_data,  test_data, kernel='rbf'  )
gisette_fit_SVM(train_data_std,  test_data_std, kernel='rbf' , scaling = 'standard' )
gisette_fit_SVM(train_data_mm,  test_data_mm, kernel='rbf' , scaling = 'minmax' )

  return f(*args, **kwargs)


the number of support vectors : [3000 3000]
poly train error with None scaling : 0.0
poly test error with None scaling : 0.5


  return f(*args, **kwargs)


the number of support vectors : [2999 3000]
poly train error with standard scaling : 0.0
poly test error with standard scaling : 0.14600000000000002


  return f(*args, **kwargs)


the number of support vectors : [809 832]
poly train error with minmax scaling : 0.010166666666666657
poly test error with minmax scaling : 0.03500000000000003
