# Support Vector Machine

## Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Show every column when displaying wide frames.
# The fully-qualified option key is used on purpose: the bare 'max_columns'
# pattern matches more than one option in recent pandas versions
# (e.g. 'styler.render.max_columns') and then raises an OptionError.
pd.set_option('display.max_columns', None)

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib import style
# Switch matplotlib to its 'ggplot' style sheet.
style.use('ggplot')

## Reading the Data

In [5]:
# Load the dataset: a space-separated text file without a header row,
# so the columns end up labelled with integers.
data = pd.read_csv('data.txt', sep=' ', header=None)

## Exploratory Data Analysis

In [6]:
# Preview the loaded frame (long frames are displayed with the middle rows elided).
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
0,1,6,4,12,5,5,3,4,1,67,3,2,1,2,1,0,0,1,0,0,1,0,0,1,1
1,2,48,2,60,1,3,2,2,1,22,3,1,1,1,1,0,0,1,0,0,1,0,0,1,2
2,4,12,4,21,1,4,3,3,1,49,3,1,2,1,1,0,0,1,0,0,1,0,1,0,1
3,1,42,2,79,1,4,3,4,2,45,3,1,2,1,1,0,0,0,0,0,0,0,0,1,1
4,1,24,3,49,1,3,3,4,4,53,3,2,2,1,1,1,0,1,0,0,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,4,9,2,23,2,2,2,4,2,22,3,1,1,1,1,0,0,1,0,1,0,0,0,1,1
796,1,18,2,75,5,5,3,4,2,51,3,1,2,2,1,0,1,1,0,0,0,0,0,1,2
797,4,12,4,13,1,2,2,4,2,22,3,2,1,1,1,0,0,1,0,1,0,0,1,0,1
798,4,24,3,7,5,5,4,4,3,54,3,2,1,2,1,1,0,1,0,0,1,0,0,1,1


In [7]:
# Inspect column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 25 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   0       800 non-null    int64
 1   1       800 non-null    int64
 2   2       800 non-null    int64
 3   3       800 non-null    int64
 4   4       800 non-null    int64
 5   5       800 non-null    int64
 6   6       800 non-null    int64
 7   7       800 non-null    int64
 8   8       800 non-null    int64
 9   9       800 non-null    int64
 10  10      800 non-null    int64
 11  11      800 non-null    int64
 12  12      800 non-null    int64
 13  13      800 non-null    int64
 14  14      800 non-null    int64
 15  15      800 non-null    int64
 16  16      800 non-null    int64
 17  17      800 non-null    int64
 18  18      800 non-null    int64
 19  19      800 non-null    int64
 20  20      800 non-null    int64
 21  21      800 non-null    int64
 22  22      800 non-null    int64
 23  23      800 non

In [8]:
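# Optional experiment (disabled): keep only the low-cardinality columns,
# i.e. those with fewer than 5 distinct values, before training.
# data = data.loc[:,data.apply(pd.Series.nunique) < 5]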

In [9]:
# Feature matrix: every column except the last one.
X = data.iloc[:, :-1].values
# Label vector: the last column.
y = data.iloc[:, -1].values

In [10]:
def svm(X, y, model, n_runs=10, test_size=0.4):
    """Measure a model's test accuracy over several random train/test splits.

    For each run ``i`` in ``range(n_runs)`` the data is split with
    ``random_state=i``, the same ``model`` object is fitted on the training
    part, and the percentage of correct predictions on the test part is
    recorded.

    Parameters
    ----------
    X : array-like
        Feature matrix, passed straight to ``train_test_split``.
    y : array-like
        Labels aligned with the rows of ``X``.
    model : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.
    n_runs : int, default 10
        Number of splits to evaluate; split ``i`` uses ``random_state=i``.
    test_size : float, default 0.4
        Forwarded to ``train_test_split`` as the held-out fraction.

    Returns
    -------
    list of float
        One accuracy per run, expressed as a percentage (0-100).
    """
    accuracies = []

    for seed in range(n_runs):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=seed)

        model.fit(X_train, y_train)
        # Coerce both sides to arrays so `==` is always element-wise,
        # even if the model or the splitter hands back plain lists.
        y_predict = np.asarray(model.predict(X_test))
        n_correct = int(np.count_nonzero(y_predict == np.asarray(y_test)))

        accuracies.append(n_correct / len(y_predict) * 100)
    return accuracies

In [11]:
# First pipeline: a linear SVM trained directly on the raw (unscaled) features.
# The steps are passed as a list of (name, estimator) pairs, which is the form
# the scikit-learn Pipeline API documents (a tuple of steps is not).
svm_clf = Pipeline([
    ("linear_svc", SVC(C=0.1, kernel="linear")),
])

# One accuracy (in percent) per random split, plus their mean.
not_scaled_accuracies = svm(X, y, svm_clf)
mean_of_not_scaled_accuracies = np.mean(not_scaled_accuracies)

print(not_scaled_accuracies)
print(mean_of_not_scaled_accuracies)

[74.375, 77.8125, 69.0625, 73.4375, 75.0, 74.6875, 73.125, 75.625, 73.4375, 71.5625]
73.8125


In [12]:
# Second pipeline: standardize the features, then train the same linear SVM.
# Because the scaler is a pipeline step, it is fitted inside `svm` on each
# training split only. The steps are passed as a list of (name, estimator)
# pairs, which is the form the scikit-learn Pipeline API documents.
svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("linear_svc", SVC(C=0.1, kernel="linear")),
])

# One accuracy (in percent) per random split, plus their mean.
scaled_accuracies = svm(X, y, svm_clf)
mean_of_scaled_accuracies = np.mean(scaled_accuracies)

print(scaled_accuracies)
print(mean_of_scaled_accuracies)

[75.0, 78.125, 71.5625, 75.625, 75.625, 75.3125, 73.125, 75.0, 73.4375, 73.125]
74.59375


-------

## Reporting

(A) - Difference Between The Dataset Used in **1** and **2**

In [13]:
# Data used in pipeline A: summary statistics of the raw, unscaled frame.
# Note: this covers every column of `data`, including the last one, which is
# used as the label rather than as a feature.
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,2.5825,20.65125,2.5475,31.90875,2.10625,3.3975,2.67375,2.84125,2.36625,35.40625,2.67625,1.39625,1.1475,1.39875,1.03375,0.23,0.10125,0.91,0.0375,0.175,0.7125,0.02125,0.20125,0.62875,1.29875
std,1.242023,12.15635,1.084765,27.352617,1.567812,1.20054,0.700303,1.106833,1.06114,11.470317,0.706796,0.569773,0.354825,0.489947,0.180698,0.421096,0.301848,0.286361,0.190102,0.380205,0.45288,0.144307,0.401185,0.483441,0.457996
min,1.0,4.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,19.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,1.0,12.0,2.0,13.0,1.0,3.0,2.0,2.0,1.0,27.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,2.0,18.0,2.0,23.0,1.0,3.0,3.0,3.0,2.0,33.0,3.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
75%,4.0,24.0,4.0,39.0,3.0,5.0,3.0,4.0,3.0,41.0,3.0,2.0,1.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0
max,4.0,72.0,4.0,159.0,5.0,5.0,4.0,4.0,4.0,75.0,3.0,4.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [14]:
# Data used in pipeline B (approximate view): standardize the frame and
# summarize it. Unlike the pipeline, the scaler here is fitted on all rows and
# on every column of `data`, including the last (label) column.
scaler = StandardScaler().fit(data)
scaled_features = scaler.transform(data)

# Re-attach the original row index and column labels so the summary lines up
# with the unscaled one.
scaled_frame = pd.DataFrame(
    scaled_features,
    index=data.index,
    columns=data.columns,
)
scaled_frame.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,1.554312e-17,-8.65974e-17,1.487699e-16,-3.5527140000000005e-17,-9.65894e-17,-2.220446e-18,-4.4408920000000007e-17,-3.663736e-17,1.44329e-17,6.661338e-18,-9.992007e-18,3.774758e-17,1.398881e-16,8.65974e-17,2.753353e-16,-6.661338000000001e-17,-2.220446e-18,-8.21565e-17,1.887379e-17,1.7763570000000002e-17,-1.076916e-16,4.8849810000000005e-17,-6.439294000000001e-17,-6.994405000000001e-17,-1.776357e-16
std,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626,1.000626
min,-1.274928,-1.370614,-2.349905,-1.094135,-0.7060427,-1.998268,-2.391533,-1.66457,-1.288336,-1.431217,-2.373102,-0.695887,-0.4159574,-0.8143719,-0.1868926,-0.5465357,-0.3356434,-3.179797,-0.1973855,-0.4605662,-1.574249,-0.1473478,-0.5019524,-1.301385,-0.6527059
25%,-1.274928,-0.7121103,-0.5050335,-0.6917283,-0.7060427,-0.3313082,-0.9626857,-0.7605267,-1.288336,-0.7333284,0.4583396,-0.695887,-0.4159574,-0.8143719,-0.1868926,-0.5465357,-0.3356434,0.3144855,-0.1973855,-0.4605662,-1.574249,-0.1473478,-0.5019524,-1.301385,-0.6527059
50%,-0.4692862,-0.2182323,-0.5050335,-0.3259039,-0.7060427,-0.3313082,0.4661614,0.1435169,-0.3453635,-0.2099118,0.4583396,-0.695887,-0.4159574,-0.8143719,-0.1868926,-0.5465357,-0.3356434,0.3144855,-0.1973855,-0.4605662,0.6352234,-0.1473478,-0.5019524,0.7684122,-0.6527059
75%,1.141997,0.2756456,1.339838,0.2594153,0.5704187,1.335651,0.4661614,1.047561,0.5976086,0.4879769,0.4583396,1.060295,-0.4159574,1.22794,-0.1868926,-0.5465357,-0.3356434,0.3144855,-0.1973855,-0.4605662,0.6352234,-0.1473478,-0.5019524,0.7684122,1.532084
max,1.141997,4.226669,1.339838,4.649309,1.84688,1.335651,1.895008,1.047561,1.540581,3.454004,0.4583396,4.572658,2.404093,1.22794,5.350666,1.829707,2.979353,0.3144855,5.066228,2.171241,0.6352234,6.786665,1.992221,0.7684122,1.532084


(A) Here we can see that columns [1, 3, and 9] have a much wider range of values than the other columns,
so scaling all columns helped the accuracy, **but not by much (only about 0.8 percentage points: 74.59% vs. 73.81%)**

But if we drop the columns that have a very wide range and many unique values (say, 5 or more),
the accuracy on the scaled data is nearly the same as on the unscaled data,
if not slightly lower

(B) The per-split accuracies of both pipelines vary over a fairly wide range (roughly 6.5 to 9 percentage points across the 10 splits), which might be because some of the points are outliers or noisy, and that affects what the linear SVM learns

(C) Again, the difference in the averaged accuracies between the two pipelines is not very noticeable, maybe because most of the columns have very similar ranges.
If we did feature selection to pick only the most important features to learn from, we might get a more noticeable difference

(D) \
1 - Read the data and parse it into the X and y arrays \
2 - Create the svm function, which takes X and y along with the model pipeline, runs the learning algorithm 10 times (each on a different random train/test split), and returns the 10 accuracies as a list \
3 - Take the mean of these accuracies to compare the two pipelines \
4 - For the second pipeline, I applied a StandardScaler to standardize the input data; I didn't apply any other preprocessing \
5 - I chose the C hyperparameter to be 0.1 after a few trials

-------

In [15]:
from sklearn import datasets
from svm import SVM
from sklearn.model_selection import train_test_split

# Sanity-check the custom SVM class on a binary task: iris class 0 vs. the rest.
iris = datasets.load_iris()

X = iris.data[:, :2]  # we only take the first two features.
y = iris.target

# Split data into training and testing sets. The labels are booleans
# (True where the class is 0). A fixed random_state keeps the split, and
# therefore the reported accuracy, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y == 0, random_state=0)

svm_clf = SVM()
svm_clf.fit(X_train, y_train)

y_predict = svm_clf.predict(X_test)

# A prediction counts as correct when it is 1 for a True label
# or -1 for a False label.
expected = [1 if is_positive else -1 for is_positive in y_test]
accuracy = sum(
    1 for predicted, target in zip(y_predict, expected) if predicted == target
)

print(accuracy / len(X_test) * 100)

100.0
