# Support Vector Machine

In [1]:
import pandas as pd
import numpy as np

# Scaling
from sklearn.preprocessing import StandardScaler

from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# Metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Social_Network_Ads.csv")
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [19]:
df['Gender'].value_counts()

Gender
Female    204
Male      196
Name: count, dtype: int64

In [3]:
df['Gender'].replace({"Female":0, "Male":1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].replace({"Female":0, "Male":1}, inplace=True)
  df['Gender'].replace({"Female":0, "Male":1}, inplace=True)


In [4]:
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0
2,15668575,0,26,43000,0
3,15603246,0,27,57000,0
4,15804002,1,19,76000,0
...,...,...,...,...,...
395,15691863,0,46,41000,1
396,15706071,1,51,23000,1
397,15654296,0,50,20000,1
398,15755018,1,36,33000,0


In [5]:
df['User ID'].value_counts()

User ID
15624510    1
15767681    1
15589449    1
15791373    1
15688172    1
           ..
15675185    1
15792102    1
15722758    1
15745232    1
15594041    1
Name: count, Length: 400, dtype: int64

In [6]:
df.drop(['User ID'], axis=1, inplace=True)

In [7]:
df

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0
3,0,27,57000,0
4,1,19,76000,0
...,...,...,...,...
395,0,46,41000,1
396,1,51,23000,1
397,0,50,20000,1
398,1,36,33000,0


#### Scaling

In [8]:
df.columns

Index(['Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [10]:
x = df[['Age', 'EstimatedSalary']]

std_scalar = StandardScaler()
array = std_scalar.fit_transform(x)

x = pd.DataFrame(array, columns=x.columns)
x['Gender'] = df['Gender']
x

Unnamed: 0,Age,EstimatedSalary,Gender
0,-1.781797,-1.490046,1
1,-0.253587,-1.460681,1
2,-1.113206,-0.785290,0
3,-1.017692,-0.374182,0
4,-1.781797,0.183751,1
...,...,...,...
395,0.797057,-0.844019,0
396,1.274623,-1.372587,1
397,1.179110,-1.460681,0
398,-0.158074,-1.078938,1


In [11]:
y = df['Purchased']

### Train test split

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y , test_size=0.3, random_state=10, stratify=y)
x_train

Unnamed: 0,Age,EstimatedSalary,Gender
254,1.179110,-0.755925,0
33,-0.922179,-0.755925,0
360,0.510518,1.740088,1
168,-0.826666,2.298020,1
241,0.032952,-0.315452,1
...,...,...,...
269,0.223978,-0.256722,1
327,0.415005,0.154386,0
74,-0.540127,-1.519411,1
99,-0.922179,-0.961479,0


# Model Training

In [13]:
svm_clf = SVC()
svm_clf.fit(x_train, y_train)

# Model Evaluation

In [16]:
# Testing Accuracy

y_pred = svm_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n", cnf_matrix)
print("*"*55)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy Score:",accuracy)
print("*"*55)


clf_report = classification_report(y_pred, y_test)
print("Classification report:\n", clf_report)

Confusion Matrix:
 [[69  3]
 [ 8 40]]
*******************************************************
Accuracy Score: 0.9083333333333333
*******************************************************
Classification report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93        72
           1       0.93      0.83      0.88        48

    accuracy                           0.91       120
   macro avg       0.91      0.90      0.90       120
weighted avg       0.91      0.91      0.91       120



In [17]:
# Training Accuracy

y_pred_train = svm_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n", cnf_matrix)
print("*"*55)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy Score:",accuracy)
print("*"*55)


clf_report = classification_report(y_pred_train, y_train)
print("Classification report:\n", clf_report)

Confusion Matrix:
 [[166   9]
 [ 14  91]]
*******************************************************
Accuracy Score: 0.9178571428571428
*******************************************************
Classification report:
               precision    recall  f1-score   support

           0       0.92      0.95      0.94       175
           1       0.91      0.87      0.89       105

    accuracy                           0.92       280
   macro avg       0.92      0.91      0.91       280
weighted avg       0.92      0.92      0.92       280



# Hyperparameter Tunning

In [15]:
y_train

254    0
33     0
360    1
168    1
241    0
      ..
269    0
327    0
74     0
99     0
18     1
Name: Purchased, Length: 280, dtype: int64

In [20]:
svm_clf = SVC()

hyperparameters = {"C":np.arange(1,100),
                  "kernel": ['linear', 'poly', 'rbf', 'sigmoid']}

gscv_svm = GridSearchCV(svm_clf, hyperparameters, cv=5)
gscv_svm.fit(x_train, y_train)
gscv_svm.best_estimator_

### Evaluation after Hyperparameter Tunning

In [21]:
svm_clf = gscv_svm.best_estimator_
svm_clf.fit(x_train, y_train)

In [22]:
# Testing Accuracy

y_pred = svm_clf.predict(x_test)

cnf_matrix = confusion_matrix(y_pred, y_test)
print("Confusion Matrix:\n", cnf_matrix)
print("*"*55)

accuracy = accuracy_score(y_pred, y_test)
print("Accuracy Score:",accuracy)
print("*"*55)


clf_report = classification_report(y_pred, y_test)
print("Classification report:\n", clf_report)

Confusion Matrix:
 [[68  4]
 [ 9 39]]
*******************************************************
Accuracy Score: 0.8916666666666667
*******************************************************
Classification report:
               precision    recall  f1-score   support

           0       0.88      0.94      0.91        72
           1       0.91      0.81      0.86        48

    accuracy                           0.89       120
   macro avg       0.90      0.88      0.88       120
weighted avg       0.89      0.89      0.89       120



In [23]:
# Training Accuracy

y_pred_train = svm_clf.predict(x_train)

cnf_matrix = confusion_matrix(y_pred_train, y_train)
print("Confusion Matrix:\n", cnf_matrix)
print("*"*55)

accuracy = accuracy_score(y_pred_train, y_train)
print("Accuracy Score:",accuracy)
print("*"*55)


clf_report = classification_report(y_pred_train, y_train)
print("Classification report:\n", clf_report)

Confusion Matrix:
 [[167   6]
 [ 13  94]]
*******************************************************
Accuracy Score: 0.9321428571428572
*******************************************************
Classification report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95       173
           1       0.94      0.88      0.91       107

    accuracy                           0.93       280
   macro avg       0.93      0.92      0.93       280
weighted avg       0.93      0.93      0.93       280

