# Support Vector Machine (SVM)
This document will look at implementing the SVM Model on a subset of the Titanic Dataset to classify individuals survived or perished in the disaster.

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing Data

In [2]:
raw_titanic_df = pd.read_csv(r'data\partial_titanic.csv')
print("Number of Observations: " + str(raw_titanic_df.iloc[:,1].count()))
raw_titanic_df.head(5)

Number of Observations: 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Selecting Features & Preprocessing
I will select some features that do not require a lot of engineering to be compatible with the model.
To Note:
- EDA not performed, undertaken an understanding of this dataset in other documents.
- Simple Selection to demonstrate the Logitic Regression Classifier.
- Pclass and Gender will be one-hot-encoded
- All features will be scaled

In [3]:
# Select a subset of features
sub_titanic_df = raw_titanic_df[["Pclass", "Sex", "Age", "Fare", "Survived"]].copy()
print(sub_titanic_df.isna().sum())
print("Length before NaN drop: " + str(sub_titanic_df.iloc[:, 1].count()))

Pclass        0
Sex           0
Age         177
Fare          0
Survived      0
dtype: int64
Length before NaN drop: 891


In [4]:
# Drop NaN's from Age column which is the only column they exist in
sub_titanic_df = sub_titanic_df[sub_titanic_df["Age"].notna()]
print("Length After NaN drop: " + str(sub_titanic_df.iloc[:, 1].count()))

Length After NaN drop: 714


In [5]:
# Split Target and Dependant Variables
X = sub_titanic_df.iloc[:, :-1]
y = sub_titanic_df.iloc[:, -1]

print("X Dependant Variables: ")
print(str(X.head()) + "\n")
print(r"y Target Predictor (Survived): ")
print(y.head())

X Dependant Variables: 
   Pclass     Sex   Age     Fare
0       3    male  22.0   7.2500
1       1  female  38.0  71.2833
2       3  female  26.0   7.9250
3       1  female  35.0  53.1000
4       3    male  35.0   8.0500

y Target Predictor (Survived): 
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [6]:
# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# 0 & 1 as thats the index of the columns to transform
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0,1])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [8]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
# Scale Train and Test sets by the Scaler that knows the Training Data
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [9]:
print("X_train Dependant Variables One-Hot-Encoded & Feature Scaled: ")
print(str(X_train[0:5]) + "\n")
print(r"y Target Predictor (Survived): ")
print(y_train[:5])

X_train Dependant Variables One-Hot-Encoded & Feature Scaled: 
[[-0.59534056  1.80167471 -1.00562329 -0.75426451  0.75426451 -0.44288641
  -0.36401192]
 [-0.59534056 -0.55503915  0.99440816 -0.75426451  0.75426451 -0.92142214
  -0.49818747]
 [-0.59534056 -0.55503915  0.99440816 -0.75426451  0.75426451  0.24073606
  -0.48196269]
 [-0.59534056  1.80167471 -1.00562329  1.32579484 -1.32579484  0.3090983
  -0.40176517]
 [-0.59534056 -0.55503915  0.99440816 -0.75426451  0.75426451 -0.64797316
  -0.49541937]]

y Target Predictor (Survived): 
135    0
764    0
103    0
576    1
664    1
Name: Survived, dtype: int64


# Building Linear SVM Model

In [10]:
# Training the SVM model on the Training set
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear')
classifier.fit(X_train, y_train)

SVC(kernel='linear')

In [11]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [12]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[84 19]
 [28 48]]


0.7374301675977654

# Building  SVM With Parameter Tuning

In [13]:
# Set the parameters by cross-validation
parameters = [{'kernel': ['rbf', 'linear'],
                'gamma': [0.01, 0.1, 0.5],
                'C': [10, 100, 1000]}]
print("# Tuning hyper-parameters")
from sklearn.model_selection import GridSearchCV
# Fit Classifier with Grid Search CV
classifier = GridSearchCV(SVC(), parameters, cv=5)
classifier.fit(X_train, y_train)

# Tuning hyper-parameters


GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [10, 100, 1000], 'gamma': [0.01, 0.1, 0.5],
                          'kernel': ['rbf', 'linear']}])

In [14]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [15]:
# Best Parameters
print('best parameters:')
print(classifier.best_params_)

best parameters:
{'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}


In [16]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[90 13]
 [25 51]]


0.7877094972067039

In [17]:
# List of All Combinations
print("All Combinations:")
print('-------------------------------------')
means = classifier.cv_results_['mean_test_score']
stds = classifier.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, classifier.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))

All Combinations:
-------------------------------------
0.785 (+/-0.037) for {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
0.813 (+/-0.065) for {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 10, 'gamma': 0.1, 'kernel': 'linear'}
0.809 (+/-0.063) for {'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 10, 'gamma': 0.5, 'kernel': 'linear'}
0.781 (+/-0.044) for {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 100, 'gamma': 0.01, 'kernel': 'linear'}
0.806 (+/-0.045) for {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 100, 'gamma': 0.1, 'kernel': 'linear'}
0.804 (+/-0.067) for {'C': 100, 'gamma': 0.5, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 100, 'gamma': 0.5, 'kernel': 'linear'}
0.802 (+/-0.069) for {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.794 (+/-0.068) for {'C': 1000, 'gamma': 0.01, 'kernel': 'linear'}
0.817 (+/-0.061) for {'C': 1000, 'gamma