# Support Vector Machines with Python

Welcome to the Support Vector Machines in Python for the Titanic Crash.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
#Getting Data

train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')

**Since we already know the data of this problem, I am going to clean the data. But I know the it isn't the only way to clean it.**

In [3]:
# Data Cleaning

# Defining a fuction to fulfill the missing data in age column
def impute_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    
    if pd.isnull(Age):

        if Pclass == 1:
            return 37

        elif Pclass == 2:
            return 29

        else:
            return 24

    else:
        return Age

In [12]:
# Applying to test and train data

train['Age'] = train[['Age','Pclass']].apply(impute_age,axis=1)
test['Age'] = test[['Age','Pclass']].apply(impute_age,axis=1)

In [13]:
test['Fare'].loc[152]=test['Fare'].mean()
test['Fare'].loc[152]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


35.6271884892086

In [14]:
# Cleaning Cabin column

train.drop('Cabin',axis=1,inplace=True)
test.drop('Cabin',axis=1,inplace=True)

In [15]:
#Removing lines with Missing arguments 

train.dropna(inplace=True)
train[train['Fare'].isna()==True]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [16]:
test[test['Fare'].isna()==True]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked


In [18]:
## Converting Categorical Features and dropping unuseful args

train_new = pd.get_dummies(data=train,columns=['Sex'],drop_first=True)
train_new = pd.get_dummies(data=train_new,columns=['Embarked'],drop_first=True)
train_new.drop(['Name','Ticket'],axis=1,inplace=True)
train_new.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,1,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,1
4,5,0,3,35.0,0,0,8.05,1,0,1


In [20]:
## Converting Categorical Features and dropping unuseful args

test_new = pd.get_dummies(data=test,columns=['Sex'],drop_first=True)
test_new = pd.get_dummies(data=test_new,columns=['Embarked'],drop_first=True)
test_new.drop(['Name','Ticket'],axis=1,inplace=True)
test_new.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,1,1,0
1,893,3,47.0,1,0,7.0,0,0,1
2,894,2,62.0,0,0,9.6875,1,1,0
3,895,3,27.0,0,0,8.6625,1,0,1
4,896,3,22.0,1,1,12.2875,0,0,1


## Train Test Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(train_new.drop(['PassengerId','Survived'],axis=1), train_new['Survived'], test_size=0.33)

In [23]:
from sklearn.svm import SVC

In [24]:
model = SVC()

In [25]:
model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [26]:
predictions = model.predict(X_test)
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,predictions))
print('\n')
print(classification_report(y_test,predictions))

[[130  50]
 [ 45  69]]


             precision    recall  f1-score   support

          0       0.74      0.72      0.73       180
          1       0.58      0.61      0.59       114

avg / total       0.68      0.68      0.68       294



# Gridsearch

In [27]:
param_grid = {'C': [0.01,0.1,1, 10, 100, 1000], 'gamma': [1,0.1,0.01,0.001,0.0001]}

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)

In [30]:
# This may take a while

grid.fit(X_train,y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV] C=0.01, gamma=1 .................................................
[CV] ........ C=0.01, gamma=1, score=0.6180904522613065, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ........ C=0.01, gamma=1, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=1 .................................................
[CV] ........ C=0.01, gamma=1, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ...... C=0.01, gamma=0.1, score=0.6180904522613065, total=   0.0s
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ...... C=0.01, gamma=0.1, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.1 ...............................................
[CV] ...... C=0.01, gamma=0.1, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.01 ..............................................
[CV] ..... C=0.0

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .... C=0.01, gamma=0.001, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.001 .............................................
[CV] .... C=0.01, gamma=0.001, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ... C=0.01, gamma=0.0001, score=0.6180904522613065, total=   0.0s
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ... C=0.01, gamma=0.0001, score=0.6212121212121212, total=   0.0s
[CV] C=0.01, gamma=0.0001 ............................................
[CV] ... C=0.01, gamma=0.0001, score=0.6212121212121212, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.6180904522613065, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......... C=0.1, gamma=1, score=0.6212121212121212, total=   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] .

[CV] ........ C=1000, gamma=1, score=0.6030150753768844, total=   0.0s
[CV] C=1000, gamma=1 .................................................
[CV] ........ C=1000, gamma=1, score=0.6313131313131313, total=   0.0s
[CV] C=1000, gamma=1 .................................................
[CV] ........ C=1000, gamma=1, score=0.6616161616161617, total=   0.0s
[CV] C=1000, gamma=0.1 ...............................................
[CV] ...... C=1000, gamma=0.1, score=0.6582914572864321, total=   0.0s
[CV] C=1000, gamma=0.1 ...............................................
[CV] ...... C=1000, gamma=0.1, score=0.6666666666666666, total=   0.0s
[CV] C=1000, gamma=0.1 ...............................................
[CV] ....... C=1000, gamma=0.1, score=0.696969696969697, total=   0.0s
[CV] C=1000, gamma=0.01 ..............................................
[CV] ..... C=1000, gamma=0.01, score=0.6984924623115578, total=   0.0s
[CV] C=1000, gamma=0.01 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    2.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [32]:
grid.best_params_

{'C': 1000, 'gamma': 0.001}

In [35]:
pred_grid = grid.predict(X_test)
print(confusion_matrix(y_test,pred_grid))
print('\n')
print(classification_report(y_test,pred_grid))

[[159  21]
 [ 35  79]]


             precision    recall  f1-score   support

          0       0.82      0.88      0.85       180
          1       0.79      0.69      0.74       114

avg / total       0.81      0.81      0.81       294

