# First attempt at Titanic Dataset

## Using SVM on 'Age', 'Embarked', 'Pclass' and 'Sex' features 

In [46]:
%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
import numpy as np
import plotly.express as px

## Loading in data

In [47]:
chosen_features = ['Age', 'Embarked', 'Sex', 'Pclass']

#Load in datasets
train_df = pd.read_csv('../data/train.csv', usecols=['Survived', 'PassengerId'] + chosen_features)
test_df = pd.read_csv('../data/test.csv', usecols=['PassengerId'] + chosen_features)


#combine both data sets
test_df['Survived'] = np.NAN 
combined = pd.concat([train_df, test_df])
combined.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked
0,1,0.0,3,male,22.0,S
1,2,1.0,1,female,38.0,C
2,3,1.0,3,female,26.0,S
3,4,1.0,1,female,35.0,S
4,5,0.0,3,male,35.0,S


In [48]:
print('Number of missing Age values : {}'.format(combined['Age'].isnull().sum()))
print('Number of missing Embarked values : {}'.format(combined['Embarked'].isnull().sum()))

Number of missing Age values : 263
Number of missing Embarked values : 2


## Data Preparation : 'Embarked'

In [49]:
#Low number of missing values -> statistically insignificant
#Solution : Choose one of three unique values and insert into missing cells

combined['Embarked'] = combined['Embarked'].fillna('Q')

#One-Hot Encoding
combined['ES'] = combined['Embarked'] == 'S' 
combined['EQ'] = combined['Embarked'] == 'Q'
combined['EC'] = combined['Embarked'] == 'C'

## Data Preparation : 'Age'

Inspired by approach from this dataset : https://www.kaggle.com/pliptor/divide-and-conquer-0-82296

In [50]:
#Create new column with missing Age rows

combined['Age_missing'] = combined['Age'] == np.NAN
combined['Age'] = combined['Age'].fillna(-1)

fig = px.histogram(combined, x=combined['Age'], color=combined['Survived'].notna(), nbins=150)
fig.show()

#Create Minor feature
combined['Minor'] = (combined['Age']<14.0)&(combined['Age']>=0)

## Data Preparation : 'Pclass'

In [51]:
#One-Hot Encoding for Pclass

combined['P1'] = combined['Pclass'] == 1 
combined['P2'] = combined['Pclass'] == 2
combined['P3'] = combined['Pclass'] == 3

## Data Preparation : 'Sex'

In [52]:
#Mapping values in 'Sex'

combined['Sex'] = combined['Sex'].map({'male': 0, 'female': 1})

## Final Data Prep

In [53]:
#drop unnecessary columns

combined = combined.drop(columns=['Pclass', 'Embarked', 'Age'])
combined.head()

Unnamed: 0,PassengerId,Survived,Sex,ES,EQ,EC,Age_missing,Minor,P1,P2,P3
0,1,0.0,0,True,False,False,False,False,False,False,True
1,2,1.0,1,False,False,True,False,False,True,False,False
2,3,1.0,1,True,False,False,False,False,False,False,True
3,4,1.0,1,True,False,False,False,False,True,False,False
4,5,0.0,0,True,False,False,False,False,False,False,True


In [54]:
train_df = combined.loc[combined['Survived'].isin([np.NAN]) == False]
test_df = combined.loc[combined['Survived'].isin([np.NAN]) == True]

print('Training data shape : {}'.format(train_df.shape))
print('Test data shape : {}'.format(test_df.shape))

Training data shape : (891, 11)
Test data shape : (418, 11)


## Modelling

In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [68]:
model = SVC()

feature_names = ['Sex','P1','P2','P3','EQ','ES','EC','Age_missing','Minor']

#Using grid search to find best value for regularization parameter, C

param_grid = {
    'C': [1, 2, 5, 10, 20, 50]
}

#Using 10 K-Fold
grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=10,
)

grid_search.fit(
    np.array(train_df[feature_names]), 
    np.array(train_df['Survived'])
)

GridSearchCV(cv=10, estimator=SVC(), param_grid={'C': [1, 2, 5, 10, 20, 50]})

In [69]:
#Results
print('Best C value : {}'.format(grid_search.best_params_))
grid_results = pd.DataFrame(grid_search.cv_results_)
print('Model accuracy on unseen data : {0:1.4f}'.format(grid_results['mean_test_score'][grid_search.best_index_]))

Best C value : {'C': 1}
Model accuracy on unseen data : 0.8238


In [71]:
#Prediction
pred = grid_search.predict(np.array(test_df[feature_names]))

submission = pd.DataFrame({'PassengerId':test_df['PassengerId'],'Survived':pred})
submission.to_csv('FirstAttempt.csv', index = False, float_format='%1d')
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,0.0


## Public Score obtained : 0.78708