## Challange: Build a predictive modle that answer the question "What sorts of people were more likely to survice the Titanic sninking?

In [1]:
# import modules and packages
import numpy as np
import pandas as pd
import sklearn
print(sklearn.__version__)

1.0


Import data

In [2]:
train_data = pd.read_csv('Data/train.csv', sep=',')
test_data = pd.read_csv('Data/test.csv', sep=',')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S



Survived: that's the target, 0 means the passenger did not survive, while 1 means he/she survived.

Pclass: passenger class.

Name, Sex, Age: self-explanatory

SibSp: how many siblings & spouses of the passenger aboard the Titanic.

Parch: how many children & parents of the passenger aboard the Titanic.

Ticket: ticket id

Fare: price paid (in pounds)

Cabin: passenger's cabin number

Embarked: where the passenger embarked the Titanic


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Training data consist of data about 891 passanger

Age, Cabin and Emberked columns have null values, (Age - 20%, Cabin - 77%, Embarked - only 2) 

Name, Sex, Ticket, Cabin and Emberaked columns are non-numerical data, out of these Name and Ticket data are useless

Important data to be considered, 
1. Pclass
2. Sex
3. Age
4. SibSp
5. Parch
6. Fare
7. Cabin
8. Embarked


In [5]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
train_data['Survived'].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [7]:
train_data['Age'].mean()

29.69911764705882

### Lets, expolore non-numeric data

In [8]:
train_data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [9]:
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64


Only 38% Survived, that's close enough to 40%, so accuracy will be a reasonable metric to evaluate our model.

The mean Fare was £32.20, which does not seem so expensive (but it was probably a lot of money back then).

The mean Age was less than 30 years old.



### Let's create numeric pipline for numeric data

In [10]:
# custom tranformar select only desired coloumn
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names]

In [11]:
# let define numeric pipline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

numeric_pipline = Pipeline([
    ('Select_numeric_values', DataFrameSelector(['Age', 'SibSp', 'Parch', 'Fare'])), # get only numeric coloums/ attributes
    ('imputer', SimpleImputer(strategy='median')), #fill null values
])

### Let's create pipline for catogorical attibutes

Note - SimpleImputer only works for numerical data so, to full fit missing value we need custom imputer


In [12]:
# Defined custome imputer which fill most frequent value to missing values

class MostFrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [13]:
from sklearn.preprocessing import OneHotEncoder

catogorical_pipline = Pipeline([
    ('select_catogorical_data', DataFrameSelector(['Pclass', 'Sex', 'Embarked'])),
    ('imputer', MostFrequentImputer()),
    ('catogorical_encoder', OneHotEncoder(sparse=False)),
])

In [14]:
# combine numerical and catogorical piplines together with ColumnTransformer
from sklearn.compose import ColumnTransformer

numerical_attibutes = ['Age', 'SibSp', 'Parch', 'Fare']
catagorical_attributes = ['Pclass', 'Sex', 'Embarked']

full_pipline = ColumnTransformer([
    ('numeric_transformation', numeric_pipline, numerical_attibutes),
    ('catogorical_transformation', catogorical_pipline, catagorical_attributes),
])

In [15]:
# transform train_data using fullpipline
X_train = full_pipline.fit_transform(train_data)

In [16]:
y_train = train_data['Survived']

The data transformed for training

## Let's try with SVC - support vactor classifier

In [17]:
from sklearn.svm import SVC
svc_classifier = SVC(gamma='scale')
svc_classifier.fit(X_train, y_train)

SVC()

Let's validate using CrossValScore

In [18]:
from sklearn.model_selection import cross_val_score
svm_scores = cross_val_score(svc_classifier, X_train, y_train, cv=10)
svm_scores.mean()

0.6813233458177278

Let's try RandomForestClassifier()

In [19]:
from sklearn.ensemble import RandomForestClassifier
forest_classifier = RandomForestClassifier()
forest_classifier.fit(X_train, y_train)

RandomForestClassifier()

In [20]:
forest_classifier_scores = cross_val_score(forest_classifier, X_train, y_train, cv=10)
print(forest_classifier_scores)
print(forest_classifier_scores.mean())

[0.74444444 0.78651685 0.75280899 0.84269663 0.86516854 0.84269663
 0.80898876 0.76404494 0.84269663 0.84269663]
0.8092759051186018


## Let's do prediction with test data

In [21]:
X_test = full_pipline.fit_transform(test_data)
y_pred = forest_classifier.predict(X_test)


## Evaluate model using Confusion metrix and Cross validation

In [22]:
true_labels = pd.read_csv('Data/gender_submission.csv', sep=',')
true_labels = true_labels['Survived']
true_labels.value_counts()

0    266
1    152
Name: Survived, dtype: int64

In [23]:
# confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

y_test_pred = cross_val_predict(forest_classifier, X_test, y_pred, cv=10)
confusion_matrix(true_labels, y_test_pred)

array([[235,  31],
       [ 26, 126]])

In [24]:
# cross validation
cross_val_score_test = cross_val_score(forest_classifier, X_test, y_pred, cv=10)
print(cross_val_score_test)
print(cross_val_score_test.mean())

[0.92857143 0.92857143 0.92857143 0.92857143 0.88095238 0.88095238
 0.92857143 0.85714286 0.92682927 0.90243902]
0.9091173054587689


## Export y_pred to csv

In [51]:
# create a dataframe

y_pred_list = y_pred.tolist()
df_y_pred = pd.DataFrame(
    data = [[key, value] for key, value in enumerate(y_pred_list, start=892)],
    columns=['PassengerId', 'Survived']
) 

In [54]:
df_y_pred.tail()

Unnamed: 0,PassengerId,Survived
413,1305,0
414,1306,1
415,1307,0
416,1308,0
417,1309,1


In [58]:
# export as csv
pd.DataFrame.to_csv(df_y_pred, 'Data/prediction.csv', index=False)