# 180030699 - DV Project (Prediction part)

In [25]:
from time import perf_counter, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

## Viewing the dataset

In [64]:
# Load train.csv and test.csv, using the PassengerId column as the row index.
data_raw = pd.read_csv("train.csv", index_col='PassengerId')
data_validate = pd.read_csv("test.csv", index_col='PassengerId')
# Fixed seed so the preview shows the same 10 rows on every Restart & Run All
# (42 matches the random_state used for the train/test split).
data_raw.sample(10, random_state=42)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25.0,0,0,244361,13.0,,S
841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20.0,0,0,SOTON/O2 3101287,7.925,,S
605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,0,0,111426,26.55,,C
268,1,3,"Persson, Mr. Ernst Ulrik",male,25.0,1,0,347083,7.775,,S
472,0,3,"Cacic, Mr. Luka",male,38.0,0,0,315089,8.6625,,S
527,1,2,"Ridsdale, Miss. Lucy",female,50.0,0,0,W./C. 14258,10.5,,S
136,0,2,"Richard, Mr. Emile",male,23.0,0,0,SC/PARIS 2133,15.0458,,C
40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4.0,0,2,315153,22.025,,S
271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31.0,,S


In [65]:
# Column dtypes and non-null counts for the raw training frame.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [66]:
# Number of missing values in each column of the raw training frame.
data_raw.isna().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [67]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
data_raw.describe(include='all')

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,891,2,,,,681.0,,147,3
top,,,"Sobey, Mr. Samuel James Hayden",male,,,,1601.0,,B96 B98,S
freq,,,1,577,,,,7.0,,4,644
mean,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [68]:
# How many rows of the raw training frame fall under each Sex value.
data_raw['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [69]:
# How many rows fall under each Embarked code (missing values are not counted).
data_raw['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## Cleaning and Wrangling the Data

In [70]:
# Clean a copy so data_raw stays untouched; copy() is a deep copy by default.
data_copy = data_raw.copy()
# data_validate goes into the list as-is (not copied), so in-place edits made
# through data_cleaner also change data_validate.
data_cleaner = [data_copy, data_validate]


We note that 177 entries for Age are missing. Instead of deleting these rows, we fill the missing ages with the median age. We choose the median over the mean because the data contains both babies (whose age is a fraction less than one) and very old passengers, and such extremes can skew the mean.

In the case of the port of Embarkation, we see that only 2 values are null. We will use the mode of this column to fill in these values.

In [71]:
for dataset in data_cleaner:
    # Assign the filled Series back to the column. Calling
    # fillna(inplace=True) on dataset['Age'] is chained assignment: it is
    # deprecated in pandas 2.x and leaves the frame unchanged under
    # Copy-on-Write.
    # Each frame is filled with its own median age / most frequent port.
    dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
    dataset['Embarked'] = dataset['Embarked'].fillna(dataset['Embarked'].mode()[0])
    # Dropped in place so the objects held in data_cleaner stay the same ones.
    dataset.drop(columns=['Cabin', 'Ticket', 'Fare', 'Name'], inplace=True)

In [72]:
for dataset in data_cleaner:
    # Combine the two relative counts into a single feature; the +1
    # presumably counts the passenger themself.
    dataset['Family'] = 1 + dataset['SibSp'] + dataset['Parch']
    # The source columns are no longer needed once Family exists.
    dataset.drop(columns=['SibSp', 'Parch'], inplace=True)
# NOTE(review): the recorded outputs of the later split/validation cells show
# this column as 'FamilySize'; re-run the notebook top to bottom so the
# outputs match the code.

In [73]:
# Preview the first rows of the training frame (first entry of data_cleaner).
data_cleaner[0].head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Embarked,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,male,22.0,S,2
2,1,1,female,38.0,C,2
3,1,3,female,26.0,S,1
4,1,1,female,35.0,S,2
5,0,3,male,35.0,S,1


In [74]:
for dataset in data_cleaner:
    dataset['Sex'].loc[dataset['Sex'] == 'male'] = 0
    dataset['Sex'].loc[dataset['Sex'] == 'female'] = 1
    dataset['Embarked'].loc[dataset['Embarked'] == 'C'] = 0
    dataset['Embarked'].loc[dataset['Embarked'] == 'Q'] = 1
    dataset['Embarked'].loc[dataset['Embarked'] == 'S'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [75]:
# Preview the training frame again to check the encoded Sex / Embarked values.
data_cleaner[0].head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,Embarked,Family
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,0,22.0,2,2
2,1,1,1,38.0,0,2
3,1,3,1,26.0,2,1
4,1,1,1,35.0,2,2
5,0,3,0,35.0,2,1


In [79]:
# Save the cleaned training frame to CSV; the PassengerId index is written too.
data_cleaner[0].to_csv('TitanicDataSet.csv')

## Splitting up the data

In [38]:
# Unpack the two cleaned frames: training data first, validation data second.
data_clean, data_validate = data_cleaner
# Features are every column except the target; labels are the target column.
data_features = data_clean.drop(columns='Survived')
data_labels = data_clean['Survived']

Splitting up the labels and features into training and testing sets.

In [39]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
features_train, features_test, labels_train, labels_test = train_test_split(
    data_features,
    data_labels,
    test_size=0.2,
    random_state=42,
)

Taking a look at our testing, training and validating data

##### Training Data

In [40]:
# First rows of the training features.
features_train.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
332,1,0,45.5,2,1
734,2,0,23.0,2,1
383,3,0,32.0,2,1
705,3,0,26.0,2,2
814,3,1,6.0,2,7


In [41]:
# First rows of the training labels (Survived).
labels_train.head()

PassengerId
332    0
734    0
383    0
705    0
814    0
Name: Survived, dtype: int64

##### Testing Data

In [42]:
# First rows of the held-out test features.
features_test.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
710,3,0,28.0,0,3
440,2,0,31.0,2,1
841,3,0,20.0,2,1
721,2,1,6.0,2,2
40,3,1,14.0,0,2


In [43]:
# First rows of the held-out test labels (Survived).
labels_test.head()

PassengerId
710    1
440    0
841    0
721    1
40     1
Name: Survived, dtype: int64

##### Validation Data

In [44]:
# First rows of the cleaned validation frame (loaded from test.csv).
data_validate.head()

Unnamed: 0_level_0,Pclass,Sex,Age,Embarked,FamilySize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
892,3,0,34.5,1,1
893,3,1,47.0,2,2
894,2,0,62.0,1,1
895,3,0,27.0,2,1
896,3,1,22.0,2,3


## Applying Naive Bayes

In [45]:
# Gaussian Naive Bayes classifier, constructed with its default settings.
nb_classifier = GaussianNB()

In [46]:
# Time the fit with perf_counter(): it is monotonic and high-resolution,
# whereas time() is wall-clock and too coarse for a call that takes only a
# few milliseconds.
t0 = perf_counter()
nb_classifier.fit(features_train, labels_train)
print("Training Time: ", perf_counter()-t0, "s.", sep='')

Training Time: 0.0049860477447509766s.


In [47]:
# Time the prediction with perf_counter(): it is monotonic and
# high-resolution, whereas time() is wall-clock and too coarse for a call
# that takes only a few milliseconds.
t1 = perf_counter()
nb_pred = nb_classifier.predict(features_test)
print("Testing Time: ", perf_counter()-t1, "s.", sep='')

Testing Time: 0.0029926300048828125s.


In [61]:
# Predicted Survived label (0 or 1) for each row of features_test.
nb_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [48]:
# Fraction of held-out test rows whose predicted label matches the true label.
nb_accuracy = accuracy_score(labels_test, nb_pred)
print("Accuracy: ", nb_accuracy)

Accuracy: 0.770949720670391.
