In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
PATH = 'titanic/'

In [3]:
df = pd.read_csv(f'{PATH}train.csv', na_values= ['NaN', ' '])

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.isna().sum()/len(df)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
dtype: float64

In [6]:
newcabin01 = df.Cabin.isna()
df = df.join(newcabin01, rsuffix='01')

In [7]:
df.Sex.mode()

0    male
dtype: object

In [8]:
df.Sex.fillna(df.Sex.mode(), inplace=True)

In [9]:
df.Sex = df.Sex.astype('category')
df['Sex01'] = df.Sex.cat.codes
df.Sex.cat.categories

Index(['female', 'male'], dtype='object')

Sex = 0 = Female

Sex = 1 = Male

In [10]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Cabin01,Sex01
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,True,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,False,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,True,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,False,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,True,1


In [11]:
df.isna().sum()/len(df)

PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.198653
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
Cabin          0.771044
Embarked       0.002245
Cabin01        0.000000
Sex01          0.000000
dtype: float64

In [12]:
df.Embarked.mode()

0    S
dtype: object

In [13]:
df.Embarked.fillna('S', inplace=True)

In [14]:
df.drop(['Cabin','Sex'],axis=1, inplace=True)

In [15]:
df[df.Age.isna()].groupby('Survived').count()

Unnamed: 0_level_0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Embarked,Cabin01,Sex01
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,125,125,125,0,125,125,125,125,125,125,125
1,52,52,52,0,52,52,52,52,52,52,52


In [16]:
df.Age.fillna(df.Age.median(), inplace = True)

In [17]:
df.Embarked = df.Embarked.astype('category')
df['Emb01'] = df.Embarked.cat.codes
df.Embarked.cat.categories

Index(['C', 'Q', 'S'], dtype='object')

In [18]:
df.SibSp.value_counts()

0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

In [19]:
df.Parch.value_counts()

0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

In [23]:
df['Ticket_len'] = df.Ticket.apply(lambda x : len(x))

In [24]:
df_fin = df.drop(['Name', 'Ticket', 'PassengerId', 'Embarked'], axis=1)

In [25]:
df_fin.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Cabin01,Sex01,Emb01,Ticket_len
0,0,3,22.0,1,0,7.25,True,1,2,9
1,1,1,38.0,1,0,71.2833,False,0,0,8
2,1,3,26.0,0,0,7.925,True,0,2,16
3,1,1,35.0,1,0,53.1,False,0,2,6
4,0,3,35.0,0,0,8.05,True,1,2,6


In [26]:
x = df_fin.iloc[:,1:]
y = df_fin.Survived

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
x_train, x_val, y_train, y_val = train_test_split(x, y)

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
m = RandomForestClassifier(n_estimators=200, min_samples_leaf=2, max_features='auto')
m.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
print('Train score: ', m.score(x_train, y_train))
print('Test score: ', m.score(x_val, y_val))

Train score:  0.9251497005988024
Test score:  0.8475336322869955


In [41]:
df_test = pd.read_csv(f'{PATH}test.csv', na_values= ['NaN', ' '])

In [42]:
df_test.isna().sum()/len(df_test)

PassengerId    0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
Age            0.205742
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.002392
Cabin          0.782297
Embarked       0.000000
dtype: float64

In [43]:
#Create Cabin column with 1 = Cabin, 0 = Not a cabin
newcabin01 = df_test.Cabin.isna()
df_test = df_test.join(newcabin01, rsuffix='01')

#Impute missing Age with its median
df_test.Age.fillna(df_test.Age.median(), inplace = True)

#Impute missing Fare with its median
df_test.Fare.fillna(df_test.Fare.median(), inplace = True)

#Convert Sex into category and create Sex01
df_test.Sex = df_test.Sex.astype('category')
df_test['Sex01'] = df_test.Sex.cat.codes
df_test.Sex.cat.categories

#Convert Embarked into category and create new embarked with only numeric
df_test.Embarked = df_test.Embarked.astype('category')
df_test['Emb01'] = df_test.Embarked.cat.codes
df_test.Embarked.cat.categories

#Create new feature call Ticket_len which contain length of ticket name
df_test['Ticket_len'] = df_test.Ticket.apply(lambda x : len(x))

In [44]:
df_test_fin = df_test.drop(['PassengerId','Name','Cabin','Ticket', 'Embarked','Sex'],axis=1)

In [45]:
df_test_fin.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Cabin01,Sex01,Emb01,Ticket_len
0,3,34.5,0,0,7.8292,True,1,1,6
1,3,47.0,1,0,7.0,True,0,2,6
2,2,62.0,0,0,9.6875,True,1,1,6
3,3,27.0,0,0,8.6625,True,1,2,6
4,3,22.0,1,1,12.2875,True,0,2,7


In [57]:
pred = m.predict(df_test_fin)

In [58]:
pred.tofile('submit5.csv', sep=',')

In [73]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

In [71]:
m2 = MLPClassifier(hidden_layer_sizes=(100,5))
m2.fit(x_train,y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [72]:
print('Train score: ', m2.score(x_train, y_train))
print('Test score: ', m2.score(x_val, y_val))

Train score:  0.75
Test score:  0.6995515695067265


In [166]:
pred = m2.predict(df_test_fin)

In [167]:
pred.tofile('submit2.csv', sep=',')