# Titanic: Machine Learning from Disaster

## Basic imports

In [1]:
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import pandas as pd

## Importing and checking data

In [2]:
df_train = pd.read_csv('train.csv')
print df_train.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


## Prepping and splitting the data

In [3]:
X = df_train[["Pclass","Sex","Fare","Age","SibSp"]]
y = df_train["Survived"]


## Encoding categorical values

In [4]:
X_encoded = pd.get_dummies(X)
print X_encoded.head(10)

   Pclass     Fare   Age  SibSp  Sex_female  Sex_male
0       3   7.2500  22.0      1           0         1
1       1  71.2833  38.0      1           1         0
2       3   7.9250  26.0      0           1         0
3       1  53.1000  35.0      1           1         0
4       3   8.0500  35.0      0           0         1
5       3   8.4583   NaN      0           0         1
6       1  51.8625  54.0      0           0         1
7       3  21.0750   2.0      3           0         1
8       3  11.1333  27.0      0           1         0
9       2  30.0708  14.0      1           1         0


## Filling in missing values

In [5]:
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean',axis=0)
imp.fit(X_encoded)
X_imputed = imp.transform(X_encoded)
print X_imputed[:,2]

[ 22.          38.          26.          35.          35.          29.69911765
  54.           2.          27.          14.           4.          58.          20.
  39.          14.          55.           2.          29.69911765  31.
  29.69911765  35.          34.          15.          28.           8.          38.
  29.69911765  19.          29.69911765  29.69911765  40.          29.69911765
  29.69911765  66.          28.          42.          29.69911765  21.          18.
  14.          40.          27.          29.69911765   3.          19.
  29.69911765  29.69911765  29.69911765  29.69911765  18.           7.          21.
  49.          29.          65.          29.69911765  21.          28.5
   5.          11.          22.          38.          45.           4.
  29.69911765  29.69911765  29.          19.          17.          26.          32.
  16.          21.          26.          32.          25.          29.69911765
  29.69911765   0.83        30.          22.          29. 

## Constructing a basic nb classifier

In [6]:
X_train,X_test,y_train,y_test = train_test_split(X_imputed,y,random_state = 11, test_size= 0.2)
nb = GaussianNB()
nb.fit(X_train,y_train)
y_predict = nb.predict(X_test)
print 'Basic NB accuracy with two features: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Basic NB accuracy with two features: 84.9%


## Constructing a basic SVM classifier

In [7]:
sv = SVC()
sv.fit(X_train,y_train)
y_predict = sv.predict(X_test)
print 'Basic SVM accuracy with two features: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Basic SVM accuracy with two features: 80.4%


## Constructing a basic DecisionTree classifier

In [8]:
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_predict = dt.predict(X_test)
print 'Basic DT accuracy: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Basic DT accuracy: 79.3%


## Constructing a basic Neural Network

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled=  scaler.transform(X_test)
print X_test_scaled
nn = MLPClassifier(solver='adam', alpha=0.0001,hidden_layer_sizes=(9, 600), random_state=11,max_iter = 10000)
nn.fit(X_train_scaled, y_train) 
y_predict = nn.predict(X_test_scaled)
print 'Basic NN accuracy: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

[[ 0.8249189  -0.32990743  0.0057571   0.41160985  1.34693328 -1.34693328]
 [ 0.8249189  -0.47522138 -0.19943268 -0.46846887 -0.74242727  0.74242727]
 [ 0.8249189  -0.4933761   0.0057571  -0.46846887 -0.74242727  0.74242727]
 ..., 
 [ 0.8249189  -0.49020117 -0.57953797 -0.46846887 -0.74242727  0.74242727]
 [-1.55780832 -0.12573522  0.0057571  -0.46846887 -0.74242727  0.74242727]
 [-1.55780832  1.13503694 -0.35147479  0.41160985 -0.74242727  0.74242727]]
Basic NN accuracy: 87.2%


## Constructing a basic RandomForest classifier

In [10]:
rf = RandomForestClassifier(bootstrap=True,random_state=1)
rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)
print 'Basic RF accuracy helene: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Basic RF accuracy helene: 84.4%


## Building on best solution so far: RF and NN

### Hyperparameter tuning on RF

In [55]:
from sklearn.model_selection import GridSearchCV
fulllist =  range(1,20)
unevensList = [x for x in fulllist if x % 2 != 0]
parameters = {'n_estimators':range(2,50),'min_samples_split':range(2,30)}
rf = RandomForestClassifier(bootstrap=True,warm_start=True)
cv = GridSearchCV(rf, parameters,verbose = 1)
cv.fit(X_train,y_train)
y_predict = cv.predict(X_test)
print 'Optimised RF accuracy: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Fitting 3 folds for each of 1344 candidates, totalling 4032 fits
Optimised RF accuracy: 87.2%


[Parallel(n_jobs=1)]: Done 4032 out of 4032 | elapsed:  5.7min finished


### Hyperparameter tuning on Neural network

In [88]:
from sklearn.model_selection import GridSearchCV

parameters = {'solver':['lbfgs','adam'],'alpha':[0.000001,0.00001,0.0001,0.001,0.01,0.1],'learning_rate':['adaptive','constant'],'warm_start':[True,False]}
nn = MLPClassifier(hidden_layer_sizes=(10, 5), random_state=3,max_iter = 10000)

cv = GridSearchCV(nn, parameters,verbose = 2)
cv.fit(X_train_scaled,y_train)
y_predict = cv.predict(X_test_scaled)

print 'Optimised NN accuracy: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[CV] warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 
[CV]  warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   1.2s
[CV] warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   0.7s
[CV] warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 
[CV]  warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   0.5s
[CV] warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 
[CV]  warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   1.1s
[CV] warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 
[CV]  warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   0.7s
[CV] warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs 
[CV]  warm_start=False, alpha=1e-06, learning_rate=adaptive, solver=lbfgs, total=   0.5s
[CV] warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=adam 
[CV]  warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=adam, total=   0.1s
[CV] warm_start=True, alpha=1e-06, learning_rate=adaptive, solver=adam 
[CV]  warm_start=True, alpha=1e

[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:  1.2min finished


Optimised NN accuracy: 86.0%


In [56]:
print cv.best_params_

{'min_samples_split': 12, 'n_estimators': 10}


### Varying the NN size with new best params

In [11]:
from sklearn.model_selection import GridSearchCV

rangespace = []
for l1 in range(1,20):
    for l2 in range(1,15):
        for l3 in range(1,10):
            rangespace.append((l1,l2,l3))
            

parameters = {'hidden_layer_sizes':rangespace}
nn = MLPClassifier(random_state=11,max_iter = 10000,warm_start=True,solver='adam',learning_rate='adaptive',alpha=0.01)

cv = GridSearchCV(nn, parameters,verbose = 2)
cv.fit(X_train_scaled,y_train)
y_predict = cv.predict(X_test_scaled)

print 'Optimised NN accuracy: ' + str(round(accuracy_score(y_test,y_predict),3) *100) + '%'

Fitting 3 folds for each of 2394 candidates, totalling 7182 fits
[CV] hidden_layer_sizes=(1, 1, 1) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 1), total=   0.2s
[CV] hidden_layer_sizes=(1, 1, 1) ....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] ..................... hidden_layer_sizes=(1, 1, 1), total=   0.2s
[CV] hidden_layer_sizes=(1, 1, 1) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 1), total=   0.2s
[CV] hidden_layer_sizes=(1, 1, 2) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 2), total=   0.0s
[CV] hidden_layer_sizes=(1, 1, 2) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 2), total=   0.0s
[CV] hidden_layer_sizes=(1, 1, 2) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 2), total=   0.0s
[CV] hidden_layer_sizes=(1, 1, 3) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 3), total=   0.0s
[CV] hidden_layer_sizes=(1, 1, 3) ....................................
[CV] ..................... hidden_layer_sizes=(1, 1, 3), total=   0.0s
[CV] hidden_layer_sizes=(1, 1, 3) ....................................
[CV] .



[CV] .................... hidden_layer_sizes=(2, 10, 1), total=   0.5s
[CV] hidden_layer_sizes=(2, 10, 1) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 1), total=   0.7s
[CV] hidden_layer_sizes=(2, 10, 2) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 2), total=   0.6s
[CV] hidden_layer_sizes=(2, 10, 2) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 2), total=   0.6s
[CV] hidden_layer_sizes=(2, 10, 2) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 2), total=   0.8s
[CV] hidden_layer_sizes=(2, 10, 3) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 3), total=   0.1s
[CV] hidden_layer_sizes=(2, 10, 3) ...................................
[CV] .................... hidden_layer_sizes=(2, 10, 3), total=   0.1s
[CV] hidden_layer_sizes=(2, 10, 3) ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 7182 out of 7182 | elapsed: 91.8min finished


Optimised NN accuracy: 87.7%


## Using best classifier on test set

### Import and treat test data

In [43]:
df_test = pd.read_csv('test.csv')
X_test_final = df_test[["Pclass","Sex","Fare","Age","SibSp"]]
X_test_final = pd.get_dummies(X_test_final)
imp = Imputer(missing_values='NaN', strategy='mean',axis=0)
imp.fit(X_test_final)
X_test_final = imp.transform(X_test_final)

### Treat full training data

In [44]:
X_train_final = imp.transform(X_imputed)
scaler = StandardScaler()
scaler.fit(X_train_final)
X_train_final = scaler.transform(X_train_final)
X_test_final = scaler.transform(X_test_final)

### Construct, train and predict

In [45]:
nn = MLPClassifier(random_state=11,max_iter = 10000,warm_start=True,solver='adam',learning_rate='adaptive',alpha=0.01,hidden_layer_sizes=(5, 10,5))
nn.fit(X_train_final, y) 
y_predict_final = nn.predict(X_test_final)

In [54]:
df_entry =  pd.Series.to_frame(df_test["PassengerId"])
df_entry["Survived"] = pd.Series(y_predict_final)
df_entry.to_csv("prediction_1_nn.csv",sep=',')
print df_entry.columns

Index([u'PassengerId', u'Survived'], dtype='object')


## Second attempt: using RF classisifier

In [59]:
df_test = pd.read_csv('test.csv')
X_test_final = df_test[["Pclass","Sex","Fare","Age","SibSp"]]
X_test_final = pd.get_dummies(X_test_final)
imp = Imputer(missing_values='NaN', strategy='mean',axis=0)
imp.fit(X_test_final)
X_test_final = imp.transform(X_test_final)
X_train_final = imp.transform(X_imputed)

rf = RandomForestClassifier(bootstrap=True,warm_start=True,n_estimators=20,min_samples_split=2)
rf.fit(X_train_final, y) 
y_predict_final = rf.predict(X_test_final)

df_entry =  pd.Series.to_frame(df_test["PassengerId"])
df_entry["Survived"] = pd.Series(y_predict_final)
df_entry.to_csv("prediction_3_rf.csv",sep=',')
print df_entry.columns

Index([u'PassengerId', u'Survived'], dtype='object')
