# Section 1-1 - Basic Neural Network - Titanic

In the previous section, we simply iterated through randomly-generated matrices and chose the best-performing one. We build on this approach by reducing loss in a systematic way via stochastic gradient descent. In particular, we'll be using TensorFlow, an open source library developed by Google, and Keras, a high-level wrapper on top of TensorFlow.

In [43]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from time import time

np.random.seed(1337)

titanic = pd.read_csv('train.csv')

titanic_test = pd.read_csv('test.csv')

In [44]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [45]:
titanic.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [46]:
titanic_test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

In [47]:
# updating age data as median for each pclass
# FOR TRAIN
for pclass in range(1,4):
    selector = (titanic['Age'].isnull()) & (titanic['Pclass'] == pclass)
    good_age = (titanic['Age'].notnull()) & (titanic['Pclass'] == pclass)
    titanic[selector] = titanic[selector].fillna(titanic[good_age].median())

In [48]:
# updating age data as median for each pclass
# FOR TEST
for pclass in range(1,4):
    selector = (titanic_test['Age'].isnull()) & (titanic_test['Pclass'] == pclass)
    good_age = (titanic_test['Age'].notnull()) & (titanic_test['Pclass'] == pclass)
    titanic_test[selector] = titanic_test[selector].fillna(titanic_test[good_age].median())

In [49]:
# Replace all the occurences of male with the number 0
titanic.loc[titanic["Sex"] == "male", "Sex"] = 0
titanic_test.loc[titanic_test["Sex"] == "male", "Sex"] = 0

In [50]:
# Replace all the occurences of female with the number 1
titanic.loc[titanic["Sex"] == "female", "Sex"] = 1
titanic_test.loc[titanic_test["Sex"] == "female", "Sex"] = 1

In [51]:
# FOR TEST
# add missing fare data
selector = (titanic_test['Fare'].isnull())
pclass = titanic_test[selector]['Pclass']
print(pclass.values[0])
titanic_test[selector]['Fare'] = titanic_test[titanic_test["Pclass"] == pclass.values[0]]['Fare'].median()


3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [52]:
from collections import Counter

# FOR TRAIN
for pclass in range(1,4):
    selector = (titanic['Embarked'].isnull()) & (titanic['Pclass'] == pclass)
    embark = (titanic['Embarked'].notnull()) & (titanic['Pclass'] == pclass)
    titanic[selector] = titanic[selector].fillna(Counter(titanic[embark]["Embarked"]).most_common()[0][1])

In [53]:
# FOR TEST
for pclass in range(1,4):
    selector = (titanic_test['Embarked'].isnull()) & (titanic_test['Pclass'] == pclass)
    embark = (titanic_test['Embarked'].notnull()) & (titanic_test['Pclass'] == pclass)
    titanic_test[selector] = titanic_test[selector].fillna(Counter(titanic_test[embark]["Embarked"]).most_common()[0][1])

In [54]:
titanic.loc[titanic["Embarked"] == "S", "Embarked"] = 0
titanic.loc[titanic["Embarked"] == "C", "Embarked"] = 1
titanic.loc[titanic["Embarked"] == "Q", "Embarked"] = 2

titanic_test.loc[titanic_test["Embarked"] == "S", "Embarked"] = 0
titanic_test.loc[titanic_test["Embarked"] == "C", "Embarked"] = 1
titanic_test.loc[titanic_test["Embarked"] == "Q", "Embarked"] = 2

In [55]:
# Generating a familysize column
titanic["FamilySize"] = titanic["SibSp"] + titanic["Parch"]
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"]

# The .apply method generates a new series
titanic["NameLength"] = titanic["Name"].apply(lambda x: len(x))
titanic_test["NameLength"] = titanic_test["Name"].apply(lambda x: len(x))

In [56]:
import re

# A function to get the title from a name
def get_title(name):
    # Use a regular expression to search for a title  
    # Titles always consist of capital and lowercase letters, and end with a period
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it
    if title_search:
        return title_search.group(1)
    return ""

# Get all of the titles, and print how often each one occurs
titles = titanic["Name"].apply(get_title)
print(pd.value_counts(titles))

# Map each title to an integer  
# Some titles are very rare, so they're compressed into the same codes as other titles
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Major": 7, "Col": 7, "Mlle": 8, "Mme": 8, "Don": 9, "Dona": 9, "Lady": 10, "Countess": 10, "Jonkheer": 10, "Sir": 9, "Capt": 7, "Ms": 2}

test_titles = titanic_test["Name"].apply(get_title)

for k,v in title_mapping.items():
    titles[titles == k] = v
    test_titles[test_titles == k] = v

# Verify that we converted everything
print(pd.value_counts(titles))

# Add in the title column
titanic["Title"] = titles
titanic_test["Title"] = test_titles

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Major         2
Col           2
Jonkheer      1
Mme           1
Don           1
Capt          1
Ms            1
Lady          1
Countess      1
Sir           1
Name: Name, dtype: int64
1     517
2     183
3     125
4      40
5       7
6       6
7       5
10      3
8       3
9       2
Name: Name, dtype: int64


In [57]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,NameLength,Title
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,0,1,23,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,1,1,51,3
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,22,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,0,1,44,3
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,0,0,24,1
5,6,0,3,"Moran, Mr. James",0,24.0,0,0,330877,8.4583,,2,0,16,1
6,7,0,1,"McCarthy, Mr. Timothy J",0,54.0,0,0,17463,51.8625,E46,0,0,23,1
7,8,0,3,"Palsson, Master. Gosta Leonard",0,2.0,3,1,349909,21.0750,,0,4,30,4
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",1,27.0,0,2,347742,11.1333,,0,2,49,3
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",1,14.0,1,0,237736,30.0708,,1,1,35,3


In [58]:
import operator

# A dictionary mapping family name to ID
family_id_mapping = {}

# A function to get the ID for a particular row
def get_family_id(row):
    # Find the last name by splitting on a comma
    last_name = row["Name"].split(",")[0]
    # Create the family ID
    family_id = "{0}{1}".format(last_name, row["FamilySize"])
    # Look up the ID in the mapping
    if family_id not in family_id_mapping:
        if len(family_id_mapping) == 0:
            current_id = 1
        else:
            # Get the maximum ID from the mapping, and add 1 to it if we don't have an ID
            current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)
        family_id_mapping[family_id] = current_id
    return family_id_mapping[family_id]

# Get the family IDs with the apply method
family_ids = titanic.apply(get_family_id, axis=1)
test_family_ids = titanic_test.apply(get_family_id, axis=1)

# There are a lot of family IDs, so we'll compress all of the families with less than three members into one code
family_ids[titanic["FamilySize"] < 3] = -1
test_family_ids[titanic_test["FamilySize"] < 3] = -1

# Print the count of each unique ID
print(pd.value_counts(family_ids))
print(pd.value_counts(test_family_ids))

titanic["FamilyId"] = family_ids
titanic_test["FamilyId"] = test_family_ids

-1      800
 14       8
 149      7
 63       6
 50       6
 59       6
 17       5
 384      4
 27       4
 25       4
 162      4
 8        4
 84       4
 340      4
 43       3
 269      3
 58       3
 633      2
 167      2
 280      2
 510      2
 90       2
 83       1
 625      1
 376      1
 449      1
 498      1
 588      1
dtype: int64
-1      384
 149      4
 25       3
 280      3
 27       2
 59       2
 633      2
 510      2
 167      2
 90       2
 162      1
 759      1
 449      1
 84       1
 269      1
 58       1
 43       1
 794      1
 918      1
 17       1
 14       1
 8        1
dtype: int64


In [59]:
scaler = StandardScaler()
features = ["Pclass",
            "Sex",
            "Age",
            "SibSp",
            "Parch",
            "Fare",
            "Embarked", 
            "FamilySize", 
            "NameLength",
            "Title",
            "FamilyId"]

train = titanic#titanic.iloc[:712, :]

print(titanic.head(5))
X_train = scaler.fit_transform(train[features].values)
y_train = train['Survived'].values
y_train_onehot = pd.get_dummies(train['Survived']).values

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name  Sex   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris    0  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...    1  38.0      1      0   
2                             Heikkinen, Miss. Laina    1  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)    1  35.0      1      0   
4                           Allen, Mr. William Henry    0  35.0      0      0   

             Ticket     Fare Cabin Embarked  FamilySize  NameLength Title  \
0         A/5 21171   7.2500   NaN        0           1          23     1   
1          PC 17599  71.2833   C85        1           1          51     3   
2  STON/O2. 3101282   7.9250   NaN        0           0          



In [60]:
#normalize
from sklearn.preprocessing import normalize

titanic = titanic.drop("Cabin", axis=1)
titanic = titanic.drop("Ticket", axis=1)
titanic = titanic.drop("Name", axis=1)

titanic_test = titanic_test.drop("Cabin", axis=1)
titanic_test = titanic_test.drop("Ticket", axis=1)
titanic_test = titanic_test.drop("Name", axis=1)

In [61]:
titanic_test.count()
titanic_test = titanic_test.fillna(titanic_test["Fare"].median())

In [62]:
titanic_test.count()

PassengerId    418
Pclass         418
Sex            418
Age            418
SibSp          418
Parch          418
Fare           418
Embarked       418
FamilySize     418
NameLength     418
Title          418
FamilyId       418
dtype: int64

In [63]:
# def normalize(df):
#     result = df.copy()
#     for feature_name in df.columns[1:]:
#         max_value = df[feature_name].max()
#         min_value = df[feature_name].min()
#         result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
#     return result

# titanic = normalize(titanic)
# titanic_test = normalize(titanic_test)


In [64]:
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,NameLength,Title,FamilyId
0,1,0,3,0,22.0,1,0,7.2500,0,1,23,1,-1
1,2,1,1,1,38.0,1,0,71.2833,1,1,51,3,-1
2,3,1,3,1,26.0,0,0,7.9250,0,0,22,2,-1
3,4,1,1,1,35.0,1,0,53.1000,0,1,44,3,-1
4,5,0,3,0,35.0,0,0,8.0500,0,0,24,1,-1
5,6,0,3,0,24.0,0,0,8.4583,2,0,16,1,-1
6,7,0,1,0,54.0,0,0,51.8625,0,0,23,1,-1
7,8,0,3,0,2.0,3,1,21.0750,0,4,30,4,8
8,9,1,3,1,27.0,0,2,11.1333,0,2,49,3,-1
9,10,1,2,1,14.0,1,0,30.0708,1,1,35,3,-1


In [65]:
titanic_test.head(5)
#pd.get_dummies(titanic_test)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,NameLength,Title,FamilyId
0,892,3,0,34.5,0,0,7.8292,2,0,16,1,-1
1,893,3,1,47.0,1,0,7.0,0,1,32,3,-1
2,894,2,0,62.0,0,0,9.6875,2,0,25,1,-1
3,895,3,0,27.0,0,0,8.6625,0,0,16,1,-1
4,896,3,1,22.0,1,1,12.2875,0,2,44,3,-1


In [66]:
df_test = titanic.iloc[712:, :]

X_test = scaler.transform(df_test[features].values)
y_test = df_test['Survived'].values



## Benchmark

In [67]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=0, verbose=3)
model = model.fit(X_train, train['Survived'].values)

y_prediction = model.predict(X_test)
print("\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10

accuracy 0.988826815642


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


## 1-layer Neural Network

Instead of generating a linear stack of layers with Numpy, we'll be implementing our model using Keras. We initialize our model, add a layer that inputs vectors of length 4 and outputs vectors of length 2, and finally add a softmax layer. We configure the learning process in the compilation step by specifying the optimizer, loss function and performance metric.

Stochastic gradient descent acts by changing the weights gradually in the 'direction' that decreases the average loss. In other words, a particular weight would be increased if acts to decrease loss, or the weight decreased if it acts to increase loss. TensorFlow does the heavy-lifting by efficiently handling these numerical computations under the hood. A simple example of stochastic gradient descent is illustrated in the Appendix.

In [68]:
from keras.models import Sequential
from keras.layers import Dense, Activation

start = time()

model = Sequential()
model.add(Dense(input_dim=len(features), units=2))
model.add(Activation("softmax"))


model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(X_train, y_train_onehot)

print('\ntime taken %s seconds' % str(time() - start))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

time taken 2.089813709259033 seconds


In [69]:
y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

 32/179 [====>.........................] - ETA: 1s

accuracy 0.793296089385


We notice that the loss reduces systematically as the model 'learns' from the data. The rate of loss reduction, however, seems to indicate that loss could be further reduced.

## 2-layer Neural Network

In [70]:
start = time()

model = Sequential()
model.add(Dense(input_dim=len(features), units=100))
model.add(Dense(units=2))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(X_train, y_train_onehot, epochs=10)

print('\ntime taken %s seconds' % str(time() - start))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

time taken 1.5181419849395752 seconds


In [71]:
y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

 32/179 [====>.........................] - ETA: 1s

accuracy 0.826815642458


The loss reduction 'flattens out' more compared to the 1-layer example, and the accuracy improves to 81%.

## 3-layer Neural Network

In [72]:
start = time()

model = Sequential()
model.add(Dense(input_dim=len(features), units=100))
model.add(Dense(units=100))
model.add(Dense(units=2))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(X_train, y_train_onehot, epochs=20)

print('\ntime taken %s seconds' % str(time() - start))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

time taken 4.060379266738892 seconds


In [73]:
y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

 32/179 [====>.........................] - ETA: 1s

accuracy 0.832402234637


# 5 layer model

In [74]:
start = time()

model = Sequential()
model.add(Dense(input_dim=len(features), units=512))
model.add(Dense(units=128))
model.add(Activation("sigmoid"))
model.add(Dense(units=32))
model.add(Dense(units=2))
model.add(Activation("softmax"))

model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

model.fit(X_train, y_train_onehot, epochs=20)

print('\ntime taken %s seconds' % str(time() - start))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

time taken 4.80978798866272 seconds


In [75]:
y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

 32/179 [====>.........................] - ETA: 1s

accuracy 0.821229050279


# LSTM MODEL

In [76]:
y_prediction = model.predict_classes(X_test)
print("\n\naccuracy", np.sum(y_prediction == y_test) / float(len(y_test)))

 32/179 [====>.........................] - ETA: 0s

accuracy 0.821229050279


In [77]:
X_test

array([[ -1.56610693e+00,  -7.37695132e-01,   1.43034315e+00, ...,
         -3.19650794e-01,  -6.21284528e-01,  -2.18080848e-01],
       [  8.27377244e-01,  -7.37695132e-01,  -5.01684984e-03, ...,
         -1.04049842e-01,  -6.21284528e-01,  -2.18080848e-01],
       [ -3.69364841e-01,  -7.37695132e-01,   1.73252420e+00, ...,
         -6.43052222e-01,  -6.21284528e-01,  -2.18080848e-01],
       ..., 
       [  8.27377244e-01,   1.35557354e+00,  -3.82743164e-01, ...,
          1.40515682e+00,   1.63913365e-01,   8.85888367e+00],
       [ -1.56610693e+00,  -7.37695132e-01,  -2.31652639e-01, ...,
         -6.43052222e-01,  -6.21284528e-01,  -2.18080848e-01],
       [  8.27377244e-01,  -7.37695132e-01,   2.21618939e-01, ...,
         -8.58653174e-01,  -6.21284528e-01,  -2.18080848e-01]])

In [78]:
titanic_test[features]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,NameLength,Title,FamilyId
0,3,0,34.5,0,0,7.8292,2,0,16,1,-1
1,3,1,47.0,1,0,7.0000,0,1,32,3,-1
2,2,0,62.0,0,0,9.6875,2,0,25,1,-1
3,3,0,27.0,0,0,8.6625,0,0,16,1,-1
4,3,1,22.0,1,1,12.2875,0,2,44,3,-1
5,3,0,14.0,0,0,9.2250,0,0,26,1,-1
6,3,1,30.0,0,0,7.6292,2,0,20,2,-1
7,2,0,26.0,1,1,29.0000,0,2,28,1,-1
8,3,1,18.0,0,0,7.2292,1,0,41,3,-1
9,3,0,21.0,2,0,24.1500,0,2,23,1,-1


In [79]:
titanic_test = titanic_test.fillna(0)

In [80]:
titanic_test.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,NameLength,Title,FamilyId
0,892,3,0,34.5,0,0,7.8292,2,0,16,1,-1
1,893,3,1,47.0,1,0,7.0,0,1,32,3,-1
2,894,2,0,62.0,0,0,9.6875,2,0,25,1,-1
3,895,3,0,27.0,0,0,8.6625,0,0,16,1,-1
4,896,3,1,22.0,1,1,12.2875,0,2,44,3,-1


In [81]:
X_test = scaler.transform(titanic_test[features].values)

In [82]:
titanic_test[features]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,FamilySize,NameLength,Title,FamilyId
0,3,0,34.5,0,0,7.8292,2,0,16,1,-1
1,3,1,47.0,1,0,7.0000,0,1,32,3,-1
2,2,0,62.0,0,0,9.6875,2,0,25,1,-1
3,3,0,27.0,0,0,8.6625,0,0,16,1,-1
4,3,1,22.0,1,1,12.2875,0,2,44,3,-1
5,3,0,14.0,0,0,9.2250,0,0,26,1,-1
6,3,1,30.0,0,0,7.6292,2,0,20,2,-1
7,2,0,26.0,1,1,29.0000,0,2,28,1,-1
8,3,1,18.0,0,0,7.2292,1,0,41,3,-1
9,3,0,21.0,2,0,24.1500,0,2,23,1,-1


In [83]:
predictions = model.predict_classes(X_test.astype(float))#[:,1]
    
predictions[predictions <= .5] = 0
predictions[predictions > .5] = 1
predictions = predictions.astype(int)
submission = pd.DataFrame({
        "PassengerId": titanic_test["PassengerId"],
        "Survived": predictions
    })



In [84]:
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0,

In [85]:
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [86]:
submission.to_csv("gender_submission.csv", index=False)

While we're able to reduce loss on the training set a little further, the best performance obtained is merely comparable to our benchmark. Since the dataset is small, there isn't as much for the model to 'learn' from (or for that matter, predict on). We'll apply techniques developed so far on a much larger dataset in the next section.