In [642]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Any results you write to the current directory are saved as output.

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [643]:
# Load the Titanic training and test sets from the Kaggle input directory.
train_df = pd.read_csv("/kaggle/input/titanic/train.csv")
test_df = pd.read_csv("/kaggle/input/titanic/test.csv")
# Keep an untouched copy of the test frame; test_df itself is modified later
# (PassengerId is dropped) while the submission still needs PassengerId.
test_df_backup = test_df.copy()

In [644]:
# Summary statistics for the numeric columns of the training set.
train_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [645]:
# Preview the first rows of the raw training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [646]:
# PassengerId, Name and Ticket are not treated as major factors for survival,
# so remove all three columns in a single call.
train_df = train_df.drop(columns=["PassengerId", "Name", "Ticket"])

In [647]:
# Count the missing values in each remaining column.
train_df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [648]:
# Cabin is missing for most rows (687 of 891, about 77%), so drop the column.
train_df = train_df.drop(columns=["Cabin"])

# Scope for improvement 1: Age is currently imputed with the plain mean.
# A possible refinement is to draw random ages within one standard deviation
# of the mean, since that region covers ~68% of the values and avoids
# introducing very small / very large ages.
age_mean = train_df["Age"].mean()
# Assign the filled column back rather than calling fillna(inplace=True) on
# train_df["Age"]: the in-place form operates on a temporary Series and, with
# pandas Copy-on-Write, silently leaves train_df unchanged.
train_df["Age"] = train_df["Age"].fillna(age_mean)

# Fill missing Embarked values with 'S', as most people embarked in Southampton.
train_df["Embarked"] = train_df["Embarked"].fillna('S')


In [649]:
# Re-check the frame after dropping Cabin and imputing Age / Embarked.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [650]:
# Encode the categorical columns as numeric / boolean features.

# Sex: male -> 1, female -> 0
Sex_dict = {"male": 1, "female": 0}
train_df["Sex"] = train_df["Sex"].map(Sex_dict)

# Embarked: one boolean indicator column per port (C, Q, S), after which the
# original text column is no longer needed.
for port in ("C", "Q", "S"):
    train_df["Embarked_" + port] = train_df["Embarked"].eq(port)
train_df = train_df.drop(columns=["Embarked"])

In [651]:
# Confirm Sex is numeric and the Embarked_* indicators replaced Embarked.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,7.25,False,False,True
1,1,1,0,38.0,1,0,71.2833,True,False,False
2,1,3,0,26.0,0,0,7.925,False,False,True
3,1,1,0,35.0,1,0,53.1,False,False,True
4,0,3,1,35.0,0,0,8.05,False,False,True


In [652]:
# Add boolean age-band indicators alongside the numeric Age column.
# Bands: Child <= 11 < Young <= 20 < Adult <= 35 < Mature <= 50 < Old
age = train_df["Age"]

train_df["Child"] = age.le(11)
train_df["Young"] = age.gt(11) & age.le(20)
train_df["Adult"] = age.gt(20) & age.le(35)
train_df["Mature"] = age.gt(35) & age.le(50)
train_df["Old"] = age.gt(50)

train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Child,Young,Adult,Mature,Old
0,0,3,1,22.0,1,0,7.25,False,False,True,False,False,True,False,False
1,1,1,0,38.0,1,0,71.2833,True,False,False,False,False,False,True,False
2,1,3,0,26.0,0,0,7.925,False,False,True,False,False,True,False,False
3,1,1,0,35.0,1,0,53.1,False,False,True,False,False,True,False,False
4,0,3,1,35.0,0,0,8.05,False,False,True,False,False,True,False,False


In [653]:
# Standardize Age and Fare to zero mean / unit standard deviation.
# The means and standard deviations are kept in module-level names so the
# same constants can be applied to the test set.

age_mean, age_stddev = train_df["Age"].mean(), train_df["Age"].std()
train_df["Age"] = train_df["Age"].sub(age_mean).div(age_stddev)

fare_mean, fare_stddev = train_df["Fare"].mean(), train_df["Fare"].std()
train_df["Fare"] = train_df["Fare"].sub(fare_mean).div(fare_stddev)

In [654]:
# Inspect the standardized Age and Fare columns.
train_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Child,Young,Adult,Mature,Old
0,0,3,1,-0.592148,1,0,-0.502163,False,False,True,False,False,True,False,False
1,1,1,0,0.63843,1,0,0.786404,True,False,False,False,False,False,True,False
2,1,3,0,-0.284503,0,0,-0.48858,False,False,True,False,False,True,False,False
3,1,1,0,0.407697,1,0,0.420494,False,False,True,False,False,True,False,False
4,0,3,1,0.407697,0,0,-0.486064,False,False,True,False,False,True,False,False


In [655]:
# Apply the same preprocessing steps to the test dataframe.

# Drop the columns that were dropped from the training set
# (identifier-like columns and the mostly-empty Cabin column).
test_df = test_df.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Take care of NaN values.
# Each filled column is assigned back instead of using fillna(inplace=True) on
# test_df[...]: the in-place form operates on a temporary Series and, with
# pandas Copy-on-Write, silently leaves test_df unchanged.
test_df["Age"] = test_df["Age"].fillna(age_mean)        # mean Age of the training set
test_df["Embarked"] = test_df["Embarked"].fillna('S')
# The test set has one missing Fare; fill it with the test-set mean fare.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].mean())

# Convert to numeric
# Sex (same mapping as for the training set)
test_df["Sex"] = test_df["Sex"].map(Sex_dict)
# Embarked: one boolean indicator per port, then drop the text column
test_df["Embarked_C"] = (test_df["Embarked"] == 'C')
test_df["Embarked_Q"] = (test_df["Embarked"] == 'Q')
test_df["Embarked_S"] = (test_df["Embarked"] == 'S')
test_df = test_df.drop(columns=["Embarked"])

# Age bands (same boundaries as for the training set)
test_df["Child"] = (test_df["Age"] <= 11)
test_df["Young"] = (test_df["Age"] > 11) & (test_df["Age"] <= 20)
test_df["Adult"] = (test_df["Age"] > 20) & (test_df["Age"] <= 35)
test_df["Mature"] = (test_df["Age"] > 35) & (test_df["Age"] <= 50)
test_df["Old"] = (test_df["Age"] > 50)

# Standardize Age and Fare with the constants computed on the training set
test_df["Age"] = (test_df["Age"] - age_mean) / age_stddev
test_df["Fare"] = (test_df["Fare"] - fare_mean) / fare_stddev

test_df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S,Child,Young,Adult,Mature,Old
0,3,1,0.369241,0,0,-0.490508,False,True,False,False,False,True,False,False
1,3,0,1.330631,1,0,-0.507194,False,False,True,False,False,False,True,False
2,2,1,2.484298,0,0,-0.453112,False,True,False,False,False,False,False,True
3,3,1,-0.207592,0,0,-0.473739,False,False,True,False,False,True,False,False
4,3,0,-0.592148,1,1,-0.400792,False,False,True,False,False,True,False,False


In [656]:
# Split the training frame into a float32 feature matrix and int32 labels.
feature_df = train_df.drop(columns=["Survived"])
X = feature_df.values.astype('float32')
y = train_df["Survived"].values.astype('int32')

# The test frame has no Survived column, so every column is a feature.
X_test = test_df.values.astype('float32')

In [657]:
from sklearn.model_selection import train_test_split

In [658]:
# Hold out 25% of the labelled data as a dev set. A fixed random_state makes
# the split (and therefore the dev metrics) the same on every run.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [659]:
# Display the training split of the feature matrix.
X_train

array([[ 1.00000000e+00,  0.00000000e+00,  1.00052364e-01, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.00000000e+00,  1.00000000e+00,  3.09958744e+00, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 1.00000000e+00,  0.00000000e+00,  4.37189279e-15, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 3.00000000e+00,  0.00000000e+00,  4.37189279e-15, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 3.00000000e+00,  1.00000000e+00, -1.05361497e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 3.00000000e+00,  0.00000000e+00,  1.33063078e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00]], dtype=float32)

In [660]:
import tensorflow as tf

In [661]:
# Build the network layer by layer:
# 14 input features -> Dense(25, relu) -> Dense(5, relu) -> Dense(5, relu) -> Dense(1).
# The last layer has no activation, so the model outputs a raw score.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(25, activation = 'relu', input_shape=(14,)))
for hidden_units in (5, 5):
    model.add(tf.keras.layers.Dense(hidden_units, activation = 'relu'))
model.add(tf.keras.layers.Dense(1))


# Forward pass of the untrained model on the first training row.
y_pred = model(X_train[:1]).numpy()
y_pred

array([[0.12846227]], dtype=float32)

In [662]:
# Adam optimizer with explicitly spelled-out hyperparameters.
adam_op = tf.keras.optimizers.Adam(
    learning_rate= 0.001 , beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
    name='Adam')


# The model outputs raw logits (its last Dense layer has no activation), so the
# loss is told from_logits=True. The accuracy metric must follow the same
# convention: the 'accuracy' shortcut would threshold the logits at 0.5,
# whereas a probability of 0.5 corresponds to a logit of 0.0.
model.compile(optimizer = adam_op,
             loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
             metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

In [663]:
# Train on the training split; epochs is set to 1 in this cell.
model.fit(X_train, y_train, epochs = 1)

Train on 668 samples


<tensorflow.python.keras.callbacks.History at 0x7f123d273eb8>

In [664]:
# Evaluate on the held-out dev split; unpacks the loss and the accuracy metric
# that the model was compiled with.
dev_loss, dev_accuracy = model.evaluate(X_dev, y_dev, verbose = 2)

223/223 - 0s - loss: 0.7060 - accuracy: 0.6233


In [665]:
# iteration over different NN sizes:

# size                train_loss    train_accuracy    dev_loss   dev_accuracy  dropout           inference
# 15 10 10 10 1 ..... 0.4592        0.8069            0.4386     0.7982        no overfitting    
# 15 10 1 ........... 0.4171        0.8204            0.4153     0.8161        no overfitting     
# 7 5 5 1 ........... 0.3919        0.8293            0.4239     0.8341        no overfitting     
# 20 5 1 ............ 0.4198        0.8278            0.4271     0.8072        no overfitting     
# 25 5 1 ............ 0.3811        0.8398            0.4522     0.8430        0.1 0             25 best for first layer    
# 30 5 1 ............ 0.3948        0.8308            0.4024     0.8341        0.1 0
# 25 5 5 1 .......... 0.3657        0.8473            0.4108     0.8430        0.1 0            

In [666]:
# Train the best architecture (25-5-5-1) on the entire labelled dataset; the
# resulting model is the one used for the submission.

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(25, activation = 'relu', input_shape=(14,)),
    tf.keras.layers.Dense(5, activation = 'relu'),
    tf.keras.layers.Dense(5, activation = 'relu'),
    tf.keras.layers.Dense(1)
])

# Training runs in two phases. Each phase creates a fresh Adam optimizer and
# recompiles the model before fitting.
# NOTE(review): the previous comments in this cell mentioned alpha 0.01 with
# 10 epochs followed by alpha 0.001 with 8 epochs, but the code used
# learning_rate=0.001 in both phases with 20 and then 10 epochs. The values
# from the code are kept here; confirm which schedule was intended.
training_phases = [
    (0.001, 20),   # (learning_rate, epochs) for phase 1
    (0.001, 10),   # (learning_rate, epochs) for phase 2
]

for phase_learning_rate, phase_epochs in training_phases:
    adam_op = tf.keras.optimizers.Adam(
        learning_rate= phase_learning_rate , beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False,
        name='Adam')

    # The model emits raw logits, so the loss uses from_logits=True and the
    # accuracy metric thresholds at logit 0.0 (probability 0.5). The 'accuracy'
    # shortcut would threshold the logits at 0.5 instead.
    model.compile(optimizer = adam_op,
                 loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),
                 metrics = [tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)])

    history = model.fit(X, y, epochs = phase_epochs)

Train on 891 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 891 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f123e5589e8>

In [667]:
# Forward pass on the test features; the model's last layer has no activation,
# so these are raw scores (logits).
logits = model(X_test).numpy()
# Logistic sigmoid turns the logits into values between 0 and 1.
predictions = 1 / (1 + np.exp(-logits))
# Predict survival when the sigmoid output exceeds 0.5.
binary_predictions = predictions > 0.5
print(predictions)

[[0.10675918]
 [0.38868147]
 [0.08732238]
 [0.09613194]
 [0.5893589 ]
 [0.12757371]
 [0.7068591 ]
 [0.11504515]
 [0.6702491 ]
 [0.07599391]
 [0.08756807]
 [0.30386868]
 [0.921625  ]
 [0.0772842 ]
 [0.94195867]
 [0.8471336 ]
 [0.17658016]
 [0.18992342]
 [0.6210139 ]
 [0.54345995]
 [0.4272423 ]
 [0.6761581 ]
 [0.8882225 ]
 [0.43406993]
 [0.9675373 ]
 [0.05056777]
 [0.85574126]
 [0.18103938]
 [0.3567077 ]
 [0.19893801]
 [0.11598974]
 [0.09409959]
 [0.46331894]
 [0.49985173]
 [0.4462242 ]
 [0.22556755]
 [0.634193  ]
 [0.69061244]
 [0.10312463]
 [0.10986763]
 [0.13469973]
 [0.39093843]
 [0.07243817]
 [0.82653254]
 [0.9317717 ]
 [0.10392543]
 [0.4115247 ]
 [0.12458088]
 [0.92960596]
 [0.43542397]
 [0.4103704 ]
 [0.21794887]
 [0.379905  ]
 [0.94969666]
 [0.21115915]
 [0.13178155]
 [0.07614566]
 [0.10406586]
 [0.07030672]
 [0.9527212 ]
 [0.11554017]
 [0.14887045]
 [0.11221774]
 [0.7160411 ]
 [0.59125596]
 [0.8424135 ]
 [0.645884  ]
 [0.3688028 ]
 [0.36707544]
 [0.8340767 ]
 [0.71829695]
 [0.12

In [671]:
# Check the shape of the thresholded predictions before building the submission.
print(np.shape(binary_predictions))

(418, 1)


In [672]:
# Pair each PassengerId from the untouched test copy with its 0/1 prediction.
survived_flags = binary_predictions.astype("int32")
submission = test_df_backup[["PassengerId"]].copy()
submission["Survived"] = survived_flags
submission

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [673]:
# Write only the PassengerId and Survived columns. index=False keeps the
# DataFrame's row index out of the file; the default would add it as an extra
# unnamed first column.
submission.to_csv('submissions.csv', index=False)