In [30]:
'''
   In this program, I'm gonna follow the following steps to predict
   the probability of survival.
       1. analyze and select features
       2. create a neural network to predict
       3. check the performance of a model
       4. adjust the model
'''

import pandas as pd
from sklearn.preprocessing import StandardScaler

train_data = pd.read_csv("./train.csv")

print(train_data.head(5))


## Features

# Name has no effect on the model, so I just ignore it.
# Sex, first check if there is missing value
train_data['Sex'].isnull().sum(axis=0)

# map the two genders to 0 and 1
train_data.Sex = train_data.Sex.map({'male':0, 'female':1})
train_data[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean()

# We could see from above that female has higher survival rate than male.

# Pclass
train_data['Sex'].isnull().sum(axis=0)

train_data[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean()

# We could see from above that the higher the class, the higher the survival rate.

# Age
train_data['Age'].isnull().sum(axis=0)

# A lot of missing values, I would use it as a feature to train the model.

# SibSp 
train_data.SibSp.isnull().sum(axis=0)

train_data[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean()

# Parch
train_data.Parch.isnull().sum(axis=0)
train_data[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean()

# Feature engineering, combine Parch and SibSp to be the number of families
train_data['Families'] = train_data['SibSp'] + train_data['Parch'] + 1
train_data[['Families', 'Survived']].groupby(['Families'], as_index=False).mean()

# Fare
# We need to normalize the fare data because it is too large
train_data.Fare.isnull().sum(axis=0)
train_data[['Fare', 'Survived']].groupby(['Survived'], as_index=False).mean()

# From above, we see that it seems survived people bought more expensive tickets.

# I will first leave Cabin, Ticket, Embarked aside, and train the model with the features I mentioned before.

## Train the model

Features = ['Pclass', 'Sex', 'Families', 'Fare']

# Normalize the Fare
scaler = StandardScaler()
fare_norm = scaler.fit_transform(train_data.Fare.values.reshape(-1, 1))
train_data['fare_norm'] = fare_norm

y_train = train_data['Survived'].values
X_train = train_data[['Pclass', 'Sex', 'Families', 'fare_norm']].values

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [31]:
# Neural Network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [32]:
# creat model

model = Sequential(
    [   
        tf.keras.Input(shape=(4,)),
        Dense(units = 2, activation = 'relu'),
        Dense(units = 1, activation = 'sigmoid')

    ], name = "titanic" 
)   

In [33]:
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
)

history = model.fit(X_train, y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
model.summary()

Model: "titanic"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 2)                 10        
                                                                 
 dense_5 (Dense)             (None, 1)                 3         
                                                                 
Total params: 13
Trainable params: 13
Non-trainable params: 0
_________________________________________________________________


In [35]:
test_data = pd.read_csv('./test.csv')

In [36]:
test_data.Sex = test_data.Sex.map({'male':0, 'female':1})
fare_norm = scaler.fit_transform(test_data.Fare.values.reshape(-1, 1))
test_data['fare_norm'] = fare_norm
test_data['Families'] = test_data['SibSp'] + test_data['Parch'] + 1
X_test = test_data[['Pclass', 'Sex', 'Families', 'fare_norm']].values

In [37]:
print(X_test)

[[ 3.          0.          1.         -0.49781052]
 [ 3.          1.          2.         -0.51265996]
 [ 2.          0.          1.         -0.46453181]
 ...
 [ 3.          0.          1.         -0.50818292]
 [ 3.          0.          1.         -0.4938564 ]
 [ 3.          0.          3.         -0.23762123]]


In [81]:
y_pred = model.predict(X_test)
y_final = (y_pred > 0.5).astype(int).reshape(X_test.shape[0])

output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': y_final})

     PassengerId  Pclass                                          Name  Sex  \
0            892       3                              Kelly, Mr. James    0   
1            893       3              Wilkes, Mrs. James (Ellen Needs)    1   
2            894       2                     Myles, Mr. Thomas Francis    0   
3            895       3                              Wirz, Mr. Albert    0   
4            896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)    1   
..           ...     ...                                           ...  ...   
413         1305       3                            Spector, Mr. Woolf    0   
414         1306       1                  Oliva y Ocana, Dona. Fermina    1   
415         1307       3                  Saether, Mr. Simon Sivertsen    0   
416         1308       3                           Ware, Mr. Frederick    0   
417         1309       3                      Peter, Master. Michael J    0   

      Age  SibSp  Parch              Ticket      Fa

In [39]:
print(output)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         0
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [54]:
output.to_csv('result.csv', index=False)

In [76]:
''' 
    The accuracy on test data is only 70.5%.
    check the model 
    in terms of bias and variance.
    Next I will calculate the cost for cross validation data set and train data.
    From here, we could know if it is underfit or overfit.
'''

In [82]:
y_pred_train = model.predict(X_train)
y_final_train = (y_pred_train > 0.5).astype(int).reshape(X_train.shape[0])

output_train = pd.DataFrame({'PassengerId': train_data['PassengerId'], 'Survived': y_final_train, 
                         'real_sur': train_data['Survived']})
counter_train = 0
for i in range(len(output_train)):
    if output_train.Survived.values[i] == output_train.real_sur.values[i]:
        counter_train = counter_train + 1
hit_rate_train = i/len(output_train)
print(f'The accuracy for train data is {hit_rate_train}')

The accuracy for train data is 0.9988776655443322
