# 01_Analysis and Submission 
in this notebook titanic dataset is used and the goal is to predict the survivors of the catastrophic event happend on  1912. The results are later submitted to the Kaggle 


Dataset: https://www.kaggle.com/competitions/titanic/data

In [19]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LeakyReLU,BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.initializers import GlorotUniform

import datetime
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt

# 01 Analysis of the Data 

In [2]:
current_directory = os.getcwd()
data_directory = current_directory + "/data/titanic"
train_csv = data_directory + "/train.csv"
test_csv = data_directory + "/test.csv"

In [3]:
df_train = pd.read_csv(train_csv)
display(df_train)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [4]:
print(df_train.info())
print(df_train.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.48659

In [5]:
#Cabin has only 204 rows , too many missing rows that's why it is dropped
if "Cabin" in df_train.columns:
    df_train.drop(columns=["Cabin"],inplace=True)
#Age also has missing values. The mean here is 29.6 with std of 14.52
#the missing values are sampled from normal distribution with thos mean and std
normal_ages = np.random.normal(loc=29.69,scale=14.52,size=df_train['Age'].isnull().sum())
df_train.loc[df_train['Age'].isnull(),'Age'] = normal_ages
#Replace missing values in Embarked with most common values - mode
df_train["Embarked"].fillna(df_train["Embarked"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train["Embarked"].fillna(df_train["Embarked"].mode()[0],inplace=True)


In [6]:
#convert catgeroical variables
df_train["Sex"]=LabelEncoder().fit_transform(df_train["Sex"])
#Convert Embarked to one hot encoding - If just convert categories to numerical
#data it can induce that the labels in Embarked are ordered and have different importance
df_train = pd.get_dummies(df_train, columns=["Embarked"],drop_first=True)

In [7]:
#feature and target
X = df_train[["Pclass","Sex","Age","SibSp","Parch","Fare","Embarked_Q","Embarked_S"]]
y = df_train["Survived"]

In [8]:
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.3,random_state=42)
display(X_train)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_Q,Embarked_S
445,1,1,4.000000,0,2,81.8583,False,True
650,3,1,45.642409,0,0,7.8958,False,True
172,3,0,1.000000,1,1,11.1333,False,True
450,2,1,36.000000,1,2,27.7500,False,True
314,2,1,43.000000,1,1,26.2500,False,True
...,...,...,...,...,...,...,...,...
106,3,0,21.000000,0,0,7.6500,False,True
270,1,1,17.765684,0,0,31.0000,False,True
860,3,1,41.000000,2,0,14.1083,False,True
435,1,0,14.000000,1,2,120.0000,False,True


In [9]:
scaler=StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

In [10]:
print("X_train shape:", X_train.shape)  # Should print (number_of_samples, number_of_features)
print("y_train shape:", y_train.shape)  # Should print (number_of_samples, )

# Check data types
print("X_train dtype:", X_train.dtype)  # Should ideally be float32 or float64
print("y_train dtype:", y_train.dtype)  # Should ideally be float32 or float64, depending on the task


X_train shape: (623, 8)
y_train shape: (623,)
X_train dtype: float64
y_train dtype: int64


In [21]:
log_dir = os.path.join("logs","run_7_w_dropout_l2_001_lRelu_Bnorm_Xav_init", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
model = Sequential([
    Dense(1024,input_dim=X_train.shape[1],kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(512,kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(256,kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(128,kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(64,kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(32,kernel_regularizer=l2(0.0001),kernel_initializer=GlorotUniform()),
    BatchNormalization(),
    LeakyReLU(alpha=0.01),
    Dropout(0.5),
    Dense(1, activation="sigmoid")
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(optimizer=optimizer,loss="binary_crossentropy",metrics=['accuracy'])
train = model.fit(X_train,y_train,epochs=300,batch_size=128,validation_data=(X_val,y_val),callbacks=[tensorboard_callback])

Epoch 1/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 257ms/step - accuracy: 0.4707 - loss: 1.0370 - val_accuracy: 0.4179 - val_loss: 0.8269
Epoch 2/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - accuracy: 0.4933 - loss: 0.9710 - val_accuracy: 0.3657 - val_loss: 0.8284
Epoch 3/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - accuracy: 0.5308 - loss: 1.0018 - val_accuracy: 0.3731 - val_loss: 0.8294
Epoch 4/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 126ms/step - accuracy: 0.4660 - loss: 1.0064 - val_accuracy: 0.3806 - val_loss: 0.8296
Epoch 5/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 144ms/step - accuracy: 0.5255 - loss: 0.9693 - val_accuracy: 0.3918 - val_loss: 0.8288
Epoch 6/300
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 153ms/step - accuracy: 0.5190 - loss: 0.9479 - val_accuracy: 0.3806 - val_loss: 0.8283
Epoch 7/300
[1m5/5[0m [32m━━━━━