#### Setup

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import tensorflow as tf

#### Load Data

In [3]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
data = [train, test]

#### Clean Data

In [4]:
# Null Values
for df in data:
    mean_age = df['Age'].mean()
    df['Age'] = df['Age'].fillna(mean_age)
    df['Age'] = df['Age'].astype(int)
    mode_embarked = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(mode_embarked)
    df['Embarked'] = df['Embarked'].astype(str)

#drop string columns
for df in data:
    df.drop(columns = ['Name', 'Ticket', 'Cabin'], inplace = True)

# Feature Engineering
for df in data:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = 1
    df.loc[df['FamilySize'] > 1, 'IsAlone'] = 0
    df['Fare_per_person'] = df['Fare'] / df['FamilySize']

for df in data:
    print (df.head())



   PassengerId  Survived  Pclass     Sex  Age  SibSp  Parch     Fare Embarked  \
0            1         0       3    male   22      1      0   7.2500        S   
1            2         1       1  female   38      1      0  71.2833        C   
2            3         1       3  female   26      0      0   7.9250        S   
3            4         1       1  female   35      1      0  53.1000        S   
4            5         0       3    male   35      0      0   8.0500        S   

   FamilySize  IsAlone  Fare_per_person  
0           2        0          3.62500  
1           2        0         35.64165  
2           1        1          7.92500  
3           2        0         26.55000  
4           1        1          8.05000  
   PassengerId  Pclass     Sex  Age  SibSp  Parch     Fare Embarked  \
0          892       3    male   34      0      0   7.8292        Q   
1          893       3  female   47      1      0   7.0000        S   
2          894       2    male   62      0      

#### Data Preprocessing

In [5]:
cat_cols = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'FamilySize']
num_cols = ['Age', 'Fare']

def preprocess(df, cat_cols, num_cols):
    encoder = OneHotEncoder(sparse_output = False)
    scaler = StandardScaler()
    
    #encode columns
    encoded_cols = encoder.fit_transform(df[cat_cols])
    encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols))
    df = pd.concat([df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)
    
    #scale columns
    scaled_cols = scaler.fit_transform(df[num_cols])
    scaled_df = pd.DataFrame(scaled_cols, columns=num_cols)
    df[num_cols] = scaled_df
    
    #drop old columns
    df.drop(columns=cat_cols + num_cols, inplace=True)
    
    return df
    
test = preprocess(test, cat_cols, num_cols)
train = preprocess(train, cat_cols, num_cols)

#manually add missing column because I'm not sure how to do it automatically
train['Parch_9'] = 0
           


In [6]:
print(test.columns)
print(train.columns)
print(test.shape)
print(train.shape)

Index(['PassengerId', 'IsAlone', 'Fare_per_person', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2',
       'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1',
       'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize_1',
       'FamilySize_2', 'FamilySize_3', 'FamilySize_4', 'FamilySize_5',
       'FamilySize_6', 'FamilySize_7', 'FamilySize_8', 'FamilySize_11'],
      dtype='object')
Index(['PassengerId', 'Survived', 'IsAlone', 'Fare_per_person', 'Pclass_1',
       'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1',
       'SibSp_2', 'SibSp_3', 'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0',
       'Parch_1', 'Parch_2', 'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6',
       'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize_1',
       'FamilySize_2', 'FamilySize_3', 'FamilySize_4', 'FamilySize_5',
       'FamilySize_6', 'FamilySi

In [7]:
from sklearn.model_selection import train_test_split
#create train test split
X = train.drop(columns = 'Survived')
y = train['Survived']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0) 


In [17]:
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from kerastuner import HyperModel
from kerastuner.tuners import Hyperband

# Hypermodel
class TitanicHyperModel(HyperModel):
    def build(self, hp):
        model = keras.Sequential()
        model.add(layers.Input(shape=(X_train.shape[1],)))
        model.add(layers.BatchNormalization())
        
        for i in range(hp.Int('num_layers', 1, 5)):
            units = hp.Int('units_' + str(i), 
                           min_value=64, 
                           max_value=1024, 
                           step=32)
            activation = hp.Choice('activation_' + str(i), ['relu', 'leaky_relu'])

            kernel_regularizer = regularizers.l2(hp.Float('l2_regularization_' + str(i),
                                                          min_value=1e-7,
                                                          max_value=0.1,
                                                          sampling='LOG'))
            model.add(layers.Dense(units=units,
                                   activation=activation,
                                   kernel_regularizer=kernel_regularizer))
            model.add(layers.BatchNormalization())
            model.add(layers.Dropout(hp.Float('dropout_' + str(i),
                                              min_value=0.1,
                                              max_value=0.5,
                                              step=0.1)))
        model.add(layers.Dense(1, activation='sigmoid'))

        model.compile(
            optimizer=keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate',
                                       min_value=0.001,
                                       max_value=0.01,
                                       sampling='LOG')
            ),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )
        return model

tuner = Hyperband(
    TitanicHyperModel(),
    objective='val_accuracy',  # Change this to 'val_loss'
    max_epochs=100,
    factor=3,
    directory='my_dir',
    project_name='Titanic Survival Prediction'
)

#*********************#
MY_PATIENCE = 10
MY_EPOCHS = 100
MY_MIN_DELTA = 1e-4
#*********************#

tuner.search(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=MY_EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=MY_PATIENCE,
        min_delta=MY_MIN_DELTA,
        restore_best_weights=True
    )],
    verbose=1
)

best_model = tuner.get_best_models(num_models=1)[0]
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]

print(best_hyperparameters.values)
print(best_model.summary())

best_model.compile(
    optimizer=best_hyperparameters.get('optimizer'),
    loss='binary_crossentropy',
    metrics=['accuracy']
)
best_model.evaluate(X_valid, y_valid)

#### Plots of Model

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

history = best_model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid)
)

sns.set_style('whitegrid')
sns.set_context('notebook')

plt.figure(figsize=(10, 6))
sns.lineplot(data=history.history['accuracy'], label='Train Accuracy', marker='o')
sns.lineplot(data=history.history['val_accuracy'], label='Validation Accuracy', marker='o')

plt.xlabel('Epochs', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.title('Training and Validation Accuracy over Epochs', fontsize=16)
plt.legend(fontsize=12)
plt.show()

In [None]:
#Plot Training and validation loss
plt.figure(figsize=(12, 6))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss vs val_loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
#actual vs predicted values
y_preds = best_model.predict(X_valid)

plt.figure(figsize=(10, 6))
plt.scatter(y_valid, y_preds, alpha=0.5)
plt.plot([y_valid.min(), y_valid.max()], [y_valid.min(), y_valid.max()], 'r--', lw=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title('Actual vs Predicted House Prices')
plt.show()

In [None]:
predictions = best_model.predict(test)

# Convert predictions to binary (0 or 1)
binary_predictions = (predictions > 0.5).astype(int)

# Convert predictions to DataFrame
predictions_df = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': binary_predictions.flatten()
})

# Save predictions to CSV
predictions_df.to_csv('Titanic_NN_predictions.csv', index=False)