# Top 5% Custom Neural Network solution
Hey! Welcome to the start of your Kaggle journey! Exciting!

Throughout this notebook I tried to simplify as much as I could and make sure everything is straightforward.

By no means is this notebook perfect; I'm learning just as you are.

As further exercises, here are a few ideas to get you going inside this notebook:
- The one-hot encoding can be further optimized
- Evaluation can be enhanced with cross-validation using KFold or another technique
- Tweak and experiment with the Custom Multilayer Perceptron (neural network) hyperparameters such as:
    - Number of neurons per hidden layer
    - Number of hidden layers
    - Regularization (e.g. nn.Dropout)
    - Different optimizer (e.g. Adam, AdamW, RMSProp)

Hope you'll learn something from this notebook!

Good luck! 😄

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch import nn
from sklearn.model_selection import train_test_split
from pathlib import Path

input_dir = '/kaggle/input/titanic/'
target = 'Survived'
seed = 42

# seed PyTorch's global RNG so weight initialisation and DataLoader shuffling
# are repeatable on a fresh "Restart & Run All"
torch.manual_seed(seed)

# prefer the first GPU when one is available, otherwise fall back to the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

## Preprocessing
In this custom function a couple of things are done:

- NaN/null values in numerical features are filled with 0
- Numerical features get normalized
- Categorical features get one-hot encoded

In [None]:
# module-level transformers: fitted on training data, then reused (transform only) for validation/test data
scaler = StandardScaler()
onehot = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

def prepare(df, scaler=scaler, onehot=onehot, test_set=False):
    """Convert a raw passenger frame into an all-numeric feature frame.

    Steps:
      1. Drop 'PassengerId' and the target column (ignored when absent).
      2. Fill missing values in numerical columns with 0 and standardise them.
      3. One-hot encode every non-numerical column.

    Args:
        df: Input DataFrame; it is not modified, a new frame is returned.
        scaler: Scaler used for the numerical columns.
        onehot: Encoder used for the non-numerical columns.
        test_set: If True, only ``transform`` is called on both transformers
            (they must already be fitted). If False, both are (re)fitted on
            ``df`` via ``fit_transform``.

    Returns:
        DataFrame with the scaled numerical columns followed by the one-hot
        columns, indexed like ``df``.
    """
    features = df.drop(columns=['PassengerId', target], errors='ignore')

    # numerical columns: impute with 0, then standardise
    num_cols = features.select_dtypes(include=np.number).columns.tolist()
    features[num_cols] = features[num_cols].fillna(0)

    # fitting on validation/test data would leak their statistics into the
    # preprocessing and make the evaluation overly optimistic
    scale = scaler.transform if test_set else scaler.fit_transform
    features[num_cols] = scale(features[num_cols])

    # everything that is not numerical is treated as categorical
    cat_cols = features.select_dtypes(exclude=np.number).columns.tolist()
    encode = onehot.transform if test_set else onehot.fit_transform
    encoded_values = encode(features[cat_cols])

    # wrap the encoded array in a frame that carries the generated column names
    encoded = pd.DataFrame(
        encoded_values,
        columns=onehot.get_feature_names_out(),
        index=features.index,
    )

    # swap the raw categorical columns for their one-hot counterparts
    return pd.concat([features.drop(columns=cat_cols), encoded], axis=1)

### Prepare train/valid splits

In [None]:
# load the labelled training data
train_csv = Path(input_dir) / 'train.csv'
df_train = pd.read_csv(train_csv)

# hold out 20% of the rows for validation; the target column is still part of
# X_train/X_valid at this point and gets dropped inside prepare()
X_train, X_valid, y_train, y_valid = train_test_split(
    df_train,
    df_train[target],
    test_size=0.2,
    random_state=42,
)

print(X_train.shape, X_valid.shape)

# fit the scaler/encoder on the training split only ...
X_train = prepare(X_train)
# ... and merely apply them to the validation split (test_set=True) so no validation statistics leak in
X_valid = prepare(X_valid, test_set=True)

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_valid shape: {X_valid.shape}')
print(f'y_valid shape: {y_valid.shape}')
X_train

### Prepare data for neural network

In [None]:
# features become float32 tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_valid = torch.tensor(X_valid.values, dtype=torch.float32)

# targets get an extra axis via unsqueeze(1): shape (N,) -> (N, 1), i.e. a column vector
# matching the output of the last layer of our custom MLP, nn.Linear(64, 1)
y_train = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_valid = torch.tensor(y_valid.values, dtype=torch.float32).unsqueeze(1)

train_ds = TensorDataset(X_train, y_train)
valid_ds = TensorDataset(X_valid, y_valid)

batch_size = 64

# pinned host memory speeds up CPU -> GPU transfers, so only enable it when running on a GPU
pin_memory = 'cuda' in device

# training batches are reshuffled every epoch
train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=pin_memory,
)
# validation order is kept fixed for reproducibility
valid_loader = DataLoader(
    valid_ds,
    batch_size=batch_size,
    shuffle=False,
    pin_memory=pin_memory,
)

## Custom Neural Network

In [None]:
class CustomMLP(nn.Module):
    """Small multilayer perceptron for binary classification.

    Architecture: input_dim -> 256 -> 64 -> 1, with LeakyReLU(0.1) after each
    hidden layer and a Sigmoid on the output, so ``forward`` returns values
    in (0, 1) with shape (batch, 1).
    """

    def __init__(self, input_dim):
        """Build the network for inputs with ``input_dim`` features."""
        super().__init__()

        # hidden stack: Linear followed by LeakyReLU for each consecutive pair of widths
        widths = [input_dim, 256, 64]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.LeakyReLU(0.1))

        # output head: a single unit squashed to a probability
        layers.append(nn.Linear(64, 1))
        layers.append(nn.Sigmoid())

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return its output."""
        return self.net(x)

In [None]:
# the input layer width is the number of feature columns in X_train
net = CustomMLP(input_dim=X_train.shape[1])

In [None]:
def train(net, dataloader, num_epochs, lr, lr_period, lr_decay, momentum, device):
    """Train ``net`` in place with SGD and binary cross-entropy.

    Args:
        net: Model whose forward pass returns probabilities in [0, 1];
            ``nn.BCELoss`` is applied directly to its output.
        dataloader: Iterable yielding ``(X, y)`` tensor batches.
        num_epochs: Number of passes over ``dataloader``.
        lr: Initial learning rate for SGD.
        lr_period: Number of epochs between learning-rate decays
            (``StepLR`` ``step_size``).
        lr_decay: Multiplicative decay factor (``StepLR`` ``gamma``).
        momentum: SGD momentum.
        device: Device the model and every batch are moved to.

    Every 5th epoch the mean loss over all samples of that epoch is printed.
    Returns nothing; the model is updated in place.
    """
    net = net.to(device)

    criterion = nn.BCELoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_period, gamma=lr_decay)

    for epoch in range(num_epochs):
        net.train()
        loss_sum = 0.0
        num_samples = 0
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(net(X), y)
            batch_loss.backward()
            optimizer.step()
            # weight each batch by its size: the final batch of an epoch is
            # usually smaller, so an unweighted mean (or just the last batch)
            # would misrepresent the epoch
            loss_sum += batch_loss.item() * X.size(0)
            num_samples += X.size(0)
        scheduler.step()
        if (epoch + 1) % 5 == 0:
            # an empty dataloader yields nan instead of crashing
            epoch_loss = loss_sum / num_samples if num_samples else float('nan')
            print(f'Epoch {epoch + 1} Loss: {epoch_loss:.6f}')

## X_train training

In [None]:
# hyperparameters are passed in the same order as train()'s signature;
# the learning rate is multiplied by lr_decay every lr_period epochs
train(
    net,
    train_loader,
    num_epochs=100,
    lr=0.1,
    lr_period=5,
    lr_decay=0.99,
    momentum=0.9,
    device=device
)

## Network Accuracy Evaluation

In [None]:
def eval_acc(net, dataloader, device):
    """Return the classification accuracy of ``net`` over ``dataloader``.

    Args:
        net: Model returning one probability per row.
        dataloader: Iterable yielding ``(X, y)`` tensor batches with 0/1 labels.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Fraction of correctly classified samples as a float.
    """
    net.eval()
    n_hits, n_seen = 0, 0

    # no gradients are needed for evaluation
    with torch.no_grad():
        for features, labels in dataloader:
            features = features.to(device)
            labels = labels.to(device)
            probs = net(features)
            # threshold probabilities at 0.5 to obtain hard 0/1 predictions
            hard_preds = (probs > 0.5).float()
            n_hits += hard_preds.eq(labels).sum().item()
            n_seen += labels.shape[0]

    return n_hits / n_seen

In [None]:
# share of validation rows the trained network classifies correctly
acc = eval_acc(net, dataloader=valid_loader, device=device)
print(f'Validation accuracy: {acc:.4f}')

## Retrain Network on full dataset

In [None]:
# test_set defaults to False, so the scaler/encoder are refitted on the complete training set
X_train_full = prepare(df_train)
y_train_full = df_train[target]

X_train_full = torch.tensor(X_train_full.values, dtype=torch.float32)
y_train_full = torch.tensor(y_train_full.values, dtype=torch.float32).unsqueeze(1)

full_trainset = TensorDataset(X_train_full, y_train_full)
full_loader = DataLoader(
    full_trainset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=pin_memory  # only pin when running on a GPU, same as train_loader/valid_loader
)

net = CustomMLP(X_train_full.shape[1]) # recreate model to reset pretrained weights and use X_train_full input dimension

train(
    net,
    full_loader,
    num_epochs=100,
    lr=0.1,
    momentum=0.9,
    device=device,
    lr_period=5,
    lr_decay=0.99
)

## Make predictions and create submission.csv

In [None]:
test_csv = Path(input_dir) / 'test.csv'
df_test = pd.read_csv(test_csv)

# test_set=True: reuse the already-fitted scaler/encoder instead of refitting on the test data
X_test = prepare(df_test, test_set=True)

X_test = torch.tensor(X_test.values, dtype=torch.float32).to(device)

# train() leaves the network in training mode; switch to eval mode so layers such as
# Dropout/BatchNorm (if added later) behave correctly at inference time
net.eval()
with torch.no_grad():
    preds = net(X_test)
    # threshold at 0.5, then drop only the trailing axis: (N, 1) -> (N,);
    # squeeze(1) keeps a 1-D array even when there is a single row
    preds = (preds > 0.5).int().squeeze(1).cpu().numpy()

passenger_ids = df_test['PassengerId']

submission = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Survived': preds
})
submission.to_csv('submission.csv', index=False)