In [1]:
import pathlib

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import compose, impute, linear_model, model_selection, pipeline, preprocessing 
import torch
from torch import nn, optim, utils
import torchmetrics

In [2]:
INPUT_DIR = pathlib.Path("/kaggle/input/kaust-academy-ai-week-november-2022")
WORKING_DIR = pathlib.Path("/kaggle/working")

# 1. Load the training data

In [3]:
_train_df = pd.read_csv(
    INPUT_DIR / "train.csv",
    index_col="PassengerId",
)

# need to have some validation data
_seed = 42
train_df, val_df = model_selection.train_test_split(
    _train_df,
    test_size=0.1,
    random_state = np.random.RandomState(_seed),
    stratify=_train_df.loc[:, "Transported"],
)

# 2. Divide the training (validation) features from the training (validation) target 

In [4]:
train_features = train_df.drop("Transported", axis=1)
train_target =  train_df.loc[:, "Transported"]

val_features = val_df.drop("Transported", axis=1)
val_target =  val_df.loc[:, "Transported"]

# 3. Data preprocessing

In [5]:
boolean_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
)

categorical_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="most_frequent"),
    preprocessing.OneHotEncoder(),
)

numeric_preprocessing = pipeline.make_pipeline(
    impute.SimpleImputer(strategy="mean")
)

to_torch_tensor = pipeline.make_pipeline(
    preprocessing.FunctionTransformer(lambda arr: arr.astype(np.float32)),
    preprocessing.FunctionTransformer(lambda arr: torch.from_numpy(arr))
)

feature_column_transformer = compose.make_column_transformer(
    (boolean_preprocessing, ["CryoSleep", "VIP"]),
    (categorical_preprocessing, ["HomePlanet", "Destination"]),
    (numeric_preprocessing, compose.make_column_selector(dtype_include=np.float64)),
    remainder = "drop",
)

feature_preprocessing = pipeline.make_pipeline(
    feature_column_transformer ,
    to_torch_tensor
)

target_preprocessing = pipeline.make_pipeline(
    preprocessing.FunctionTransformer(lambda df: df.to_numpy()),
    to_torch_tensor
)

# 4. Create your datasets and dataloaders

In [6]:
BATCH_SIZE = 32
NUM_WORKERS = 2

train_features_tensor = feature_preprocessing.fit_transform(train_features)
train_target_tensor = target_preprocessing.fit_transform(train_target)

train_dataset = utils.data.TensorDataset(train_features_tensor, train_target_tensor)
train_dataloader = utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

val_features_tensor = feature_preprocessing.transform(val_features)
val_target_tensor = target_preprocessing.transform(val_target)

val_dataset = utils.data.TensorDataset(val_features_tensor, val_target_tensor)
val_dataloader = utils.data.DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
)

In [7]:
%config Completer.use_jedi = False

In [8]:
print(train_features_tensor)

tensor([[  1.,   0.,   0.,  ...,   0.,   0.,   0.],
        [  1.,   0.,   0.,  ...,   0.,   0.,   0.],
        [  0.,   0.,   1.,  ...,   0.,   4., 811.],
        ...,
        [  1.,   0.,   1.,  ...,   0.,   0.,   0.],
        [  1.,   0.,   1.,  ...,   0.,   0.,   0.],
        [  1.,   0.,   1.,  ...,   0.,   0.,   0.]])


# 5. Define a multi-layer perceptron classifier

In [9]:
_, in_features = train_features_tensor.shape
hidden_features = [256,  64,32, 16,]
model_fn = nn.Sequential(
    # TODO: Define your network here!
    nn.Linear(14,256),
    nn.ReLU(),
    nn.BatchNorm1d(256),

    nn.Linear(256,32),
    nn.ReLU(),

#     nn.Linear(16,2),
#     nn.ReLU(),
#     nn.BatchNorm1d(64),
# nn.Linear(64,32),
#     nn.ReLU(),
     nn.BatchNorm1d(32),
     nn.Linear(32,16),
    nn.ReLU(),
    nn.Dropout(),

    nn.Linear(16,1),
    nn.Sigmoid()
    
    
)
loss_fn = nn.BCELoss()
_optimizer_kwargs = {
 "momentum": 0.001,
 "nesterov": False,
}
optimizer = optim.SGD(model_fn.parameters(), lr=1e-6, **_optimizer_kwargs)


# 6. Train your classifier

In [10]:
epochs = 260
log_epochs = 20

for epoch in range(epochs):
    
    train_losses = []
    for features, targets in train_dataloader:
        
        # forward pass
        predictions = model_fn(features) 
        predictions=predictions.squeeze(1)

        train_loss = loss_fn(predictions, targets)
        train_losses.append(train_loss)
        
        # backward pass
        train_loss.backward()        
        optimizer.step()        
        optimizer.zero_grad()
        
    train_loss = (torch.stack(train_losses)
                       .mean())
    
    with torch.no_grad():
        
        val_losses = []
        for features, targets in val_dataloader:
            predictions = model_fn(features)
            predictions=predictions.squeeze(1)
            
            val_loss = loss_fn(predictions, targets)
            val_losses.append(val_loss)
    
        val_loss = (torch.stack(val_losses)
                         .mean())

    if epoch % log_epochs == 0:
        print(f'Epoch {epoch}, Training Loss {train_loss.item():.4f}, Validation Loss {val_loss.item():.4f}')

Epoch 0, Training Loss 0.6901, Validation Loss 0.6958
Epoch 20, Training Loss 0.6908, Validation Loss 0.6879
Epoch 40, Training Loss 0.6906, Validation Loss 0.6924
Epoch 60, Training Loss 0.6867, Validation Loss 0.6844
Epoch 80, Training Loss 0.6853, Validation Loss 0.6808
Epoch 100, Training Loss 0.6846, Validation Loss 0.6812
Epoch 120, Training Loss 0.6814, Validation Loss 0.6871
Epoch 140, Training Loss 0.6822, Validation Loss 0.6783
Epoch 160, Training Loss 0.6813, Validation Loss 0.6808
Epoch 180, Training Loss 0.6794, Validation Loss 0.6760
Epoch 200, Training Loss 0.6782, Validation Loss 0.6810
Epoch 220, Training Loss 0.6756, Validation Loss 0.6780
Epoch 240, Training Loss 0.6722, Validation Loss 0.6732


# 7. Load the testing features

In [11]:
test_features =  pd.read_csv(
    INPUT_DIR / "test.csv",
    index_col="PassengerId",
)

# 8. Make predictions using the test features

In [12]:
features_tensor = feature_preprocessing.transform(test_features)

In [13]:
with torch.no_grad():
    probas = model_fn(features_tensor)
predictions = probas[:, 0] > 0.5

# 9. Load the sample submission file 

In [14]:
sample_submission_df = pd.read_csv(
    INPUT_DIR / "sample_submission.csv",
    index_col="PassengerId",
)

# 10. Create the submission file

In [15]:
_ = (pd.DataFrame({"Transported": predictions}, index=sample_submission_df.index)
       .to_csv(WORKING_DIR / "submission.csv"))  