In [1]:
import torch
from torch import optim
from torch import nn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path

In [2]:
def print_unique(df: pd.DataFrame) -> None:
    """Prints the column names and their unique values."""
    if isinstance(df, pd.core.series.Series):
        print(f"{df.unique()}")
    elif isinstance(df, pd.core.frame.DataFrame):
        for col in df:
            print(f"{col}: {df[col].unique()}")
    else:
        raise TypeError(f"Expected DataFrame or Series, recieved {type(df)}")

# Peep the raw data

In [3]:
raw_df = pd.read_csv(Path('./data/train.csv'))
display(raw_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Drop irrelevant columns and deal with NaNs
Use correlation, the number of unique values and common sense to decide irrelevance. For example, `PassangerId` which is simply counting up is assumed to have no impact on passanger mortality. `Name` most likely won't be relevant since all the information it conveys, like marriage status, or socio-economic status (Dr. or Master) are accounted for in other variables.

In [4]:
# I'll drop the 'Ticket' and 'PassengerId' columns for now, as I suspect they have no bearing on survival
cleaned_df = raw_df.drop(columns=['PassengerId', 'Ticket', 'Name'])
display(cleaned_df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


In [5]:
# I see a few NaNs
def count_nans(df: pd.DataFrame) -> None:
    "Prints a count of the occurences of NaNs in each column"
    for col in df.columns:
        print(f"{col} NaNs: {df[col].isna().sum()}")

In [6]:
count_nans(cleaned_df)

Survived NaNs: 0
Pclass NaNs: 0
Sex NaNs: 0
Age NaNs: 177
SibSp NaNs: 0
Parch NaNs: 0
Fare NaNs: 0
Cabin NaNs: 687
Embarked NaNs: 2


### Note on `Cabin` variable
It would be nice to include as, undoubtedly, a passenger's position on the ship affects their mortality in event of a major accident but, there are 687 `NaN` values. That's 687 out of 891. Too many null values to be useful, so I'll drop it. 

In [7]:
cleaned_df.drop(columns=['Cabin'], inplace=True)

In [8]:
cleaned_df['Age'].fillna(cleaned_df['Age'].mean(), inplace=True)
cleaned_df['Embarked'].fillna('UNK', inplace=True)
count_nans(cleaned_df)

Survived NaNs: 0
Pclass NaNs: 0
Sex NaNs: 0
Age NaNs: 0
SibSp NaNs: 0
Parch NaNs: 0
Fare NaNs: 0
Embarked NaNs: 0


# Encode Sex and Embarked columns

In [10]:
print_unique(cleaned_df['Embarked'])

['S' 'C' 'Q' 'UNK']


In [12]:
cleaned_df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
cleaned_df['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2, 'UNK': 3}, inplace=True)
display(cleaned_df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1,0,7.2500,0
1,1,1,0,38.000000,1,0,71.2833,1
2,1,3,0,26.000000,0,0,7.9250,0
3,1,1,0,35.000000,1,0,53.1000,0
4,0,3,1,35.000000,0,0,8.0500,0
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,0
887,1,1,0,19.000000,0,0,30.0000,0
888,0,3,0,29.699118,1,2,23.4500,0
889,1,1,1,26.000000,0,0,30.0000,1


# Split the data
Even though the test set is withheld by Kaggle, I'm only allowed so many attempts, so I'll withhold a small chunk to iterably test hyper-parameter selections.

In [13]:
X = cleaned_df.drop(columns=['Survived'])
y = cleaned_df['Survived']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Normalizing continuous variables between 0 - 1
Via `sklearn`'s `MinMaxScaler()`. An idea to test later would be to normalize `Age` and `Fare` using their own scales.

In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train[['Age', 'Fare']])

MinMaxScaler()

In [18]:
X_train_scaled = X_train.copy()
X_train_scaled[['Age', 'Fare']] = scaler.transform(X_train[['Age', 'Fare']])
X_test_scaled = X_test.copy()
X_test_scaled[['Age','Fare']] = scaler.transform(X_test[['Age', 'Fare']])

In [19]:
display(X_train_scaled)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
165,3,1,0.107816,0,2,0.040062,0
541,3,0,0.107816,4,2,0.061045,0
625,1,1,0.761247,0,0,0.063086,0
388,3,1,0.367921,0,0,0.015086,2
76,3,1,0.367921,0,0,0.015412,0
...,...,...,...,...,...,...,...
106,3,0,0.258608,0,0,0.014932,0
270,1,1,0.367921,0,0,0.060508,0
860,3,1,0.509927,2,0,0.027538,0
435,1,0,0.170646,1,2,0.234224,0


# Oversampling

In [20]:
y_train.value_counts()

0    495
1    306
Name: Survived, dtype: int64

In [24]:
oversampler = SMOTE()
X_train_scaled_os, y_train_os = oversampler.fit_resample(X_train_scaled, y_train)

In [25]:
y_train_os.value_counts()

1    495
0    495
Name: Survived, dtype: int64

# Tensor pre-processing

In [75]:
train_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_train_scaled_os.values, y_train_os)]
validation_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_test_scaled.values, y_test.values)]

train_ds[0][0].shape, train_ds[0][0], train_ds[0][1]


(torch.Size([7]),
 tensor([3.0000, 1.0000, 0.1078, 0.0000, 2.0000, 0.0401, 0.0000],
        requires_grad=True),
 tensor(1., requires_grad=True))

In [76]:
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=len(train_ds) // 10, shuffle=False)
val_dataloader = torch.utils.data.DataLoader(validation_ds, batch_size=len(validation_ds) // 10, shuffle=False)

# Model
A simple dense neural network.

In [101]:
input_neuron_count = train_ds[0][0].shape[0]
model = nn.Sequential(
    nn.Linear(input_neuron_count, input_neuron_count // 2),
    nn.ReLU(),
    nn.Linear(input_neuron_count // 2, 1)
)

In [103]:
loss_fcn = nn.BCEWithLogitsLoss()
adam_optim = optim.SGD(model.parameters(), lr=1e-1)

# My custom training loop

In [99]:
# I'm reusing my custom training loop
def training_loop(epochs, model, loss_fcn, optimizer, train_dataloader, val_dataloader,
                  *, save_best=True, metrics=True, logging=True):
    """
    Custom training loop
    Parameters:
        epochs: int, number of epochs to train for
        model: nn.Module or subclass thereof, from which to obtain predictions
        loss_fcn: any pytorch loss function
        optimizer: any optimizer
        train_dataloader: PyTorch dataloader from which to pull data
        val_dataloader: "
        save_best: bool, WARNING only use on smaller models, cache and serialize best model at end of training
        metrics: bool, control calculation and printing of numbers to the screen
        logging: bool, control printing of tensor info to screen after each step
    Returns: 
        Trained model
    """
    
    highest_accuracy = 0
    cached_model = None
    for epoch in range(epochs):
        for features, labels in train_dataloader:
            labels.unsqueeze_(1)
            if logging: print_t_info([features, labels], ["Train Feats: ", "Train Labels: "])
            
            train_predictions = model(features)
            
            train_loss = loss_fcn(train_predictions, labels)
            
            if logging: print_t_info([train_predictions, train_loss], ["Train Preds:", "Train Loss:"])
            
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            
            
        total = 0
        correct = 0
        
        for features, labels in val_dataloader:
            labels.unsqueeze_(1)
            if logging: print_t_info([features, labels], ['Val Feats: ', 'Val Labels:'])
            
            val_predictions = model(features)
            
            val_loss = loss_fcn(val_predictions, labels)
            
            if logging: print_t_info([val_predictions, val_loss], ['Val Preds:', 'Val Loss:'])
            
            
        if metrics:
            total += val_predictions.shape[0]
            correct = int(((val_predictions > 0.5) == labels.type(torch.BoolTensor)).sum())
            print(f"Epoch {epoch:03}\tTrain Loss: {train_loss:.4}\tVal Loss: "
                  f"{val_loss:.4}\tAccuracy: {correct/total:%}")
        if save_best:
            latest_accuracy = correct/total
            if latest_accuracy > highest_accuracy:
                highest_accuracy = latest_accuracy
                cached_model = model
                
        if save_best:
            model_pth = Path(f'./models/{highest_accuracy * 100}_model')
            torch.save(cached_model, model_pth)
            print(f"Model saved to {model_pth}")
        
    return model

In [104]:
training_loop(1000, model, loss_fcn, a_optim, train_dataloader, val_dataloader, logging=False, save_best=False)

Epoch 000	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 001	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 002	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 003	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 004	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 005	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 006	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 007	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 008	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 009	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 010	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 011	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 012	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 013	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 014	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 015	

Epoch 136	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 137	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 138	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 139	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 140	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 141	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 142	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 143	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 144	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 145	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 146	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 147	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 148	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 149	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 150	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 151	

Epoch 274	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 275	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 276	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 277	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 278	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 279	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 280	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 281	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 282	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 283	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 284	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 285	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 286	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 287	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 288	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 289	

Epoch 412	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 413	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 414	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 415	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 416	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 417	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 418	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 419	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 420	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 421	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 422	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 423	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 424	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 425	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 426	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 427	

Epoch 550	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 551	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 552	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 553	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 554	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 555	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 556	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 557	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 558	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 559	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 560	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 561	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 562	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 563	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 564	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 565	

Epoch 688	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 689	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 690	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 691	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 692	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 693	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 694	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 695	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 696	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 697	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 698	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 699	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 700	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 701	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 702	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 703	

Epoch 826	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 827	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 828	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 829	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 830	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 831	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 832	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 833	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 834	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 835	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 836	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 837	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 838	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 839	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 840	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 841	

Epoch 964	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 965	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 966	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 967	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 968	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 969	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 970	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 971	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 972	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 973	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 974	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 975	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 976	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 977	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 978	Train Loss: 0.8855	Val Loss: 0.703	Accuracy: 55.555556%
Epoch 979	

Sequential(
  (0): Linear(in_features=7, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)