In [1]:
import torch
from torch import optim
from torch import nn
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import pandas as pd
from pathlib import Path

In [2]:
def print_unique(df: pd.DataFrame) -> None:
    """Prints the column names and their unique values."""
    if isinstance(df, pd.core.series.Series):
        print(f"{df.unique()}")
    elif isinstance(df, pd.core.frame.DataFrame):
        for col in df:
            print(f"{col}: {df[col].unique()}")
    else:
        raise TypeError(f"Expected DataFrame or Series, recieved {type(df)}")

# Peep the raw data

In [3]:
raw_df = pd.read_csv(Path('./data/train.csv'))
display(raw_df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


# Drop irrelevant columns and deal with NaNs
Use correlation, the number of unique values and common sense to decide irrelevance. For example, `PassangerId` which is simply counting up is assumed to have no impact on passanger mortality. `Name` most likely won't be relevant since all the information it conveys, like marriage status, or socio-economic status (Dr. or Master) are accounted for in other variables.

In [4]:
# I'll drop the 'Ticket' and 'PassengerId' columns for now, as I suspect they have no bearing on survival
cleaned_df = raw_df.drop(columns=['PassengerId', 'Ticket', 'Name'])
display(cleaned_df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.2500,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.9250,,S
3,1,1,female,35.0,1,0,53.1000,C123,S
4,0,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,,S
887,1,1,female,19.0,0,0,30.0000,B42,S
888,0,3,female,,1,2,23.4500,,S
889,1,1,male,26.0,0,0,30.0000,C148,C


In [5]:
# I see a few NaNs
def count_nans(df: pd.DataFrame) -> None:
    "Prints a count of the occurences of NaNs in each column"
    for col in df.columns:
        print(f"{col} NaNs: {df[col].isna().sum()}")

In [6]:
count_nans(cleaned_df)

Survived NaNs: 0
Pclass NaNs: 0
Sex NaNs: 0
Age NaNs: 177
SibSp NaNs: 0
Parch NaNs: 0
Fare NaNs: 0
Cabin NaNs: 687
Embarked NaNs: 2


### Note on `Cabin` variable
It would be nice to include as, undoubtedly, a passenger's position on the ship affects their mortality in event of a major accident but, there are 687 `NaN` values. That's 687 out of 891. Too many null values to be useful, so I'll drop it. 

In [7]:
cleaned_df.drop(columns=['Cabin'], inplace=True)

In [8]:
cleaned_df['Age'].fillna(cleaned_df['Age'].mean(), inplace=True)
cleaned_df['Embarked'].fillna('UNK', inplace=True)
count_nans(cleaned_df)

Survived NaNs: 0
Pclass NaNs: 0
Sex NaNs: 0
Age NaNs: 0
SibSp NaNs: 0
Parch NaNs: 0
Fare NaNs: 0
Embarked NaNs: 0


# Encode Sex and Embarked columns

In [10]:
print_unique(cleaned_df['Embarked'])

['S' 'C' 'Q' 'UNK']


In [12]:
cleaned_df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
cleaned_df['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2, 'UNK': 3}, inplace=True)
display(cleaned_df)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.000000,1,0,7.2500,0
1,1,1,0,38.000000,1,0,71.2833,1
2,1,3,0,26.000000,0,0,7.9250,0
3,1,1,0,35.000000,1,0,53.1000,0
4,0,3,1,35.000000,0,0,8.0500,0
...,...,...,...,...,...,...,...,...
886,0,2,1,27.000000,0,0,13.0000,0
887,1,1,0,19.000000,0,0,30.0000,0
888,0,3,0,29.699118,1,2,23.4500,0
889,1,1,1,26.000000,0,0,30.0000,1


# Split the data
Even though the test set is withheld by Kaggle, I'm only allowed so many attempts, so I'll withhold a small chunk to iterably test hyper-parameter selections.

In [13]:
X = cleaned_df.drop(columns=['Survived'])
y = cleaned_df['Survived']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Normalizing continuous variables between 0 - 1
Via `sklearn`'s `MinMaxScaler()`. An idea to test later would be to normalize `Age` and `Fare` using their own scales.

In [17]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train[['Age', 'Fare']])

MinMaxScaler()

In [18]:
X_train_scaled = X_train.copy()
X_train_scaled[['Age', 'Fare']] = scaler.transform(X_train[['Age', 'Fare']])
X_test_scaled = X_test.copy()
X_test_scaled[['Age','Fare']] = scaler.transform(X_test[['Age', 'Fare']])

In [19]:
display(X_train_scaled)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
165,3,1,0.107816,0,2,0.040062,0
541,3,0,0.107816,4,2,0.061045,0
625,1,1,0.761247,0,0,0.063086,0
388,3,1,0.367921,0,0,0.015086,2
76,3,1,0.367921,0,0,0.015412,0
...,...,...,...,...,...,...,...
106,3,0,0.258608,0,0,0.014932,0
270,1,1,0.367921,0,0,0.060508,0
860,3,1,0.509927,2,0,0.027538,0
435,1,0,0.170646,1,2,0.234224,0


# Oversampling

In [20]:
y_train.value_counts()

0    495
1    306
Name: Survived, dtype: int64

In [24]:
oversampler = SMOTE()
X_train_scaled_os, y_train_os = oversampler.fit_resample(X_train_scaled, y_train)

In [25]:
y_train_os.value_counts()

1    495
0    495
Name: Survived, dtype: int64

# Tensor pre-processing

In [75]:
train_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_train_scaled_os.values, y_train_os)]
validation_ds = [(torch.tensor(X_data, dtype=torch.float, requires_grad=True), torch.tensor(y_data, dtype=torch.float, requires_grad=True)) for X_data, y_data in zip(X_test_scaled.values, y_test.values)]

train_ds[0][0].shape, train_ds[0][0], train_ds[0][1]


(torch.Size([7]),
 tensor([3.0000, 1.0000, 0.1078, 0.0000, 2.0000, 0.0401, 0.0000],
        requires_grad=True),
 tensor(1., requires_grad=True))

In [76]:
train_dataloader = torch.utils.data.DataLoader(train_ds, batch_size=len(train_ds) // 10, shuffle=False)
val_dataloader = torch.utils.data.DataLoader(validation_ds, batch_size=len(validation_ds) // 10, shuffle=False)

# Model
A simple dense neural network.

In [204]:
input_neuron_count = train_ds[0][0].shape[0]
model = nn.Sequential(
    nn.Linear(input_neuron_count, input_neuron_count // 2),
    nn.ReLU(),
    nn.Linear(input_neuron_count // 2, 1)
)

In [205]:
loss_fcn = nn.BCEWithLogitsLoss()
adam_optim = optim.Adam(model.parameters(), lr=1e-1)

# My custom training loop

In [206]:
from typing import Union
def print_t_info(ts: list[Union[torch.tensor, float]], msg: list[str]=None) -> None:
    "Print all relevant tensor info in a pretty interface"
    if isinstance(ts, torch.Tensor): ts = [ts]
    if isinstance(msg, str): msg = [msg]
    if not isinstance(ts, list): raise TypeError(f"Expected tensor or list of tensors got {type(ts)}")
    if not isinstance(msg, list): raise TypeError(f"Expected str or list of strs, go {type(msg)}")
    for count, tensor in enumerate(ts):
        if msg:
            try: print(f"{msg[count]}", end='')
            except IndexError:
                print(f"{msg[-1]}", end='')
            if isinstance(tensor, float):
                print(f"\t{tensor}")
                break
            else:
                print(f"\tShape: {tensor.shape}\tdtype: {tensor.dtype}\tContiguous: {tensor.is_contiguous()}")
        else:
            print(f"Tensor #{count:02}\tShape: {tensor.shape}\tdtype: {tensor.dtype}\tContiguous: {tensor.is_contiguous()}")
        

In [217]:
# I'm reusing my custom training loop
import os

def training_loop(epochs, model, loss_fcn, optimizer, train_dataloader, val_dataloader,
                  *, save_best=True, metrics=True, logging=True):
    """
    Custom training loop
    Parameters:
        epochs: int, number of epochs to train for
        model: nn.Module or subclass thereof, from which to obtain predictions
        loss_fcn: any pytorch loss function
        optimizer: any optimizer
        train_dataloader: PyTorch dataloader from which to pull data
        val_dataloader: "
        save_best: bool, WARNING only use on smaller models, cache and serialize best model at end of training
        metrics: bool, control calculation and printing of numbers to the screen
        logging: bool, control printing of tensor info to screen after each step
    Returns: 
        Trained model
    """
    
    highest_accuracy = 0
    cached_model = None
    for epoch in range(epochs):
        
        if logging: print(f"{len(train_dataloader)} Training batches with:")
            
        for features, labels in train_dataloader:
            labels.unsqueeze_(1)
            
            if logging: print_t_info([features, labels], ["Features: ", "Labels: "])
            
            train_predictions = model(features)
            
            train_loss = loss_fcn(train_predictions, labels)
            
            if logging: print_t_info([train_predictions, train_loss.item()], ["Train Preds:", "Train Loss:"])
            
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            
            
        total = 0
        correct = 0
        
        if logging: print(f"{len(val_dataloader)} Validation batches with:")
            
        for features, labels in val_dataloader:
            labels.unsqueeze_(1)
            if logging: print_t_info([features, labels], ['Val Feats: ', 'Val Labels:'])
            
            val_predictions = model(features)
            
            val_loss = loss_fcn(val_predictions, labels)
            
            if logging: print_t_info([val_predictions, val_loss.item()], ['Val Preds:', 'Val Loss:'])
            
            if metrics:
                total += val_predictions.shape[0]
                correct += int(((val_predictions > 0.5) == labels.type(torch.BoolTensor)).sum())
            
        if metrics:
            print(f"Epoch {epoch:03}\tTrain Loss: {train_loss:.4}\tVal Loss: "
                  f"\t{val_loss:.4}\tCorrect: {correct}\tAccuracy: {correct/total:%}")  
                
        if save_best:
            latest_accuracy = correct/total
            
            if latest_accuracy > highest_accuracy:
                highest_accuracy = latest_accuracy
                cached_model = model
                model_pth = Path(f'./models/{highest_accuracy * 100:.5}_model')
                os.makedirs(str(model_pth.parents[0].absolute()), exist_ok=True)
                torch.save(cached_model, model_pth)
                print(f"Model saved to {model_pth}")
        
    return model

In [218]:
training_loop(100, model, loss_fcn, adam_optim, train_dataloader, val_dataloader, logging=False, save_best=True)

Epoch 000	Train Loss: 0.4355	Val Loss: 	0.211	Correct: 74	Accuracy: 82.222222%
Model saved to models/82.222_model
Epoch 001	Train Loss: 0.4308	Val Loss: 	0.2168	Correct: 74	Accuracy: 82.222222%
Epoch 002	Train Loss: 0.432	Val Loss: 	0.2166	Correct: 74	Accuracy: 82.222222%
Epoch 003	Train Loss: 0.4354	Val Loss: 	0.2093	Correct: 74	Accuracy: 82.222222%
Epoch 004	Train Loss: 0.4288	Val Loss: 	0.2149	Correct: 74	Accuracy: 82.222222%
Epoch 005	Train Loss: 0.4324	Val Loss: 	0.2124	Correct: 74	Accuracy: 82.222222%
Epoch 006	Train Loss: 0.4306	Val Loss: 	0.2112	Correct: 74	Accuracy: 82.222222%
Epoch 007	Train Loss: 0.4283	Val Loss: 	0.2174	Correct: 74	Accuracy: 82.222222%
Epoch 008	Train Loss: 0.429	Val Loss: 	0.2162	Correct: 74	Accuracy: 82.222222%
Epoch 009	Train Loss: 0.43	Val Loss: 	0.2155	Correct: 74	Accuracy: 82.222222%
Epoch 010	Train Loss: 0.4277	Val Loss: 	0.2164	Correct: 74	Accuracy: 82.222222%
Epoch 011	Train Loss: 0.4263	Val Loss: 	0.2172	Correct: 74	Accuracy: 82.222222%
Epoch 012	

Sequential(
  (0): Linear(in_features=7, out_features=3, bias=True)
  (1): ReLU()
  (2): Linear(in_features=3, out_features=1, bias=True)
)

# Time to perform test set predictions

In [220]:
best_model = torch.load('./models/83.333_model')
test_df = pd.read_csv('./data/test.csv')
display(test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


# Preprocess
Below lies a very verbose function which performs all of the aforementioned preprocessing steps in this notebook on the test dataframe.

In [269]:
def preprocess(df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    """Returns DataFrame with all nessecary preprocessing steps performed."""
    
    # Drop irrelevant columns
    cleaned_df = df.drop(columns=['PassengerId', 'Ticket', 'Name', 'Cabin'])
    
    cleaned_df['Sex'].replace({'male': 1, 'female': 0}, inplace=True)
    
    # Fill 'Age' with sensible numbers according to the distribution
    cleaned_df['Age'].fillna(cleaned_df['Age'].mean(), inplace=True)
    
    # Fill 'Embarked' and encode
    cleaned_df['Embarked'].fillna('UNK', inplace=True)
    cleaned_df['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2, 'UNK': 3}, inplace=True)
    
    scaler = MinMaxScaler()
    cleaned_df[['Age', 'Fare']] = scaler.fit_transform(cleaned_df[['Age', 'Fare']])
    
    return cleaned_df 

In [284]:
def create_dl(df: pd.core.frame.DataFrame) -> torch.utils.data.DataLoader:
    """Create a dataloader, from a given DataFrame."""
    
    test_ds = [torch.from_numpy(df.iloc[row,:].values).float() for row in range(len(df))]    # Removing requires_grad
    test_dl = torch.utils.data.DataLoader(test_ds, batch_size=len(test_ds)//10, shuffle=False)
    
    return test_dl

In [318]:
def inference(test_df: pd.core.frame.DataFrame, model: nn.Module, f: Union[Path, str]=None,
             *, series: pd.core.series.Series=None, write: bool=True) -> None:
    """
    Performs inference given a model and a dataframe and optionally write the predictions to a file.
    test_df: Cleaned, normalized DataFrame.
    """
    
    if series is not None: preds_df = series
    
    count = 0
    test_dl = create_dl(test_df)
    all_preds = []
    
    for samples in test_dl:
        print(f"samples looks like: {samples.shape}")
        with torch.no_grad():
            test_preds = model(samples)    # Get predictions
            test_preds.squeeze_()
            
        # Ensure we sigmoid here, my model uses nn.BCEWithLogitsLoss so my model doesn't end with a sigmoid
        all_preds.extend(test_preds.sigmoid().tolist())    

    print(f"Made {len(all_preds)} predictions, there are {len(preds_df)} passengers.")
    preds_df = pd.concat([series, pd.DataFrame(all_preds, columns=['Survived'])],
                         names=['PassengerId', 'Survived'], axis=1)
    
    # Go from continuous number to binary
    is_alive = preds_df['Survived'] > 0.5
    is_dead = preds_df['Survived'] <= 0.5
    preds_df['Survived'][is_alive] = 1
    preds_df['Survived'][is_dead] = 0
    
    display(preds_df)
    
    # Write to csv
    if write: preds_df.to_csv('./data/predictions.csv')

In [277]:
display(test_df)
pp_test_df = preprocess(test_df)
display(pp_test_df)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,1,0.452723,0,0,0.015282,2
1,3,0,0.617566,1,0,0.013663,0
2,2,1,0.815377,0,0,0.018909,2
3,3,1,0.353818,0,0,0.016908,0
4,3,0,0.287881,1,1,0.023984,0
...,...,...,...,...,...,...,...
413,3,1,0.396975,0,0,0.015713,0
414,1,0,0.512066,0,0,0.212559,1
415,3,1,0.505473,0,0,0.014151,0
416,3,1,0.396975,0,0,0.015713,0


In [319]:
inference(pp_test_df, best_model, series=test_df['PassengerId'], write=False)

samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([41, 7])
samples looks like: torch.Size([8, 7])
Made 418 predictions, there are 418 passengers.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_df['Survived'][is_alive] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preds_df['Survived'][is_dead] = 0


Unnamed: 0,PassengerId,Survived
0,892,0.0
1,893,0.0
2,894,0.0
3,895,0.0
4,896,1.0
...,...,...
413,1305,0.0
414,1306,1.0
415,1307,0.0
416,1308,0.0
