In [1]:
import os

import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

import torch
import torch.nn as nn
from torch.nn import functional as F

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
train_data = pd.read_csv(os.path.join('data', 'titanic', 'train.csv'))

In [3]:
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
class titanicDataset(Dataset):
    def __init__(self, path):
        super(titanicDataset, self).__init__()
        raw_data = pd.read_csv(path)
        features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
        label = ['Survived']
        self.y = raw_data.loc[:, label]

        def binning(x):
            if x == 0:
                return 0.0
            elif 1 <= x <= 2:
                return 1.0
            elif 3 <= x <= 5:
                return 2.0
            else:
                return 3.0
        binned_family = (raw_data.loc[:, 'SibSp'] + raw_data.loc[:, 'Parch']).apply(binning)
        self.x = raw_data.loc[:, features]
        self.x['binned_family'] = binned_family
        
        self.x['Sex'] = (self.x['Sex'] == 'male').astype(float)
        
        def cat2vec(x):
            if x == 'S':
                return 0.0
            elif x == 'C':
                return 1.0
            elif x == 'Q':
                return 2.0
            else:
                return 0.0
        
        self.x['Embarked'] = self.x['Embarked'].apply(cat2vec)
        
        self.x['Pclass'] = self.x['Pclass'].astype(float) - 1.0
        self.x['Age'] = self.x['Age'].fillna(float(int(self.x['Age'].mean())))
        self.x['Fare'] = self.x['Fare'].fillna(self.x['Fare'].mean())
        
        # as torch dataloader doesn't support returning pd dataframe, convert to a dictionary
        self.x = self.x.to_dict()
        self.y = self.y.to_numpy()
        self.columns = list(self.x.keys())
    
    def __len__(self):
        return self.y.size
        
        
    def __getitem__(self, index):
        out_x = {}
        for col in self.columns:
            out_x[col] = self.x[col][index]
        return out_x, self.y[index]

In [5]:
dataset = titanicDataset(os.path.join('data', 'titanic', 'train.csv'))

In [6]:
dataset[0]

({'Pclass': 2.0,
  'Sex': 1.0,
  'Age': 22.0,
  'Fare': 7.25,
  'Embarked': 0.0,
  'binned_family': 1.0},
 array([0], dtype=int64))

In [7]:
dataloader = DataLoader(dataset, batch_size=2, num_workers=0)

In [62]:
class tabularModel(nn.Module):
    def __init__(self, embedding_size=None, categorical_columns=None, continuous_columns=None):
        super().__init__()
        self.categorical_columns = categorical_columns
        self.continuous_columns = continuous_columns
        if embedding_size is not None and categorical_columns is not None:
            try:
                assert len(embedding_size) == len(categorical_columns)
                self.embeddings = nn.ModuleList([nn.Embedding(categories, size) for categories,size in embedding_size])
                n_emb = sum(e.embedding_dim for e in self.embeddings) #length of all embeddings combined
                self.n_emb, self.n_cont = len(categorical_columns), len(continuous_columns)
                self.lin1 = nn.Linear(self.n_emb + self.n_cont, 256)
                self.lin2 = nn.Linear(256, 64)
                self.lin3 = nn.Linear(64, 4)
                self.bn1 = nn.BatchNorm1d(self.n_cont)
                self.bn2 = nn.BatchNorm1d(256)
                self.bn3 = nn.BatchNorm1d(64)
                self.emb_drop = nn.Dropout(0.6)
                self.drops = nn.Dropout(0.3)
            except AssertionError:
                print("length of embedding size must be equal to the size of categorical columns!")
        else:
            raise ValueError("Embedding size and categorical columns must be specified!")


    def forward(self, x):
        x_cat = []
        x_cont = []
        for cat in self.categorical_columns:
            x_cat.append(x[cat])
        for cat in self.continuous_columns:
            x_cont.append(x[cat])
        # reshape two tensors to shape (batch_size, num_columns, 1)
        x_cat = torch.cat(x_cat).reshape(len(self.categorical_columns), -1, 1).permute(1, 0, 2).long()
        x_cont = torch.cat(x_cont, 0).reshape(len(continuous_columns), -1, 1).permute(1, 0, 2).double()
        
        x = [e(x_cat[:,i]) for i,e in enumerate(self.embeddings)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        x2 = self.bn1(x_cont)
        
        print(x.shape, x2.shape)
        
        x = torch.cat([x, x2], 1)
        
        x = F.relu(self.lin1(x))
        x = self.drops(x)
        x = self.bn2(x)
        x = F.relu(self.lin2(x))
        x = self.drops(x)
        x = self.bn3(x)
        x = self.lin3(x)
        
        return x

In [48]:
categorical_columns = ['Pclass', 'Sex', 'Embarked', 'binned_family']
categories = [3, 2, 3, 4]

In [49]:
embedding_size = list(zip(categories, [128] * 4))
embedding_size

[(3, 128), (2, 128), (3, 128), (4, 128)]

In [50]:
dataset.columns

['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'binned_family']

In [51]:
continuous_columns = [tmp for tmp in dataset.columns if tmp not in categorical_columns]

In [52]:
x, y = next(iter(dataloader))

In [53]:
x_cat = []
x_cont = []
for cat in categorical_columns:
    x_cat.append(x[cat])
for cat in continuous_columns:
    x_cont.append(x[cat])

In [54]:
x_cat, x_cont

([tensor([2., 0.], dtype=torch.float64),
  tensor([1., 0.], dtype=torch.float64),
  tensor([0., 1.], dtype=torch.float64),
  tensor([1., 1.], dtype=torch.float64)],
 [tensor([22., 38.], dtype=torch.float64),
  tensor([ 7.2500, 71.2833], dtype=torch.float64)])

In [55]:
x_cat_tmp = torch.cat(x_cat).reshape(4, -1, 1).permute(1, 0, 2)
x_cat_tmp

tensor([[[2.],
         [1.],
         [0.],
         [1.]],

        [[0.],
         [0.],
         [1.],
         [1.]]], dtype=torch.float64)

In [56]:
x_cat_tmp[:, 0, :]

tensor([[2.],
        [0.]], dtype=torch.float64)

In [57]:
torch.cat(x_cont)#.reshape(4, -1, 1)

tensor([22.0000, 38.0000,  7.2500, 71.2833], dtype=torch.float64)

In [58]:
[e for i, e in enumerate(model.embeddings)]

[Embedding(3, 128), Embedding(2, 128), Embedding(3, 128), Embedding(4, 128)]

In [59]:
tmp_embed = nn.Embedding(3, 128)

In [63]:
model = tabularModel(embedding_size=embedding_size, 
                     categorical_columns=categorical_columns, 
                     continuous_columns=continuous_columns).double()

In [64]:
model(x)

torch.Size([2, 4, 128]) torch.Size([2, 2, 1])


RuntimeError: Sizes of tensors must match except in dimension 1. Got 128 and 1 in dimension 2 (The offending index is 1)

In [35]:
torch.__version__

'1.6.0+cpu'