In [30]:
import torch, torchvision
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import sklearn
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [31]:
# Install a global "ignore" filter so no Python warning is printed from here on.
# NOTE(review): this is a blanket filter; it hides every warning category, including
# deprecation notices from the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # switch matplotlib to interactive mode

<contextlib.ExitStack at 0x19cdd48f040>

In [85]:
class TitanicDataSet(torch.utils.data.Dataset):
  """Titanic passengers as (feature vector, survived label) pairs.

  The CSV is split 80/20 with a fixed seed, so constructing the class once
  with ``train=True`` and once with ``train=False`` on the same file yields
  two complementary, non-overlapping subsets.

  Preprocessing (min-max scaling of numeric columns, one-hot encoding of the
  remaining columns) is always *fitted on the training split* and then applied
  to the requested split. This keeps the feature layout identical for both
  splits and avoids leaking validation statistics into the preprocessing.
  As a consequence, scaled validation values may fall slightly outside [0, 1],
  and categories that never occur in the training split encode as all zeros.

  Args:
    csv_file: Path to a CSV containing the columns in ``FEATURE_COLUMNS``
      plus ``LABEL_COLUMN``.
    train: ``True`` selects the 80% training split, ``False`` the 20%
      hold-out split.

  Attributes:
    feature_arr: 2-D float array, one row per sample of the selected split
      (scaled numeric columns first, then the one-hot columns).
    label_arr: 1-D array with the ``Survived`` value of each row of
      ``feature_arr``, in the same order.
  """

  FEATURE_COLUMNS = ["Pclass", "Age", "SibSp", "Parch", "Fare", "Sex", "Embarked"]
  LABEL_COLUMN = "Survived"

  def __init__(self, csv_file: str, train: bool):
    original_frame = pd.read_csv(csv_file)

    # Fixed seed: every instance computes the same permutation, which is what
    # makes the train and hold-out instances complementary.
    generator = torch.Generator().manual_seed(42)
    train_indices, test_indices = [
        ds.indices
        for ds in torch.utils.data.random_split(original_frame, [0.8, 0.2], generator=generator)
    ]

    train_frame = original_frame.iloc[train_indices]
    data = train_frame if train else original_frame.iloc[test_indices]

    train_features = train_frame[self.FEATURE_COLUMNS]
    feature_frame = data[self.FEATURE_COLUMNS]

    # Split columns by "numeric vs. everything else" instead of by the object
    # dtype, so the result does not depend on how pandas stores strings.
    # NOTE(review): missing numeric values (if the CSV has any) are not imputed
    # here and would pass through the scaler as NaN -- confirm against the data.
    scaler = MinMaxScaler((0, 1))
    scaler.fit(train_features.select_dtypes(include="number"))
    numerical_features_arr = scaler.transform(feature_frame.select_dtypes(include="number"))

    # handle_unknown="ignore": a category seen only in the hold-out split must
    # not raise; it is encoded as an all-zero group instead.
    onehot_enc = OneHotEncoder(handle_unknown="ignore")
    onehot_enc.fit(train_features.select_dtypes(exclude="number"))
    onehot_features_arr = onehot_enc.transform(
        feature_frame.select_dtypes(exclude="number")
    ).toarray()

    self.feature_arr = np.concatenate([numerical_features_arr, onehot_features_arr], axis=1)
    # Labels must come from the same row subset (and order) as the features,
    # not from the full, unsplit frame.
    self.label_arr = data[self.LABEL_COLUMN].to_numpy()

  def __len__(self):
    """Return the number of samples in the selected split."""
    return len(self.feature_arr)

  def __getitem__(self, idx):
    """Return ``(features, label)`` for ``idx`` (int, sequence, or tensor of indices)."""
    if torch.is_tensor(idx):
      idx = idx.tolist()

    return (self.feature_arr[idx], self.label_arr[idx])


In [86]:
# Build both splits from the same CSV. The class seeds its split generator with a
# fixed value, so the train=True and train=False instances partition the file
# into complementary 80% / 20% subsets.
titanic_train = TitanicDataSet('data/titanic.csv', train=True)
titanic_val = TitanicDataSet('data/titanic.csv', train=False)


In [92]:
# Batch the training split in groups of 64 samples (shuffle is not requested here).
dataloader = torch.utils.data.DataLoader(dataset=titanic_train, batch_size=64)
# Pull the first batch as a sanity check of the collated feature/label tensors.
features, labels = next(iter(dataloader))
# Displayed as the cell output: (batch size, number of feature columns).
features.shape

torch.Size([64, 11])