In [43]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict

In [44]:
import torch
from torch import nn
from torchmetrics import Accuracy

In [45]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [46]:
import category_encoders as ce

In [47]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [48]:
torch.manual_seed(101)

<torch._C.Generator at 0x1a08047ebd0>

# Train Data

In [49]:
train_data = pd.read_csv(r'C:\Users\User\Desktop\Projects\Kaggle\Kaggle_Practice\spaceship_titanic\data\train.csv', index_col='PassengerId')
train_data.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [50]:
target_col = 'Transported'

In [51]:
expenditures = ["RoomService", 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [52]:
def preprocess(data: pd.DataFrame, cols2sum: List[str]):
    data['Total_exp'] = data[cols2sum].sum(axis=1)
    data["InGroup"]   = create_ingroup_from_index(data)
    parsed_cabin = parse_cabin(data)

    return parsed_cabin.drop(['Name'], axis=1)

def parse_cabin(data: pd.DataFrame):
    data['Deck']     = data['Cabin'].apply(lambda x: x.split(sep='/')[0] if type(x) == str else x)
    # data['Cabin_#']  = data['Cabin'].apply(lambda x: x.split(sep='/')[1] if type(x) == str else x)
    data['Portside'] = data['Cabin'].apply(lambda x: x.split(sep='/')[2] if type(x) == str else x)

    return data.drop(['Cabin'] + expenditures, axis=1)

def create_ingroup_from_index(data: pd.DataFrame):
    df = data.index.to_frame()
    df[['group_id', 'ingroup_id']] = df['PassengerId'].str.split("_", n = 1, expand = True)
    grouped = df.groupby('group_id').count()
    grouped['ingroup'] = np.where(grouped['ingroup_id'] > 1, 1, 0)
    return pd.merge(df['group_id'], grouped['ingroup'], left_on=['group_id'], right_index=True)['ingroup']

In [53]:
train_df = preprocess(data=train_data, cols2sum=expenditures)

# Impute nulls

In [54]:
most_freq_imputer = SimpleImputer(strategy='most_frequent')
median_imputer    = SimpleImputer(strategy='median')
mean_imputer      = SimpleImputer(strategy='mean')

In [55]:
train_df.isnull().sum()

HomePlanet     201
CryoSleep      217
Destination    182
Age            179
VIP            203
Transported      0
Total_exp        0
InGroup          0
Deck           199
Portside       199
dtype: int64

In [56]:
def impute_col(data: pd.DataFrame, col: str, imputer):
    imputed = imputer.fit_transform(data[[col]]).ravel()
    print(f"len of original and imputed column name {col} are equal: ",len(data[col]) == len(imputed))
    return imputed

In [57]:
train_df['Age']         = impute_col(train_df, 'Age'        , mean_imputer)

train_df['Deck']        = impute_col(train_df, 'Deck'       , most_freq_imputer)
train_df['Portside']    = impute_col(train_df, 'Portside'   , most_freq_imputer)
train_df['Destination'] = impute_col(train_df, 'Destination', most_freq_imputer)
train_df['CryoSleep']   = impute_col(train_df, 'CryoSleep'  , most_freq_imputer)
train_df['HomePlanet']  = impute_col(train_df, 'HomePlanet' , most_freq_imputer)
train_df['VIP']         = impute_col(train_df, 'VIP',         most_freq_imputer)

len of original and imputed column name Age are equal:  True
len of original and imputed column name Deck are equal:  True
len of original and imputed column name Portside are equal:  True
len of original and imputed column name Destination are equal:  True
len of original and imputed column name CryoSleep are equal:  True
len of original and imputed column name HomePlanet are equal:  True
len of original and imputed column name VIP are equal:  True


In [58]:
def change_type_cols_inplace(data: pd.DataFrame, cols: List[str], change_type = bool):
    for col in cols:
        data[col] = data[col].astype(change_type)

In [59]:
change_type_cols_inplace(train_df, cols=['VIP','CryoSleep', target_col], change_type=float)

In [60]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   HomePlanet   8693 non-null   object 
 1   CryoSleep    8693 non-null   float64
 2   Destination  8693 non-null   object 
 3   Age          8693 non-null   float64
 4   VIP          8693 non-null   float64
 5   Transported  8693 non-null   float64
 6   Total_exp    8693 non-null   float64
 7   InGroup      8693 non-null   int64  
 8   Deck         8693 non-null   object 
 9   Portside     8693 non-null   object 
dtypes: float64(5), int64(1), object(4)
memory usage: 747.1+ KB


# Scaling

In [61]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [62]:
min_max_scale  = MinMaxScaler()
standard_scale = StandardScaler()

In [63]:
train_df['Age'] = min_max_scale.fit_transform(train_df[['Age']])

# Split `Train` and `Val`

In [64]:
TRAIN_VAL_CONSTANT = len(train_df) // 10
TRAIN_VAL_CONSTANT

869

In [65]:
train = train_df[:-TRAIN_VAL_CONSTANT]
val   = train_df[-TRAIN_VAL_CONSTANT:]

In [66]:
train.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Total_exp,InGroup,Deck,Portside
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0001_01,Europa,0.0,TRAPPIST-1e,0.493671,0.0,0.0,0.0,0,B,P
0002_01,Earth,0.0,TRAPPIST-1e,0.303797,0.0,1.0,736.0,0,F,S
0003_01,Europa,0.0,TRAPPIST-1e,0.734177,1.0,0.0,10383.0,1,A,S
0003_02,Europa,0.0,TRAPPIST-1e,0.417722,0.0,0.0,5176.0,1,A,S
0004_01,Earth,0.0,TRAPPIST-1e,0.202532,0.0,1.0,1091.0,0,F,S


# Encoding

In [67]:
cols2encode = ['HomePlanet', 'Destination' ,'Deck', 'Portside']

In [68]:
def encode_loo(df: pd.DataFrame, loo_enc: ce.LeaveOneOutEncoder ,target_col: str = target_col):    
    return loo_enc.fit_transform(df, df[target_col])

In [69]:
loo_enc = ce.LeaveOneOutEncoder(cols=cols2encode)

Separating the **Leave One Out** encoding for the `train` and `val` as the encoder grabs the mean of the `target_col` for each category. Thus, when evaluating with a validation set we should act as the `target_col` is not known.

In [70]:
X_train = loo_enc.fit_transform(train.drop(target_col, axis=1), train[target_col])
X_val   = loo_enc.transform(val.drop(target_col, axis=1))

In [71]:
train_enc = X_train.merge(train[target_col], left_index=True, right_index=True)
train_enc.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Total_exp,InGroup,Deck,Portside,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0001_01,0.663556,0.0,0.476632,0.493671,0.0,0.0,0,0.732017,0.45,0.0
0002_01,0.431528,0.0,0.47645,0.303797,0.0,736.0,0,0.445397,0.561603,1.0
0003_01,0.663556,0.0,0.476632,0.734177,1.0,10383.0,1,0.508696,0.561851,0.0
0003_02,0.663556,0.0,0.476632,0.417722,0.0,5176.0,1,0.508696,0.561851,0.0
0004_01,0.431528,0.0,0.47645,0.202532,0.0,1091.0,0,0.445397,0.561603,1.0


In [72]:
val_enc = X_val.merge(val[target_col], left_index=True, right_index=True)
val_enc.head()

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,Total_exp,InGroup,Deck,Portside,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
8346_01,0.43166,0.0,0.476545,0.364911,0.0,774.0,0,0.445604,0.449881,0.0
8348_01,0.524497,1.0,0.476545,0.594937,0.0,0.0,0,0.355414,0.449881,1.0
8354_01,0.663212,1.0,0.476545,0.189873,0.0,0.0,1,0.730986,0.561712,1.0
8354_02,0.663212,1.0,0.476545,0.164557,0.0,0.0,1,0.730986,0.561712,1.0
8356_01,0.43166,1.0,0.513475,0.240506,0.0,0.0,0,0.522609,0.449881,0.0


# Create `TensorDataset` and `DataLoader`

In [73]:
from torch.utils.data import DataLoader, TensorDataset

In [74]:
import torch

In [75]:
def create_dataset_dataloader(df: pd.DataFrame, target_col: str, dtype = torch.float32, testing: bool = False) -> Tuple[TensorDataset, DataLoader]:
    if testing:
        dataset = TensorDataset(torch.tensor(df.values, dtype=dtype))  
        dataloader = DataLoader(dataset=dataset, batch_size=len(dataset), shuffle=True, drop_last=True)

        return dataset, dataloader
    else:    
        dataset = TensorDataset(torch.tensor(df.drop(target_col, axis=1).values, dtype=dtype),
                                torch.tensor(df[target_col].values, dtype=dtype)) 
        dataloader = DataLoader(dataset=dataset, batch_size=len(dataset), shuffle=True, drop_last=True)

        return dataset, dataloader


In [76]:
train_dataset, train_dataloader = create_dataset_dataloader(train_enc, target_col)
val_dataset,  val_dataloader    = create_dataset_dataloader(val_enc, target_col)

In [77]:
len(train_dataloader)

1

# Model

In [78]:
NUM_FEATS = len(train.drop(target_col, axis=1).columns)
NUM_CLASSES = 1

In [None]:
class BasicNNModel(nn.Module):
    def __init__(self, input_features, output_features, hidden_units):
        super().__init__()
        self.linear_layer_stack = nn.Sequential(
            nn.Linear(in_features=input_features, out_features=2**hidden_units),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=2**hidden_units, out_features=2**hidden_units),
            nn.Dropout(p=0.01),
            nn.Linear(in_features=2**hidden_units, out_features=output_features), 
        )
    def forward(self, x):
        return self.linear_layer_stack(x)

In [80]:
model = BasicNNModel(input_features=NUM_FEATS, output_features=NUM_CLASSES, hidden_units=6)

In [None]:
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [82]:
accuracy_ = Accuracy('binary')

# Model Train Loop

## Train Step

In [None]:
def train_step(model: nn.Module,
               dataloader: torch.utils.data.DataLoader,
               loss_fn: nn.Module,
               optimizer: torch.optim.Optimizer,
               device: torch.device) -> Tuple[float, float]:
    
    model.train()
    train_loss, train_acc = 0, 0

    for batch, (X,y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        y_pred_logits = model(X).ravel()
        loss = loss_fn(y_pred_logits, y)
        train_loss += loss.item()
        y_pred = torch.sigmoid(y_pred_logits).round()
        train_acc = accuracy_(y_pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step



    train_loss = train_loss / len(dataloader)
    train_acc = train_acc / len(dataloader)
    return train_loss, train_acc 

## Validation Step

In [None]:
def validation_step(model: nn.Module,
              dataloader: torch.utils.data.DataLoader,
              loss_fn: nn.Module,
              device: torch.device) -> Tuple[float, float]:
    model.eval()
    val_loss, val_acc = 0, 0
    with torch.inference_mode():
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            val_pred_logits = model(X).ravel()
            loss = loss_fn(val_pred_logits, y)
            val_loss += loss.item()
            val_pred = torch.sigmoid(val_pred_logits).round()
            val_acc = accuracy_(val_pred, y)

    val_loss = val_loss /len(dataloader)
    val_acc = val_acc /len(dataloader)
    return val_loss, val_acc



In [85]:
def train_model(model: nn.Module,
          train_dataloader: torch.utils.data.DataLoader,
          val_dataloader: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          loss_fn: nn.Module,
          epochs: int,
          device: torch.device) -> Dict[str, List]:
    results = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
    for epoch in range(epochs):
        train_loss, train_acc = train_step(model     = model,
                                          dataloader = train_dataloader,
                                          loss_fn    = loss_fn,
                                          optimizer  = optimizer,
                                          device     = device)
        val_loss, val_acc = validation_step(model      = model,
                                            dataloader = val_dataloader,
                                            loss_fn    = loss_fn,
                                            device     = device)
        if epoch % 10 == 0:
            print(f"Epoch: {epoch} | Loss: {train_loss:.5f}, Acc: {train_acc:.2f}% | Val Loss: {val_loss:.5f}, Val Acc: {val_acc:.2f}%")
        if epoch == epochs-1:
            print(f"Last Epoch:\nEpoch: {epoch} | Loss: {train_loss:.5f}, Acc: {train_acc:.2f}% | Val Loss: {val_loss:.5f}, Val Acc: {val_acc:.2f}%")
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["val_loss"].append(val_loss)
        results["val_acc"].append(val_acc)
    
    return results

In [86]:
results = train_model(model=  model,
          train_dataloader = train_dataloader,
          val_dataloader = val_dataloader,
          optimizer = optimizer,
          loss_fn = loss_fn,
          epochs = 100,
          device = device)

Epoch: 0 | Loss: 22.84910, Acc: 0.74% | Val Loss: 17.43481, Val Acc: 0.74%
Epoch: 10 | Loss: 22.97643, Acc: 0.74% | Val Loss: 17.43481, Val Acc: 0.74%
Epoch: 20 | Loss: 22.58407, Acc: 0.74% | Val Loss: 17.43481, Val Acc: 0.74%
Epoch: 30 | Loss: 22.71318, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%
Epoch: 40 | Loss: 22.74231, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%
Epoch: 50 | Loss: 22.74458, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%
Epoch: 60 | Loss: 22.70651, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%
Epoch: 70 | Loss: 22.65010, Acc: 0.74% | Val Loss: 17.43481, Val Acc: 0.74%
Epoch: 80 | Loss: 22.89098, Acc: 0.74% | Val Loss: 17.43481, Val Acc: 0.74%
Epoch: 90 | Loss: 22.68027, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%
Last Epoch:
Epoch: 99 | Loss: 22.63132, Acc: 0.74% | Val Loss: 17.43480, Val Acc: 0.74%


NameError: name 'stop' is not defined

# Test

In [88]:
test_data = pd.read_csv(r'C:\Users\User\Desktop\Projects\Kaggle\Kaggle_Practice\spaceship_titanic\data\test.csv', index_col='PassengerId')

## Preprocess

In [89]:
test_df = preprocess(data=test_data, cols2sum=expenditures)

## Impute

In [90]:
test_df['Age']         = impute_col(test_df, 'Age'        , mean_imputer)

test_df['Deck']        = impute_col(test_df, 'Deck'       , most_freq_imputer)
test_df['Portside']    = impute_col(test_df, 'Portside'   , most_freq_imputer)
test_df['Destination'] = impute_col(test_df, 'Destination', most_freq_imputer)
test_df['CryoSleep']   = impute_col(test_df, 'CryoSleep'  , most_freq_imputer)
test_df['HomePlanet']  = impute_col(test_df, 'HomePlanet' , most_freq_imputer)
test_df['VIP']         = impute_col(test_df, 'VIP',         most_freq_imputer)

len of original and imputed column name Age are equal:  True
len of original and imputed column name Deck are equal:  True
len of original and imputed column name Portside are equal:  True
len of original and imputed column name Destination are equal:  True
len of original and imputed column name CryoSleep are equal:  True
len of original and imputed column name HomePlanet are equal:  True
len of original and imputed column name VIP are equal:  True


In [91]:
change_type_cols_inplace(test_df, cols=['VIP','CryoSleep'], change_type=float)

## Scale

In [92]:
test_df['Age'] = min_max_scale.fit_transform(test_df[['Age']])

## Encode

In [93]:
test_df_enc = loo_enc.transform(test_df)

In [95]:
test_dataset, test_dataloader = create_dataset_dataloader(test_df_enc, target_col, testing=True)

In [96]:
model.eval()
with torch.no_grad():
    test_pred_logits = model(test_dataset.tensors[0]).to(device)#
    test_pred = torch.sigmoid(test_pred_logits).round()

# Final Results

In [98]:
final = pd.DataFrame(test_pred.numpy().ravel(), index=test_data.index, columns=['Transported']).reset_index()
final['Transported'] = final['Transported'].astype(bool)
final

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [None]:
# final.to_csv(r'C:\Users\User\Desktop\Projects\Kaggle\Kaggle_Practice\spaceship_titanic\results\NN_MostFreq_impute_TotalExp_InGroup_LOOencoder.csv', index=False)

# Save Model

In [None]:
# torch.save(model.state_dict(), r'C:\Users\User\Desktop\Projects\Kaggle\Kaggle_Practice\spaceship_titanic\models\NN_LinDropLinDropLin_6multiplier')