### Imports

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Opt in to pandas' announced future behaviour so object-dtype results are not
# silently downcast (and the related FutureWarning is not emitted).
pd.set_option('future.no_silent_downcasting', True)

import sys
# Put the parent directory on the import path; presumably that is where the
# project-local `utils` module lives — confirm against the repository layout.
sys.path.append("..")
from utils import prepare_data

### Handling the data

##### Load data, process and audit

In [20]:
# Read the raw train/test CSVs from the data folder one level above the notebook.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# The training call returns the processed frame together with `train_stats`;
# those stats are handed to the test call below.
# NOTE(review): presumably `train_stats` holds values fitted on the training data
# (e.g. for imputation) so that nothing is learned from test — verify in utils.prepare_data.
train_processed, train_stats = prepare_data(train, is_train=True)
test_processed = prepare_data(test, train_stats=train_stats, is_train=False)

# Plain-text preview of the first processed training rows.
print(train_processed.head())

  HomePlanet CryoSleep  Destination    VIP  RoomService  FoodCourt  \
0     Europa     False  TRAPPIST-1e  False          0.0        0.0   
1      Earth     False  TRAPPIST-1e  False        109.0        9.0   
2     Europa     False  TRAPPIST-1e   True         43.0     3576.0   
3     Europa     False  TRAPPIST-1e  False          0.0     1283.0   
4      Earth     False  TRAPPIST-1e  False        303.0       70.0   

   ShoppingMall     Spa  VRDeck  Transported    Age_Group  Total_Spendings  \
0           0.0     0.0     0.0        False        Adult              0.0   
1          25.0   549.0    44.0         True  Young_Adult            736.0   
2           0.0  6715.0    49.0        False       Senior          10383.0   
3         371.0  3329.0   193.0        False        Adult           5176.0   
4         151.0   565.0     2.0         True         Teen           1091.0   

   Is_Spender Cabin_Deck Cabin_Side  Group_Size  
0           0          B          P           1  
1         

In [22]:
# Data audit: report shapes, column names, dtypes and the class balance of the
# processed frames so obvious preprocessing problems show up before modelling.
#
# Objects are handed to print() directly instead of being wrapped in
# f"{...}" (the rendered text is identical), and strings without placeholders
# are plain literals rather than f-strings.
print("=" * 50)
print("DATA AUDIT")
print("=" * 50)
print(f"\nTrain processed shape: {train_processed.shape}")
print(f"Test processed shape: {test_processed.shape}")
print("\nColoumns in train processed:")
print(train_processed.columns)
print("\nData types in train processed")
print(train_processed.dtypes)
print("\nTarget distribution")
# The previous form nested double quotes inside a double-quoted f-string,
# which is a SyntaxError on Python versions before 3.12; printing the Series
# directly avoids the nesting and works on every supported interpreter.
print(train_processed["Transported"].value_counts())

DATA AUDIT

Train processed shape: (8693, 16)
Test processed shape: (4277, 15)

Coloumns in train processed:
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'Age_Group', 'Total_Spendings', 'Is_Spender', 'Cabin_Deck',
       'Cabin_Side', 'Group_Size'],
      dtype='object')

Data types in train processed
HomePlanet           object
CryoSleep            object
Destination          object
VIP                  object
RoomService         float64
FoodCourt           float64
ShoppingMall        float64
Spa                 float64
VRDeck              float64
Transported            bool
Age_Group          category
Total_Spendings     float64
Is_Spender            int64
Cabin_Deck           object
Cabin_Side           object
Group_Size            int64
dtype: object

Target distribution
Transported
True     4378
False    4315
Name: count, dtype: int64


##### Encoding and Normalization

In [23]:
# Separate the target from the features. The boolean target is cast to 0/1
# integers; the feature matrix is everything except that column.
y_train = train_processed["Transported"].astype(int)
X_train = train_processed.drop(columns="Transported")

# Independent (deep) copy so later feature engineering never touches test_processed.
X_final_test = test_processed.copy()

print(f"\nX_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_final_test shape {X_final_test.shape}")

print("\ny_train distribution:")
print(y_train.value_counts())


X_train shape: (8693, 15)
y_train shape: (8693,)
X_final_test shape (4277, 15)

y_train distribution:
Transported
1    4378
0    4315
Name: count, dtype: int64


One-hot encoding for categorical features

In [49]:
categorical_features = ["HomePlanet", "Destination", "Cabin_Deck", "Cabin_Side", "Age_Group"]

# Pin the category set (and its order) of every categorical column from the
# TRAINING data. pd.Categorical(...).categories yields the same levels, in the
# same order, that get_dummies would derive from the training column itself,
# so the training encoding is unchanged.
#
# Why this matters: with drop_first=True, get_dummies drops the first level it
# sees. Encoding train and test independently lets each frame pick its own
# "first" level, so a test set that happens to lack train's reference level
# would drop a different one and end up with a silently wrong encoding after
# alignment. Sharing one dtype guarantees both frames drop the same level and
# produce the same dummy columns.
category_dtypes = {
    col: pd.CategoricalDtype(categories=pd.Categorical(X_train[col]).categories)
    for col in categorical_features
}

# Levels that only occur in the test data become NaN under the shared dtype and
# therefore encode as all zeros — the same outcome the left-join alignment
# produced before by discarding test-only dummy columns.
X_train_encoded = pd.get_dummies(
    X_train.astype(category_dtypes),
    columns=categorical_features,
    drop_first=True,
    dtype=int,
)
X_final_test_encoded = pd.get_dummies(
    X_final_test.astype(category_dtypes),
    columns=categorical_features,
    drop_first=True,
    dtype=int,
)

# Kept as a safety net: force the test frame onto exactly the training columns
# (same set, same order), zero-filling anything that is missing.
X_train_encoded, X_final_test_encoded = X_train_encoded.align(X_final_test_encoded, join="left", axis=1, fill_value=0)

print(f"Encoded train shape: {X_train_encoded.shape}")
print(f"Encoded final test shape: {X_final_test_encoded.shape}")


Encoded train shape: (8693, 30)
Encoded final test shape: (4277, 30)


In [50]:
# Show the first ten encoded training rows as the cell's displayed result.
X_train_encoded.head(10)

Unnamed: 0,CryoSleep,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_Spendings,Is_Spender,Group_Size,...,Cabin_Deck_F,Cabin_Deck_G,Cabin_Deck_T,Cabin_Deck_Unknown,Cabin_Side_S,Cabin_Side_Unknown,Age_Group_Teen,Age_Group_Young_Adult,Age_Group_Adult,Age_Group_Senior
0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,False,False,109.0,9.0,25.0,549.0,44.0,736.0,1,1,...,1,0,0,0,1,0,0,1,0,0
2,False,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,1,2,...,0,0,0,0,1,0,0,0,0,1
3,False,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,1,2,...,0,0,0,0,1,0,0,0,1,0
4,False,False,303.0,70.0,151.0,565.0,2.0,1091.0,1,1,...,1,0,0,0,1,0,1,0,0,0
5,False,False,0.0,483.0,0.0,291.0,0.0,774.0,1,1,...,1,0,0,0,0,0,0,0,1,0
6,False,False,42.0,1539.0,3.0,0.0,0.0,1584.0,1,2,...,1,0,0,0,1,0,0,1,0,0
7,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0,2,...,0,1,0,0,1,0,0,1,0,0
8,False,False,0.0,785.0,17.0,216.0,0.0,1018.0,1,1,...,1,0,0,0,1,0,0,0,1,0
9,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0,3,...,0,0,0,0,0,0,1,0,0,0


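Standardization of the numerical features (StandardScaler: zero mean, unit variance, fitted on the training set only)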

In [51]:
from sklearn.preprocessing import StandardScaler
# One shared scaler instance: it is fitted on the training columns in the next
# cell and then reused, unchanged, to transform the final test set.
scaler = StandardScaler()

In [52]:
numerical_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Total_Spendings", "Group_Size"]

# Scale into fresh copies so the encoded frames remain available unmodified.
X_train_scaled = X_train_encoded.copy()
X_final_test_scaled = X_final_test_encoded.copy()

# Fit on the training rows only and transform them in the same step; the test
# rows are then transformed with those already-fitted training statistics.
X_train_scaled[numerical_cols] = scaler.fit_transform(X_train_encoded[numerical_cols])
X_final_test_scaled[numerical_cols] = scaler.transform(X_final_test_encoded[numerical_cols])

print("Scaled!")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_final_test_scaled.shape}")
print(X_train_scaled.head(10))

Scaled!
X_train_scaled shape: (8693, 30)
X_test_scaled shape: (4277, 30)
  CryoSleep    VIP  RoomService  FoodCourt  ShoppingMall       Spa    VRDeck  \
0     False  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   
1     False  False    -0.168073  -0.275387     -0.241771  0.217158 -0.224205   
2     False   True    -0.268001   1.959998     -0.283579  5.695623 -0.219796   
3     False  False    -0.333105   0.523010      0.336851  2.687176 -0.092818   
4     False  False     0.125652  -0.237159     -0.031059  0.231374 -0.261240   
5     False  False    -0.333105   0.021662     -0.283579 -0.012074 -0.263003   
6     False  False    -0.269515   0.683441     -0.278562 -0.270626 -0.263003   
7      True  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   
8     False  False    -0.333105   0.210921     -0.255149 -0.078711 -0.263003   
9      True  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   

   Total_Spendings  Is_Spender  Group_Size  ..

Convert the binary columns (CryoSleep, VIP) to integers

In [56]:
# Cast the two True/False flag columns to 0/1 integers in both feature frames,
# train first and then test, one column at a time.
binary_cols = ["CryoSleep", "VIP"]
for frame in (X_train_scaled, X_final_test_scaled):
    for col in binary_cols:
        frame[col] = frame[col].astype(int)

# Summary of the resulting dtypes in the training frame.
print("Data types after conversion:")
print(X_train_scaled.dtypes.value_counts())

Data types after conversion:
int64      23
float64     7
Name: count, dtype: int64


### Training

In [None]:
from sklearn.model_selection import train_test_split