### Imports

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('future.no_silent_downcasting', True)

import sys
sys.path.append("..")
from utils import prepare_data

### Handling the data

##### Load data, process and audit

In [13]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

train_processed, train_stats = prepare_data(train, is_train=True)
test_processed = prepare_data(test, train_stats=train_stats, is_train=False)

print(train_processed.head())

  HomePlanet CryoSleep  Destination    VIP  RoomService  FoodCourt  \
0     Europa     False  TRAPPIST-1e  False          0.0        0.0   
1      Earth     False  TRAPPIST-1e  False        109.0        9.0   
2     Europa     False  TRAPPIST-1e   True         43.0     3576.0   
3     Europa     False  TRAPPIST-1e  False          0.0     1283.0   
4      Earth     False  TRAPPIST-1e  False        303.0       70.0   

   ShoppingMall     Spa  VRDeck  Transported    Age_Group  Total_Spendings  \
0           0.0     0.0     0.0        False        Adult              0.0   
1          25.0   549.0    44.0         True  Young_Adult            736.0   
2           0.0  6715.0    49.0        False       Senior          10383.0   
3         371.0  3329.0   193.0        False        Adult           5176.0   
4         151.0   565.0     2.0         True         Teen           1091.0   

   Is_Spender Cabin_Deck Cabin_Side  Group_Size  
0           0          B          P           1  
1         

In [14]:
print("=" * 50)
print("DATA AUDIT")
print("=" * 50)
print(f"\nTrain processed shape: {train_processed.shape}")
print(f"Test processed shape: {test_processed.shape}")
print(f"\nColoumns in train processed:")
print(f"{train_processed.columns}")
print(f"\nData types in train processed")
print(f"{train_processed.dtypes}")
print(f"\nTarget distribution")
print(f"{train_processed["Transported"].value_counts()}")

DATA AUDIT

Train processed shape: (8693, 16)
Test processed shape: (4277, 15)

Coloumns in train processed:
Index(['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService',
       'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Transported',
       'Age_Group', 'Total_Spendings', 'Is_Spender', 'Cabin_Deck',
       'Cabin_Side', 'Group_Size'],
      dtype='object')

Data types in train processed
HomePlanet           object
CryoSleep            object
Destination          object
VIP                  object
RoomService         float64
FoodCourt           float64
ShoppingMall        float64
Spa                 float64
VRDeck              float64
Transported            bool
Age_Group          category
Total_Spendings     float64
Is_Spender            int64
Cabin_Deck           object
Cabin_Side           object
Group_Size            int64
dtype: object

Target distribution
Transported
True     4378
False    4315
Name: count, dtype: int64


##### Encoding and Normalization

In [15]:
X_train_first = train_processed.drop("Transported", axis=1)
y_train_first = train_processed["Transported"].astype(int)

X_final_test = test_processed.copy(deep=True)

print(f"\nX_train shape: {X_train_first.shape}")
print(f"y_train shape: {y_train_first.shape}")
print(f"X_final_test shape {X_final_test.shape}")

print(f"\ny_train distribution:")
print(f"{y_train_first.value_counts()}")


X_train shape: (8693, 15)
y_train shape: (8693,)
X_final_test shape (4277, 15)

y_train distribution:
Transported
1    4378
0    4315
Name: count, dtype: int64


One Hot Encoding for categorical features

In [16]:
categorical_features = ["HomePlanet", "Destination", "Cabin_Deck", "Cabin_Side", "Age_Group"]

X_train_encoded = pd.get_dummies(X_train_first, columns=categorical_features, drop_first=True, dtype=int)
X_final_test_encoded = pd.get_dummies(X_final_test, columns=categorical_features, drop_first=True, dtype=int)

X_train_encoded, X_final_test_encoded = X_train_encoded.align(X_final_test_encoded, join="left", axis=1, fill_value=0)

print(f"Encoded train shape: {X_train_encoded.shape}")
print(f"Encoded final test shape: {X_final_test_encoded.shape}")


Encoded train shape: (8693, 30)
Encoded final test shape: (4277, 30)


In [17]:
X_train_encoded.head(10)

Unnamed: 0,CryoSleep,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_Spendings,Is_Spender,Group_Size,...,Cabin_Deck_F,Cabin_Deck_G,Cabin_Deck_T,Cabin_Deck_Unknown,Cabin_Side_S,Cabin_Side_Unknown,Age_Group_Teen,Age_Group_Young_Adult,Age_Group_Adult,Age_Group_Senior
0,False,False,0.0,0.0,0.0,0.0,0.0,0.0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,False,False,109.0,9.0,25.0,549.0,44.0,736.0,1,1,...,1,0,0,0,1,0,0,1,0,0
2,False,True,43.0,3576.0,0.0,6715.0,49.0,10383.0,1,2,...,0,0,0,0,1,0,0,0,0,1
3,False,False,0.0,1283.0,371.0,3329.0,193.0,5176.0,1,2,...,0,0,0,0,1,0,0,0,1,0
4,False,False,303.0,70.0,151.0,565.0,2.0,1091.0,1,1,...,1,0,0,0,1,0,1,0,0,0
5,False,False,0.0,483.0,0.0,291.0,0.0,774.0,1,1,...,1,0,0,0,0,0,0,0,1,0
6,False,False,42.0,1539.0,3.0,0.0,0.0,1584.0,1,2,...,1,0,0,0,1,0,0,1,0,0
7,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0,2,...,0,1,0,0,1,0,0,1,0,0
8,False,False,0.0,785.0,17.0,216.0,0.0,1018.0,1,1,...,1,0,0,0,1,0,0,0,1,0
9,True,False,0.0,0.0,0.0,0.0,0.0,0.0,0,3,...,0,0,0,0,0,0,1,0,0,0


Normalization

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [19]:
numerical_cols = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Total_Spendings", "Group_Size"]

scaler.fit(X_train_encoded[numerical_cols])

X_train_scaled = X_train_encoded.copy(deep=True)
X_final_test_scaled = X_final_test_encoded.copy(deep=True)

X_train_scaled[numerical_cols] = scaler.transform(X_train_encoded[numerical_cols])
X_final_test_scaled[numerical_cols] = scaler.transform(X_final_test_encoded[numerical_cols])

print("Scaled!")
print(f"X_train_scaled shape: {X_train_scaled.shape}")
print(f"X_test_scaled shape: {X_final_test_scaled.shape}")
print(f"{X_train_scaled.head(10)}")

Scaled!
X_train_scaled shape: (8693, 30)
X_test_scaled shape: (4277, 30)
  CryoSleep    VIP  RoomService  FoodCourt  ShoppingMall       Spa    VRDeck  \
0     False  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   
1     False  False    -0.168073  -0.275387     -0.241771  0.217158 -0.224205   
2     False   True    -0.268001   1.959998     -0.283579  5.695623 -0.219796   
3     False  False    -0.333105   0.523010      0.336851  2.687176 -0.092818   
4     False  False     0.125652  -0.237159     -0.031059  0.231374 -0.261240   
5     False  False    -0.333105   0.021662     -0.283579 -0.012074 -0.263003   
6     False  False    -0.269515   0.683441     -0.278562 -0.270626 -0.263003   
7      True  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   
8     False  False    -0.333105   0.210921     -0.255149 -0.078711 -0.263003   
9      True  False    -0.333105  -0.281027     -0.283579 -0.270626 -0.263003   

   Total_Spendings  Is_Spender  Group_Size  ..

Convert binary cols to numbers

In [20]:
X_train_scaled["CryoSleep"] = X_train_scaled["CryoSleep"].astype(int)
X_train_scaled["VIP"] = X_train_scaled["VIP"].astype(int)

X_final_test_scaled["CryoSleep"] = X_final_test_scaled["CryoSleep"].astype(int)
X_final_test_scaled["VIP"] = X_final_test_scaled["VIP"].astype(int)

print("Data types after conversion:")
print(X_train_scaled.dtypes.value_counts())

Data types after conversion:
int64      23
float64     7
Name: count, dtype: int64


### Training

#### Split data

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_train_scaled,
    y_train_first, 
    test_size=0.2, 
    random_state=42,
    stratify=y_train_first
)

print(f"\nX_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_train distribution: {y_train.value_counts(normalize=True)}")

print("=" * (50))

print(f"\nX_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_val distribution: {y_val.value_counts(normalize=True)}")


X_train shape: (6954, 30)
y_train shape: (6954,)
y_train distribution: Transported
1    0.503595
0    0.496405
Name: proportion, dtype: float64

X_val shape: (1739, 30)
y_val shape: (1739,)
y_val distribution: Transported
1    0.503738
0    0.496262
Name: proportion, dtype: float64


#### Choosing model

Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [33]:
LR = LogisticRegression(random_state=42)

LR.fit(X_train, y_train)
y_pred = LR.predict(X_val)

print("Accuracy for Logistic Regression is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Logistic Regression is:
0.79068

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.79       863
           1       0.79      0.80      0.79       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion Matrix:
[[675 188]
 [176 700]]


Grid Search for log reg

In [39]:
from sklearn.model_selection import GridSearchCV

param_grid ={
    'C': [0.01, 0.1, 1, 10]
}

grid_LR = GridSearchCV(
                    estimator=LogisticRegression(random_state=42),
                    param_grid=param_grid,
                    scoring="accuracy",
                    cv=5,
                    verbose=1,
                    n_jobs=-1
                    )
grid_LR.fit(X_train, y_train)

print(f"Best Parameters: {grid_LR.best_params_}")
print(f"Best CV score: {grid_LR.best_score_:.5f}")

best_model_lr = grid_LR.best_estimator_

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Parameters: {'C': 10}
Best CV score: 0.79163


In [48]:
y_pred = best_model_lr.predict(X_val)

print("Accuracy for Tuned Random Forest is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Tuned Random Forest is:
0.79011

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.78      0.79       863
           1       0.79      0.80      0.79       876

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739


Confusion Matrix:
[[673 190]
 [175 701]]


Random Forest

In [36]:
RF = RandomForestClassifier(random_state=42, n_jobs=-1)

RF.fit(X_train, y_train)
y_pred = RF.predict(X_val)

print("Accuracy for Random Forest is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Random Forest is:
0.80449

Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.83      0.81       863
           1       0.82      0.78      0.80       876

    accuracy                           0.80      1739
   macro avg       0.81      0.80      0.80      1739
weighted avg       0.81      0.80      0.80      1739


Confusion Matrix:
[[714 149]
 [191 685]]


Grid Search for Random Forest

In [46]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "max_features": ['sqrt', 'log2'],
}
grid_RF = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1    
)

grid_RF.fit(X_train, y_train)

print(f"Best Parameters: {grid_RF.best_params_}")
print(f"Best Score: {grid_RF.best_score_}")

best_model_rf = grid_RF.best_estimator_

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best Parameters: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
Best Score: 0.7966656150277477


In [47]:
y_pred = best_model_rf.predict(X_val)

print("Accuracy for Tuned Random Forest is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Tuned Random Forest is:
0.79701

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80       863
           1       0.80      0.80      0.80       876

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739


Confusion Matrix:
[[687 176]
 [177 699]]


XGBoost

In [52]:
import xgboost as xgb
from xgboost import XGBClassifier

In [53]:
xgb_model = XGBClassifier(
    random_state=42,
    n_jobs=-1,
    eval_metric="logloss"
)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_val)

print("Accuracy for Tuned Random Forest is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Tuned Random Forest is:
0.80966

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.80      0.81       863
           1       0.81      0.82      0.81       876

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739


Confusion Matrix:
[[694 169]
 [162 714]]


Grid Search for XGBoost

In [54]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_xgb = GridSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='logloss', n_jobs=-1),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train, y_train)

print(f"Best Parameters: {grid_xgb.best_params_}")
print(f"Best Score: {grid_xgb.best_score_}")

best_model_xgb = grid_xgb.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.3, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best Score: 0.8011222194063585


In [55]:
y_pred = best_model_xgb.predict(X_val)

print("Accuracy for Tuned Random Forest is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for Tuned Random Forest is:
0.80794

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       863
           1       0.80      0.83      0.81       876

    accuracy                           0.81      1739
   macro avg       0.81      0.81      0.81      1739
weighted avg       0.81      0.81      0.81      1739


Confusion Matrix:
[[676 187]
 [147 729]]


In [64]:
from lightgbm import LGBMClassifier

lgbm_model = LGBMClassifier(random_state=42, n_jobs=-1)


lgbm_model.fit(X_train, y_train)

y_pred = lgbm_model.predict(X_val)

print("Accuracy for LightGBM is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1583
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380
Accuracy for LightGBM is:
0.80449

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.78      0.80       863
           1       0.79      0.83      0.81       876

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739


Confusion Matrix:
[[676 187]
 [153 723]]


In [66]:
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 6, -1],
    'learning_rate': [0.01, 0.1, 0.3],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_lgbm = GridSearchCV(
    estimator=LGBMClassifier(random_state=42, n_jobs=-1),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_lgbm.fit(X_train, y_train)

print(f"Best Parameters: {grid_lgbm.best_params_}")
print(f"Best Score: {grid_lgbm.best_score_}")

best_model_lgbm = grid_lgbm.best_estimator_

Fitting 5 folds for each of 108 candidates, totalling 540 fits
[LightGBM] [Info] Number of positive: 2801, number of negative: 2762
[LightGBM] [Info] Number of positive: 2801, number of negative: 2762
[LightGBM] [Info] Number of positive: 2802, number of negative: 2761
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004958 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1583
[LightGBM] [Info] Number of data points in the train set: 5563, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503505 -> initscore=0.014021
[LightGBM] [Info] Start training from score 0.014021
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] To

In [67]:
y_pred = best_model_lgbm.predict(X_val)

print("Accuracy for LightGBM is:")
print(f"{accuracy_score(y_val, y_pred, normalize=True):.5}")
print(f"\nClassification Report:")
print(f"{classification_report(y_val, y_pred)}")
print("\nConfusion Matrix:")
print(f"{confusion_matrix(y_val, y_pred)}")

Accuracy for LightGBM is:
0.80391

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.79      0.80       863
           1       0.80      0.82      0.81       876

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739


Confusion Matrix:
[[680 183]
 [158 718]]


final prediction

In [56]:
final_predictions = xgb_model.predict(X_final_test_scaled)
print(f"Number of predictions: {len(final_predictions)}")
print(f"Predictions distribution:")
print(pd.Series(final_predictions).value_counts())

Number of predictions: 4277
Predictions distribution:
1    2164
0    2113
Name: count, dtype: int64


In [58]:
test_original = pd.read_csv("../data/test.csv")

submission = pd.DataFrame({
    'PassengerId': test_original['PassengerId'],
    'Transported': final_predictions.astype(bool)
})

print("Submission preview:")
print(submission.head(10))
print(f"\nSubmission shape: {submission.shape}")
print(f"Transported distribution:")
print(submission['Transported'].value_counts())
import os
os.makedirs("../submissions", exist_ok=True)
submission.to_csv("../submissions/submission_xgboost.csv", index=False)
print("\nSubmission file saved to: submissions/submission_xgboost.csv")


Submission preview:
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01        False
5     0027_01        False
6     0029_01         True
7     0032_01         True
8     0032_02         True
9     0033_01         True

Submission shape: (4277, 2)
Transported distribution:
Transported
True     2164
False    2113
Name: count, dtype: int64

Submission file saved to: submissions/submission_xgboost.csv
