# Requirements

pandas

numpy

matplotlib

seaborn

Scikit-Learn 

polars

H2O.ai

In [1]:
import polars as pl
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
import h2o
from h2o.automl import H2OAutoML
from h2o.frame import H2OFrame
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
train = pl.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pl.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [3]:
train.head(2)

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
str,str,bool,str,str,f64,bool,f64,f64,f64,f64,f64,str,bool
"""0001_01""","""Europa""",False,"""B/0/P""","""TRAPPIST-1e""",39.0,False,0.0,0.0,0.0,0.0,0.0,"""Maham Ofracculy""",False
"""0002_01""","""Earth""",False,"""F/0/S""","""TRAPPIST-1e""",24.0,False,109.0,9.0,25.0,549.0,44.0,"""Juanna Vines""",True


In [4]:
test.head(2)

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
str,str,bool,str,str,f64,bool,f64,f64,f64,f64,f64,str
"""0013_01""","""Earth""",True,"""G/3/S""","""TRAPPIST-1e""",27.0,False,0.0,0.0,0.0,0.0,0.0,"""Nelly Carsoning"""
"""0018_01""","""Earth""",False,"""F/4/S""","""TRAPPIST-1e""",19.0,False,0.0,9.0,0.0,2823.0,0.0,"""Lerome Peckers"""


# Basic Statistics

In [5]:
train.describe()

statistic,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
str,str,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str,f64
"""count""","""8693""","""8492""",8476.0,"""8494""","""8511""",8514.0,8490.0,8512.0,8510.0,8485.0,8510.0,8505.0,"""8493""",8693.0
"""null_count""","""0""","""201""",217.0,"""199""","""182""",179.0,203.0,181.0,183.0,208.0,183.0,188.0,"""200""",0.0
"""mean""",,,0.358306,,,28.82793,0.023439,224.687617,458.077203,173.729169,311.138778,304.854791,,0.503624
"""std""",,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
"""min""","""0001_01""","""Earth""",0.0,"""A/0/P""","""55 Cancri e""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,"""Aard Curle""",0.0
"""25%""",,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
"""50%""",,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
"""75%""",,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,
"""max""","""9280_02""","""Mars""",1.0,"""T/3/P""","""TRAPPIST-1e""",79.0,1.0,14327.0,29813.0,23492.0,22408.0,24133.0,"""Zubeneb Pasharne""",1.0


In [6]:
train.shape

(8693, 14)

In [7]:
train.null_count()

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,201,217,199,182,179,203,181,183,208,183,188,200,0


In [8]:
test.shape

(4277, 13)

In [9]:
test.null_count()

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,87,93,100,92,91,93,82,106,98,101,80,94


In [10]:
# Null Percentage

In [11]:
# train
null_percentage_train = train.null_count() / train.height * 100
null_percentage_train

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,2.312205,2.496261,2.289198,2.093639,2.059128,2.335212,2.082135,2.105142,2.39273,2.105142,2.16266,2.300702,0.0


In [12]:
# test
null_percentage_test = test.null_count() / test.height * 100
null_percentage_test

PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.0,2.034136,2.174421,2.338087,2.15104,2.12766,2.174421,1.917232,2.478373,2.291326,2.361468,1.87047,2.197802


In [13]:
# Get unique value counts for each column
unique_value_counts = {col: train[col].unique().shape[0] for col in train.columns}

# Display the unique value counts
for col, count in unique_value_counts.items():
    print(f"Column '{col}': Unique Value Count -> {count}")

Column 'PassengerId': Unique Value Count -> 8693
Column 'HomePlanet': Unique Value Count -> 4
Column 'CryoSleep': Unique Value Count -> 3
Column 'Cabin': Unique Value Count -> 6561
Column 'Destination': Unique Value Count -> 4
Column 'Age': Unique Value Count -> 81
Column 'VIP': Unique Value Count -> 3
Column 'RoomService': Unique Value Count -> 1274
Column 'FoodCourt': Unique Value Count -> 1508
Column 'ShoppingMall': Unique Value Count -> 1116
Column 'Spa': Unique Value Count -> 1328
Column 'VRDeck': Unique Value Count -> 1307
Column 'Name': Unique Value Count -> 8474
Column 'Transported': Unique Value Count -> 2


In [14]:
# Removing unnecessary columns

In [15]:
train = train.drop(['PassengerId','Cabin','Name'])

In [16]:
test = test.drop(['PassengerId','Cabin'])

In [17]:
test = test.drop('Name')

# Handling Nan Vals.

In [18]:
train.describe()

statistic,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
str,str,f64,str,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""8492""",8476.0,"""8511""",8514.0,8490.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0
"""null_count""","""201""",217.0,"""182""",179.0,203.0,181.0,183.0,208.0,183.0,188.0,0.0
"""mean""",,0.358306,,28.82793,0.023439,224.687617,458.077203,173.729169,311.138778,304.854791,0.503624
"""std""",,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,
"""min""","""Earth""",0.0,"""55 Cancri e""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,,,19.0,,0.0,0.0,0.0,0.0,0.0,
"""50%""",,,,27.0,,0.0,0.0,0.0,0.0,0.0,
"""75%""",,,,38.0,,47.0,76.0,27.0,59.0,46.0,
"""max""","""Mars""",1.0,"""TRAPPIST-1e""",79.0,1.0,14327.0,29813.0,23492.0,22408.0,24133.0,1.0


In [19]:
# Identify numerical and categorical columns
num_cols = [col for col, dtype in zip(train.columns, train.dtypes) if dtype in [pl.Int64, pl.Float64, pl.UInt64]]
cat_cols = [col for col, dtype in zip(train.columns, train.dtypes) if dtype in [pl.Utf8, pl.Categorical]]

# Display the results
print("Numerical Columns:", num_cols)
print("Categorical Columns:", cat_cols)

Numerical Columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
Categorical Columns: ['HomePlanet', 'Destination']


In [20]:
num_cols.append('CryoSleep')

In [21]:
num_cols.append('VIP')

In [22]:
num_cols

['Age',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'CryoSleep',
 'VIP']

In [23]:
cat_cols

['HomePlanet', 'Destination']

In [24]:
%time

# 1. Impute median value for numerical columns
for col in num_cols:
    median_value = train[col].median()
    train = train.with_columns(
        pl.when(pl.col(col).is_null())
        .then(median_value)
        .otherwise(pl.col(col))
        .alias(col)
    )

# 2. Encode categorical columns to numerical values
cat_df = train.select(cat_cols).to_pandas()
label_encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    cat_df[col] = cat_df[col].fillna("missing")  # Temporarily fill NaN with placeholder
    cat_df[col] = encoder.fit_transform(cat_df[col])
    label_encoders[col] = encoder

# Replace placeholder with NaN for imputation
cat_df.replace({"missing": np.nan}, inplace=True)

# Apply KNN Imputer
knn_imputer = KNNImputer(n_neighbors=3, weights="uniform", metric="nan_euclidean")
cat_imputed = knn_imputer.fit_transform(cat_df)

# Convert imputed values back to original categorical form
for i, col in enumerate(cat_cols):
    cat_df[col] = label_encoders[col].inverse_transform(cat_imputed[:, i].round().astype(int))

# Convert the imputed data back to a Polars DataFrame
cat_imputed_df = pl.DataFrame(cat_df)

# Update the original Polars DataFrame with imputed categorical values
train = train.with_columns(cat_imputed_df)

print(train)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.34 µs
shape: (8_693, 11)
┌────────────┬───────────┬───────────────┬──────┬───┬──────────────┬────────┬────────┬─────────────┐
│ HomePlanet ┆ CryoSleep ┆ Destination   ┆ Age  ┆ … ┆ ShoppingMall ┆ Spa    ┆ VRDeck ┆ Transported │
│ ---        ┆ ---       ┆ ---           ┆ ---  ┆   ┆ ---          ┆ ---    ┆ ---    ┆ ---         │
│ str        ┆ f64       ┆ str           ┆ f64  ┆   ┆ f64          ┆ f64    ┆ f64    ┆ bool        │
╞════════════╪═══════════╪═══════════════╪══════╪═══╪══════════════╪════════╪════════╪═════════════╡
│ Europa     ┆ 0.0       ┆ TRAPPIST-1e   ┆ 39.0 ┆ … ┆ 0.0          ┆ 0.0    ┆ 0.0    ┆ false       │
│ Earth      ┆ 0.0       ┆ TRAPPIST-1e   ┆ 24.0 ┆ … ┆ 25.0         ┆ 549.0  ┆ 44.0   ┆ true        │
│ Europa     ┆ 0.0       ┆ TRAPPIST-1e   ┆ 58.0 ┆ … ┆ 0.0          ┆ 6715.0 ┆ 49.0   ┆ false       │
│ Europa     ┆ 0.0       ┆ TRAPPIST-1e   ┆ 33.0 ┆ … ┆ 371.0        ┆ 3329.0 ┆ 193.0  ┆ false       │
│ Earth 

In [25]:
# Same for the Test data

In [26]:
for col in num_cols:
    median_value = train[col].median()  # Use median from training data
    test = test.with_columns(
        pl.when(pl.col(col).is_null())
        .then(median_value)
        .otherwise(pl.col(col))
        .alias(col)
    )

# 2. Encode categorical columns using training encoders
test_cat_df = test.select(cat_cols).to_pandas()

for col in cat_cols:
    encoder = label_encoders[col]  # Use the encoder from the training data
    test_cat_df[col] = test_cat_df[col].fillna("missing")  # Temporarily fill NaN with placeholder
    test_cat_df[col] = encoder.transform(test_cat_df[col])

# Replace placeholder with NaN for imputation
test_cat_df.replace({"missing": np.nan}, inplace=True)

# 3. Impute missing values using the trained KNN Imputer
test_cat_imputed = knn_imputer.transform(test_cat_df)

# Convert imputed values back to original categorical form
for i, col in enumerate(cat_cols):
    test_cat_df[col] = label_encoders[col].inverse_transform(test_cat_imputed[:, i].round().astype(int))

# Convert the imputed data back to a Polars DataFrame
test_cat_imputed_df = pl.DataFrame(test_cat_df)

# Update the original Polars DataFrame with imputed categorical values
test = test.with_columns(test_cat_imputed_df)

print(test)

shape: (4_277, 10)
┌────────────┬───────────┬───────────────┬──────┬───┬───────────┬──────────────┬────────┬────────┐
│ HomePlanet ┆ CryoSleep ┆ Destination   ┆ Age  ┆ … ┆ FoodCourt ┆ ShoppingMall ┆ Spa    ┆ VRDeck │
│ ---        ┆ ---       ┆ ---           ┆ ---  ┆   ┆ ---       ┆ ---          ┆ ---    ┆ ---    │
│ str        ┆ f64       ┆ str           ┆ f64  ┆   ┆ f64       ┆ f64          ┆ f64    ┆ f64    │
╞════════════╪═══════════╪═══════════════╪══════╪═══╪═══════════╪══════════════╪════════╪════════╡
│ Earth      ┆ 1.0       ┆ TRAPPIST-1e   ┆ 27.0 ┆ … ┆ 0.0       ┆ 0.0          ┆ 0.0    ┆ 0.0    │
│ Earth      ┆ 0.0       ┆ TRAPPIST-1e   ┆ 19.0 ┆ … ┆ 9.0       ┆ 0.0          ┆ 2823.0 ┆ 0.0    │
│ Europa     ┆ 1.0       ┆ 55 Cancri e   ┆ 31.0 ┆ … ┆ 0.0       ┆ 0.0          ┆ 0.0    ┆ 0.0    │
│ Europa     ┆ 0.0       ┆ TRAPPIST-1e   ┆ 38.0 ┆ … ┆ 6652.0    ┆ 0.0          ┆ 181.0  ┆ 585.0  │
│ Earth      ┆ 0.0       ┆ TRAPPIST-1e   ┆ 20.0 ┆ … ┆ 0.0       ┆ 635.0        ┆ 0.0    ┆ 

In [27]:
test.describe()

statistic,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
str,str,f64,str,f64,f64,f64,f64,f64,f64,f64
"""count""","""4277""",4277.0,"""4277""",4277.0,4277.0,4277.0,4277.0,4277.0,4277.0,4277.0
"""null_count""","""0""",0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,0.361001,,28.622866,0.017302,215.062427,428.592238,173.233107,295.895955,304.898293
"""std""",,0.480347,,14.029425,0.130409,601.914503,1510.155974,554.991776,1104.872018,1235.991811
"""min""","""Earth""",0.0,"""55 Cancri e""",0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""25%""",,0.0,,20.0,0.0,0.0,0.0,0.0,0.0,0.0
"""50%""",,0.0,,27.0,0.0,0.0,0.0,0.0,0.0,0.0
"""75%""",,1.0,,37.0,0.0,48.0,66.0,27.0,43.0,31.0
"""max""","""missing""",1.0,"""missing""",79.0,1.0,11567.0,25273.0,8292.0,19844.0,22272.0


# Encoding

In [28]:
# 1. Apply one-hot encoding to categorical columns
one_hot_encoded = train.select(cat_cols).to_dummies()

# 2. Drop original categorical columns from the DataFrame
train = train.drop(cat_cols)  # Pass the list of column names directly

# 3. Concatenate the one-hot encoded columns to the original DataFrame
train = train.hstack(one_hot_encoded)

print(train)


shape: (8_693, 17)
┌───────────┬──────┬─────┬─────────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐
│ CryoSleep ┆ Age  ┆ VIP ┆ RoomService ┆ … ┆ Destination ┆ Destination ┆ Destination ┆ Destination │
│ ---       ┆ ---  ┆ --- ┆ ---         ┆   ┆ _55 Cancri  ┆ _PSO        ┆ _TRAPPIST-1 ┆ _missing    │
│ f64       ┆ f64  ┆ f64 ┆ f64         ┆   ┆ e           ┆ J318.5-22   ┆ e           ┆ ---         │
│           ┆      ┆     ┆             ┆   ┆ ---         ┆ ---         ┆ ---         ┆ u8          │
│           ┆      ┆     ┆             ┆   ┆ u8          ┆ u8          ┆ u8          ┆             │
╞═══════════╪══════╪═════╪═════════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 0.0       ┆ 39.0 ┆ 0.0 ┆ 0.0         ┆ … ┆ 0           ┆ 0           ┆ 1           ┆ 0           │
│ 0.0       ┆ 24.0 ┆ 0.0 ┆ 109.0       ┆ … ┆ 0           ┆ 0           ┆ 1           ┆ 0           │
│ 0.0       ┆ 58.0 ┆ 1.0 ┆ 43.0        ┆ … ┆ 0           ┆ 0           ┆

In [29]:
# 1. Apply one-hot encoding to categorical columns
one_hot_encoded = test.select(cat_cols).to_dummies()

# 2. Drop original categorical columns from the DataFrame
test = test.drop(cat_cols)  # Pass the list of column names directly

# 3. Concatenate the one-hot encoded columns to the original DataFrame
test = test.hstack(one_hot_encoded)

print(train)


shape: (8_693, 17)
┌───────────┬──────┬─────┬─────────────┬───┬─────────────┬─────────────┬─────────────┬─────────────┐
│ CryoSleep ┆ Age  ┆ VIP ┆ RoomService ┆ … ┆ Destination ┆ Destination ┆ Destination ┆ Destination │
│ ---       ┆ ---  ┆ --- ┆ ---         ┆   ┆ _55 Cancri  ┆ _PSO        ┆ _TRAPPIST-1 ┆ _missing    │
│ f64       ┆ f64  ┆ f64 ┆ f64         ┆   ┆ e           ┆ J318.5-22   ┆ e           ┆ ---         │
│           ┆      ┆     ┆             ┆   ┆ ---         ┆ ---         ┆ ---         ┆ u8          │
│           ┆      ┆     ┆             ┆   ┆ u8          ┆ u8          ┆ u8          ┆             │
╞═══════════╪══════╪═════╪═════════════╪═══╪═════════════╪═════════════╪═════════════╪═════════════╡
│ 0.0       ┆ 39.0 ┆ 0.0 ┆ 0.0         ┆ … ┆ 0           ┆ 0           ┆ 1           ┆ 0           │
│ 0.0       ┆ 24.0 ┆ 0.0 ┆ 109.0       ┆ … ┆ 0           ┆ 0           ┆ 1           ┆ 0           │
│ 0.0       ┆ 58.0 ┆ 1.0 ┆ 43.0        ┆ … ┆ 0           ┆ 0           ┆

In [30]:
train.head(2)

CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_missing,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_missing
f64,f64,f64,f64,f64,f64,f64,f64,bool,u8,u8,u8,u8,u8,u8,u8,u8
0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,False,0,1,0,0,0,0,1,0
0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,True,1,0,0,0,0,0,1,0


In [31]:
test.head(2)

CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_missing,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_missing
f64,f64,f64,f64,f64,f64,f64,f64,u8,u8,u8,u8,u8,u8,u8,u8
1.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0,0,1,0
0.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,1,0,0,0,0,0,1,0


# Data Modelling

In [32]:
# Initialize H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu320.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu320.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp6oiwrqph
  JVM stdout: /tmp/tmp6oiwrqph/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp6oiwrqph/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,2 months and 19 days
H2O_cluster_name:,H2O_from_python_unknownUser_6o6d8l
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [33]:
# Convert Polars DataFrame to Pandas (H2O does not support Polars directly)
train_df = train.to_pandas()
test_df = test.to_pandas()

# Upload Pandas DataFrame to H2O
train_h2o = H2OFrame(train_df)
test_h2o = H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [34]:
# Specify Features and Target
target = "Transported"  # Replace with the name of your target column
features = [col for col in train_h2o.columns if col != target] 

In [35]:
# Train the Model using H2O AutoML
aml = H2OAutoML(max_models=20, seed=42, balance_classes=True)
aml.train(x=features, y=target, training_frame=train_h2o)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),5/6
# GBM base models (used / total),1/1
# XGBoost base models (used / total),1/1
# DeepLearning base models (used / total),1/1
# GLM base models (used / total),1/1
# DRF base models (used / total),1/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5

Unnamed: 0,False,True,Error,Rate
False,3135.0,1180.0,0.2735,(1180.0/4315.0)
True,427.0,3951.0,0.0975,(427.0/4378.0)
Total,3562.0,5131.0,0.1849,(1607.0/8693.0)

metric,threshold,value,idx
max f1,0.4052358,0.8310022,225.0
max f2,0.1878117,0.8980674,305.0
max f0point5,0.6101491,0.8385682,152.0
max accuracy,0.5581571,0.8268722,173.0
max precision,0.9992125,1.0,0.0
max recall,0.0464937,1.0,377.0
max specificity,0.9992125,1.0,0.0
max absolute_mcc,0.5581571,0.6537515,173.0
max min_per_class_accuracy,0.5566699,0.8259479,174.0
max mean_per_class_accuracy,0.5581571,0.8268839,173.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0108133,0.9911943,1.9856099,1.9856099,1.0,0.9946258,1.0,0.9946258,0.021471,0.021471,98.5609868,98.5609868,0.021471
2,0.0203612,0.9906383,1.9856099,1.9856099,1.0,0.9907932,1.0,0.9928286,0.0189584,0.0404294,98.5609868,98.5609868,0.0404294
3,0.0307144,0.989912,1.9856099,1.9856099,1.0,0.9903096,1.0,0.9919795,0.0205573,0.0609868,98.5609868,98.5609868,0.0609868
4,0.0401472,0.9880315,1.9856099,1.9856099,1.0,0.9887816,1.0,0.9912281,0.01873,0.0797168,98.5609868,98.5609868,0.0797168
5,0.0506154,0.9869144,1.9856099,1.9856099,1.0,0.9872608,1.0,0.9904076,0.0207857,0.1005025,98.5609868,98.5609868,0.1005025
6,0.1014609,0.9767996,1.9766252,1.9811074,0.9954751,0.9822489,0.9977324,0.986319,0.1005025,0.201005,97.6625207,98.1107351,0.2005415
7,0.1500058,0.9469674,1.947968,1.9703828,0.9810427,0.9675403,0.9923313,0.9802418,0.0945637,0.2955688,94.7967974,97.0382798,0.2932513
8,0.200161,0.8799249,1.899081,1.9525164,0.956422,0.9101799,0.9833333,0.9626861,0.095249,0.3908177,89.9080997,95.251637,0.384097
9,0.3000115,0.7172044,1.6767881,1.8607479,0.84447,0.8064801,0.9371166,0.9106973,0.167428,0.5582458,67.6788056,86.0747897,0.5202388
10,0.4028529,0.642142,1.3948132,1.7418028,0.7024609,0.6768485,0.877213,0.8509997,0.1434445,0.7016903,39.4813196,74.1802831,0.6020379

Unnamed: 0,False,True,Error,Rate
False,3093.0,1222.0,0.2832,(1222.0/4315.0)
True,570.0,3808.0,0.1302,(570.0/4378.0)
Total,3663.0,5030.0,0.2061,(1792.0/8693.0)

metric,threshold,value,idx
max f1,0.4198267,0.8095238,226.0
max f2,0.1685307,0.8685797,316.0
max f0point5,0.6243538,0.8072322,146.0
max accuracy,0.5311631,0.7984585,188.0
max precision,0.9986711,1.0,0.0
max recall,0.0035854,1.0,397.0
max specificity,0.9986711,1.0,0.0
max absolute_mcc,0.4576044,0.597433,215.0
max min_per_class_accuracy,0.5474835,0.792584,180.0
max mean_per_class_accuracy,0.5311631,0.7983462,188.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0100081,0.9909412,1.9856099,1.9856099,1.0,0.9936742,1.0,0.9936742,0.0198721,0.0198721,98.5609868,98.5609868,0.0198721
2,0.0200161,0.9898062,1.9856099,1.9856099,1.0,0.9903434,1.0,0.9920088,0.0198721,0.0397442,98.5609868,98.5609868,0.0397442
3,0.0300242,0.9885926,1.9627868,1.9780022,0.9885057,0.9892237,0.9961686,0.9910805,0.0196437,0.0593878,96.2786766,97.8002167,0.0591561
4,0.0400322,0.9868929,1.9856099,1.9799041,1.0,0.9875905,0.9971264,0.990208,0.0198721,0.0792599,98.5609868,97.9904092,0.0790282
5,0.0500403,0.985936,1.9627868,1.9764806,0.9885057,0.9864302,0.9954023,0.9894524,0.0196437,0.0989036,96.2786766,97.6480627,0.0984401
6,0.1000805,0.977563,1.9673514,1.971916,0.9908046,0.9815762,0.9931034,0.9855143,0.0984468,0.1973504,96.7351386,97.1916006,0.1959599
7,0.1500058,0.93187,1.9490088,1.964292,0.9815668,0.9603724,0.9892638,0.9771465,0.0973047,0.2946551,94.9008764,96.429197,0.2914106
8,0.200161,0.8777176,1.7897814,1.920564,0.9013761,0.9026204,0.9672414,0.9584722,0.089767,0.3844221,78.9781371,92.0564027,0.3712124
9,0.3000115,0.7070858,1.5829977,1.8082145,0.797235,0.7994301,0.9106595,0.9055394,0.158063,0.5424852,58.2997728,80.8214507,0.4884875
10,0.399977,0.6253453,1.3161235,1.6852271,0.6628308,0.6627831,0.8487202,0.8448678,0.1315669,0.6740521,31.6123456,68.5227127,0.5521517

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.7940132,0.0085675,0.7880647,0.8045845,0.8019351,0.7893258,0.7861558
aic,1497.4414,62.39958,1564.7251,1506.3905,1481.7709,1533.8242,1400.4962
auc,0.8802972,0.0049239,0.8745003,0.8793476,0.8880163,0.8809101,0.8787116
err,0.2059868,0.0085675,0.2119353,0.1954155,0.1980649,0.2106742,0.2138443
err_count,358.0,18.069311,380.0,341.0,348.0,375.0,346.0
f0point5,0.7764607,0.0133234,0.7654981,0.7922427,0.7897365,0.7667639,0.7680625
f1,0.8104672,0.0080625,0.8026999,0.8183271,0.8196891,0.8079877,0.8036323
f2,0.8476892,0.0050154,0.8437023,0.8461878,0.8520035,0.8538961,0.8426565
lift_top_group,1.9858576,0.0317370,2.0259888,1.9673055,1.9435841,2.0022497,1.9901599
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
lb = aml.leaderboard
print(lb)

model_id                                                      auc    logloss     aucpr    mean_per_class_error      rmse       mse
StackedEnsemble_BestOfFamily_1_AutoML_1_20241117_172858  0.880344   0.427076  0.890063                0.206697  0.373705  0.139656
StackedEnsemble_AllModels_1_AutoML_1_20241117_172858     0.880053   0.42732   0.889714                0.203704  0.373917  0.139814
GBM_1_AutoML_1_20241117_172858                           0.878109   0.43264   0.88766                 0.205875  0.375706  0.141155
GBM_5_AutoML_1_20241117_172858                           0.877285   0.435371  0.886985                0.207979  0.376752  0.141942
XGBoost_3_AutoML_1_20241117_172858                       0.87672    0.435788  0.88581                 0.213666  0.377199  0.142279
GBM_2_AutoML_1_20241117_172858                           0.876548   0.435827  0.886332                0.204812  0.3768    0.141978
GBM_grid_1_AutoML_1_20241117_172858_model_1              0.876097   0.436923  0.886

In [53]:
# Predict on Test Data
predictions = aml.predict(test_h2o)

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


In [54]:
predictions_df = predictions.as_data_frame() 
predictions_df.head()




Unnamed: 0,predict,False,True
0,True,0.368184,0.631816
1,False,0.988868,0.011132
2,True,0.0113,0.9887
3,True,0.05643,0.94357
4,True,0.459299,0.540701


# Submission

In [55]:
predictions_df.rename(columns={'predict': 'Transported'}, inplace=True)

In [58]:
import pandas as pd
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

In [59]:
test_data['PassengerId']

0       0013_01
1       0018_01
2       0019_01
3       0021_01
4       0023_01
         ...   
4272    9266_02
4273    9269_01
4274    9271_01
4275    9273_01
4276    9277_01
Name: PassengerId, Length: 4277, dtype: object

In [60]:
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [61]:
submission_df = pd.DataFrame({'PassengerId': test_data['PassengerId'],'Transported': predictions_df['Transported']})

In [62]:
# Save the combined DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [63]:
check = pd.read_csv('submission.csv')

In [64]:
check

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
