In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic-in-all-probability/sample_submission.csv
/kaggle/input/spaceship-titanic-in-all-probability/train.csv
/kaggle/input/spaceship-titanic-in-all-probability/test.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from sklearn.impute import SimpleImputer
from sklearn.metrics import brier_score_loss

Get Data


In [3]:
# Load the training data
train_data = pd.read_csv('/kaggle/input/spaceship-titanic-in-all-probability/train.csv')

# Feature selection
features = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Split the data into features and target variable
X = train_data[features]
y = train_data['Transported']

# Split the data into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


In [4]:
# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)


# Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42)
}

In [5]:
# Fit and calibrate models
calibrated_models = {}

for name, model in models.items():
    # Fit the model
    model.fit(X_train_imputed, y_train)

   
    calibrated_model = CalibratedClassifierCV(model, method='sigmoid', cv='prefit')
    calibrated_model.fit(X_val_imputed, y_val)

    calibrated_models[name] = calibrated_model

In [6]:
# Load the test data (replace 'test.csv' with the actual file path)
test_data = pd.read_csv('/kaggle/input/spaceship-titanic-in-all-probability/test.csv')

# Extract features from the test data
X_test = test_data[features]

In [7]:
# Impute missing values in the test set
X_test_imputed = imputer.transform(X_test)

In [8]:
# Make probability predictions on the test set for each model
probabilities = {}

for name, model in calibrated_models.items():
    probabilities[name] = model.predict_proba(X_test_imputed)[:, 1]  # Probability of being transported

In [9]:
pd.DataFrame(probabilities).head()

Unnamed: 0,Logistic Regression,Random Forest,Gradient Boosting
0,0.799841,0.795762,0.80494
1,0.013133,0.111904,0.072591
2,0.799114,0.710635,0.80494
3,0.968035,0.849473,0.94276
4,0.545402,0.382153,0.570839


In [10]:
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    brier_score_loss
)

# Dictionary to store validation scores
validation_scores = {}

for name, model in calibrated_models.items():
    # Predict probabilities on validation set
    val_probabilities = model.predict_proba(X_val_imputed)[:, 1]
    
    # Convert probabilities to binary predictions (threshold = 0.5)
    val_predictions = (val_probabilities > 0.5).astype(int)

    # Calculate metrics
    brier_score = brier_score_loss(y_val, val_probabilities)
    roc_auc = roc_auc_score(y_val, val_probabilities)
    accuracy = accuracy_score(y_val, val_predictions)
    precision = precision_score(y_val, val_predictions)
    recall = recall_score(y_val, val_predictions)
    f1 = f1_score(y_val, val_predictions)

    validation_scores[name] = {
        'Brier Score': brier_score,
        'ROC AUC': roc_auc,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

# Convert to DataFrame for easy viewing
scores_df = pd.DataFrame(validation_scores).T  # Transpose to get models as rows
print(scores_df)


                     Brier Score   ROC AUC  Accuracy  Precision    Recall  \
Logistic Regression     0.161427  0.835002  0.764807   0.756284  0.788155   
Random Forest           0.169502  0.815233  0.767683   0.759868  0.789294   
Gradient Boosting       0.157636  0.839648  0.783209   0.759585  0.834852   

                     F1 Score  
Logistic Regression  0.771891  
Random Forest        0.774302  
Gradient Boosting    0.795442  


In [11]:
# Create a submission file for each model
for name, probs in probabilities.items():
    submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': probs})
    submission.to_csv(f'submission_{name.replace(" ", "_")}.csv', index=False)

In [12]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,0.804940
1,0018_01,0.072591
2,0019_01,0.804940
3,0021_01,0.942760
4,0023_01,0.570839
...,...,...
4272,9266_02,0.819696
4273,9269_01,0.496049
4274,9271_01,0.804940
4275,9273_01,0.776319
