In [1]:
# Import necessary libraries
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# ---------------------------
# Step 1: Load the Data
# ---------------------------
# Read the competition data; train.csv and test.csv are expected in the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# ---------------------------
# Step 2: Feature Engineering
# ---------------------------
# Extract deck information from the 'Cabin' column.
# The 'Cabin' entries are structured like "Deck/Num/Side". We extract the deck (first part).
def extract_deck(cabin):
    """Return the deck component of a 'Deck/Num/Side' cabin value.

    Parameters
    ----------
    cabin : str or missing value
        A single 'Cabin' entry. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        The text before the first '/', with surrounding whitespace removed,
        or "Missing" when the entry is null or has no usable deck part.
    """
    if pd.isnull(cabin):
        return "Missing"
    # str() keeps a stray non-string entry from raising; strip() stops padded
    # values such as " B/0/P" from becoming their own category.
    deck = str(cabin).split('/', 1)[0].strip()
    # A blank deck would otherwise turn into a separate, meaningless one-hot category.
    return deck if deck else "Missing"

# Derive the 'Deck' feature for both datasets from their 'Cabin' values.
for frame in (train, test):
    frame['Deck'] = frame['Cabin'].apply(extract_deck)


def _flag_to_int(flags):
    """Return a 0/1 integer Series from a boolean-like column; missing entries count as False."""
    # Turn the literal strings 'True'/'False' into real booleans.
    as_bool = flags.replace({'True': True, 'False': False})
    # Missing values become False; infer_objects() then lets pandas pick a concrete dtype.
    filled = as_bool.fillna(False).infer_objects()
    # Cast to boolean first, then to integer.
    return filled.astype(bool).astype(int)


# 'CryoSleep' and 'VIP' are boolean flags: encode each of them as 0/1 in both datasets.
for frame in (train, test):
    for col in ['CryoSleep', 'VIP']:
        frame[col] = _flag_to_int(frame[col])

In [4]:
# ---------------------------
# Step 3: Define Features and Target Variable
# ---------------------------
# Target: 'Transported' encoded as 1 when the value is True (or the string 'True'), otherwise 0.
y_train = train['Transported'].map(lambda flag: int(flag in (True, 'True')))

# Columns excluded from the feature matrices. 'PassengerId' remains available on
# `train` / `test` themselves, so it can still be used for the submission file.
drop_columns = ['PassengerId', 'Name', 'Cabin', 'Transported']
X_train = train.drop(columns=drop_columns)
X_test = test.drop(columns=['PassengerId', 'Name', 'Cabin'])

In [5]:
# ---------------------------
# Step 4: Preprocessing Pipeline
# ---------------------------
# Define the feature groups and the transformation applied to each of them.

# Numeric features: impute missing values with the column median, then standardize.
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    # Note: decision trees generally do not require scaling, but scaling has no adverse effect.
    ('scaler', StandardScaler())
])

# Categorical features: fill missing values with the constant 'Missing', then one-hot encode.
# handle_unknown='ignore' keeps categories that only occur at prediction time from raising.
categorical_features = ['HomePlanet', 'Destination', 'Deck']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Binary features: 'CryoSleep' and 'VIP' were already converted to 0/1 integers
# (with missing values filled) in Step 2, so they need no further transformation.
binary_features = ['CryoSleep', 'VIP']

# Combine the per-group transformers using ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        # ColumnTransformer drops every column that is not listed (remainder='drop'
        # by default). Without this entry the engineered binary flags would be
        # silently discarded and never reach the classifier.
        ('bin', 'passthrough', binary_features)
    ]
)

In [6]:
# ---------------------------
# Step 5: Build the Decision Tree Pipeline
# ---------------------------
# Chain preprocessing and the decision tree so both are fitted and applied as one estimator.
tree_classifier = DecisionTreeClassifier(random_state=42)  # Fixed seed; other parameters can be tuned as needed.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', tree_classifier)])

In [7]:
# ---------------------------
# Step 6: Train the Model
# ---------------------------
# Train the whole pipeline (preprocessing followed by the classifier) on the training data.
pipeline.fit(X=X_train, y=y_train)

In [8]:
# ---------------------------
# Step 7: Make Predictions and Create Submission File
# ---------------------------
# Predict the 'Transported' label (0 or 1) for every row of the test set.
predictions = pipeline.predict(X_test)

# Assemble the submission: the test 'PassengerId' column plus the predictions
# cast back from 0/1 to boolean values.
submission = test[['PassengerId']].assign(Transported=predictions.astype(bool))

# Write the submission to disk without the index column and report the file name.
submission.to_csv('submission_decision_tree.csv', index=False)
print("Submission file saved as submission_decision_tree.csv")

Submission file saved as submission_decision_tree.csv
