In [52]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import numpy as np  # Linear algebra
import pandas as pd  # Data processing
import seaborn as sns  # Data visualization
import matplotlib.pyplot as plt  # Data visualization

# Sklearn imports
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, fname)
    for root, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [53]:
def _mode_or_none(values):
    """Return the most frequent non-null entry of ``values``, or None if there is none."""
    modes = values.mode()
    return None if modes.empty else modes.iloc[0]


def _fill_with_mode(column):
    """Return ``column`` with nulls replaced by its mode; unchanged if it has no mode."""
    modes = column.mode()
    if modes.empty:
        return column
    return column.fillna(modes.iloc[0])


def processData(data):
    """Clean a passenger frame and impute its missing values.

    The input frame is not modified; all work happens on a copy.

    Steps, in order:
      1. ``Name`` nulls -> ``'Unknown'``; ``Age`` nulls -> median age.
      2. ``Cabin`` is split on ``'/'`` into ``Deck``, ``Number``, ``Side`` and dropped.
      3. Spending columns (RoomService, FoodCourt, ShoppingMall, Spa, VRDeck)
         nulls -> 0.
      4. ``VIP`` nulls -> True when any spending column is at or above its
         80th percentile, otherwise False.
      5. ``HomePlanet`` nulls -> most common HomePlanet for the row's
         Destination; ``Destination`` nulls -> most common Destination for the
         row's HomePlanet; anything still null -> the column's global mode.
      6. ``CryoSleep`` nulls -> True when total spending is 0, else the mode of
         the (HomePlanet, Destination, VIP) group, else False.
      7. ``Deck``, ``Number``, ``Side`` nulls -> the column's mode.

    All fill statistics are computed from the frame that is passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns Name, Age, Cabin, VIP, HomePlanet,
        Destination, CryoSleep and the five spending columns.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Cabin`` and with ``Deck``/``Number``/``Side``
        appended. The CryoSleep step goes through ``pd.merge``, so the result
        carries a fresh integer index.

    Side effects
    ------------
    Prints the per-column count of remaining nulls.
    """
    # Work on a copy: the caller's frame must stay usable (and re-runnable).
    data = data.copy()

    # Assign back instead of chained ``inplace=True`` fills, which are
    # deprecated and do nothing once pandas Copy-on-Write is active.
    data['Name'] = data['Name'].fillna('Unknown')
    data['Age'] = data['Age'].fillna(data['Age'].median())

    # Split Cabin into Deck, Number, and Side
    data[['Deck', 'Number', 'Side']] = data['Cabin'].str.split('/', expand=True)
    data = data.drop(columns=['Cabin'])

    # Missing spending is treated as no spending
    spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    data[spending_columns] = data[spending_columns].fillna(0)

    # Unknown VIP status becomes True for top-quintile spenders in any category.
    # (No upper bound is needed: every value is <= the column maximum.)
    data['VIP'] = data['VIP'].astype(object)
    vip_missing = data['VIP'].isnull()
    for col in spending_columns:
        lower_bound = data[col].quantile(0.80)  # 80th percentile
        data.loc[(data[col] >= lower_bound) & vip_missing, 'VIP'] = True
    data.loc[data['VIP'].isnull(), 'VIP'] = False
    data['VIP'] = data['VIP'].astype(bool)

    # Fill HomePlanet from the most common HomePlanet of the row's Destination.
    # Mapping a lookup table skips groups without a mode instead of calling
    # fillna(None), which raises.
    homeplanet_by_destination = data.groupby('Destination')['HomePlanet'].agg(_mode_or_none)
    data['HomePlanet'] = data['HomePlanet'].fillna(
        data['Destination'].map(homeplanet_by_destination)
    )

    # Then fill Destination from the (now more complete) HomePlanet column.
    destination_by_homeplanet = data.groupby('HomePlanet')['Destination'].agg(_mode_or_none)
    data['Destination'] = data['Destination'].fillna(
        data['HomePlanet'].map(destination_by_homeplanet)
    )

    # Rows where both were missing fall back to the global modes
    data['HomePlanet'] = _fill_with_mode(data['HomePlanet'])
    data['Destination'] = _fill_with_mode(data['Destination'])

    # Most common CryoSleep value per (HomePlanet, Destination, VIP) group
    group_keys = ['HomePlanet', 'Destination', 'VIP']
    cryo_mode = (
        data.groupby(group_keys)
        .agg(CryoSleep_Mode=('CryoSleep', _mode_or_none))
        .reset_index()
    )
    data = pd.merge(data, cryo_mode, on=group_keys, how='left')

    # Vectorised CryoSleep imputation (priority: zero spending, group mode, False)
    total_spending = data[spending_columns].sum(axis=1)
    cryo = data['CryoSleep'].astype(object)
    cryo_missing = cryo.isnull()
    no_spending = total_spending.eq(0)
    has_group_mode = data['CryoSleep_Mode'].notnull()

    cryo.loc[cryo_missing & no_spending] = True
    use_group_mode = cryo_missing & ~no_spending & has_group_mode
    cryo.loc[use_group_mode] = data.loc[use_group_mode, 'CryoSleep_Mode']
    # Spent money and no group mode available: cannot have been asleep.
    cryo.loc[cryo_missing & ~no_spending & ~has_group_mode] = False
    data['CryoSleep'] = cryo.astype(bool)

    # Drop the temporary column used for imputation
    data = data.drop(columns=['CryoSleep_Mode'])

    # Fill missing values for Deck, Number, and Side
    for col in ['Deck', 'Number', 'Side']:
        data[col] = _fill_with_mode(data[col])

    # Verify no remaining missing values
    print("Remaining null values:")
    print(data.isnull().sum())

    return data



In [54]:
# Load the raw competition splits: data1 is the training set (it includes the
# Transported target), data2 is the test set to predict on.
data1 = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
data2 = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [55]:
# Clean both splits with the same routine. Each call derives its fill values
# (medians, modes, quantiles) from the frame it is given, so the test split is
# imputed from test-set statistics rather than from the training set.
data1_cleaned = processData(data1)
data2_cleaned = processData(data2)

Remaining null values:
PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
Deck            0
Number          0
Side            0
dtype: int64
Remaining null values:
PassengerId     0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Deck            0
Number          0
Side            0
dtype: int64


In [56]:
# Separate the target column from the training features.
y_train = data1_cleaned['Transported']
X_train = data1_cleaned.drop(columns=['Transported'])

In [57]:
# Test features; the target is dropped only if it happens to be present.
X_test = data2_cleaned.drop(columns=['Transported'], errors='ignore')

In [58]:
# Columns treated as categories by the model.
categorical_columns = [
    'HomePlanet',
    'Destination',
    'Deck',
    'Side',
    'CryoSleep',
]

# Columns treated as continuous numeric features by the model.
numeric_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [59]:
# Numeric features: fill gaps with the median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_columns),
    ('cat', categorical_pipeline, categorical_columns),
])

In [60]:
# Full model: shared preprocessing followed by a logistic-regression classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000)),
])

In [61]:
# Fit the preprocessing steps and the classifier together on the full training set.
model_pipeline.fit(X_train, y_train)

In [62]:
# Predict the target for every row of the test features.
y_test_pred = model_pipeline.predict(X_test)

In [63]:
# NOTE: the pipeline is scored on the same rows it was fitted on, so this is a
# training (resubstitution) accuracy, not a validation score, and it will tend
# to be optimistic. Use a hold-out split or cross-validation for an honest
# estimate of generalisation.
y_train_pred = model_pipeline.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))


Validation Accuracy: 0.7905211089382261


In [64]:
# Predict on the cleaned test features and take PassengerId from that same
# frame, so ids and predictions always come from identical rows even if the
# cleaning step ever reorders or drops rows relative to the raw CSV.
y_test_pred = model_pipeline.predict(X_test)
submission = pd.DataFrame({
    "PassengerId": X_test["PassengerId"],
    "Transported": y_test_pred
})

In [65]:
# Write the submission CSV to a folder relative to the current working directory.
out_dir = "submissions/logistic_regression"
os.makedirs(out_dir, exist_ok=True)  # exist_ok so re-running the cell does not fail on an existing folder
out_path = os.path.join(out_dir, "submission.csv")
submission.to_csv(out_path, index=False)  # index=False: write only the frame's columns, no index column
print(f"Logistic Regression submission saved to: {out_path}")

Logistic Regression submission saved to: submissions/logistic_regression/submission.csv
