In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the data
train = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")

# Create feature engineering function to keep code DRY
def process_features(df):
    # Split Cabin into components
    df['Deck'] = df['Cabin'].str.split('/', expand=True)[0]
    df['Num'] = df['Cabin'].str.split('/', expand=True)[1]
    df['Side'] = df['Cabin'].str.split('/', expand=True)[2]
    
    # Store PassengerId before dropping if it's in the dataframe
    passenger_id = df['PassengerId'] if 'PassengerId' in df.columns else None
    
    # Drop unnecessary columns
    df.drop(columns=['Cabin', 'Name'], inplace=True)
    if 'PassengerId' in df.columns:
        df.drop(columns=['PassengerId'], inplace=True)
    
    # Fill missing values
    df['CryoSleep'].fillna(False, inplace=True)
    df['VIP'].fillna(False, inplace=True)
    
    # For numerical columns, fill with median
    numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
    
    # Convert 'Num' to numeric, filling errors with median
    df['Num'] = pd.to_numeric(df['Num'], errors='coerce')
    df['Num'].fillna(df['Num'].median(), inplace=True)
    
    return df, passenger_id

# Process both datasets
train, _ = process_features(train)
test, passenger_id = process_features(test)

# Define column types
categorical_features = ['HomePlanet', 'Destination', 'Deck', 'Side']
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num']
boolean_features = ['CryoSleep', 'VIP']

# Create preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore'), categorical_features),
        ('bool', 'passthrough', boolean_features)
    ])

# Create pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Prepare target variable
y = train['Transported'].astype(bool)
X = train.drop('Transported', axis=1)

# Fit the pipeline
model.fit(X, y)

# Make predictions
predictions = model.predict(test)

# Create submission file
submission = pd.DataFrame({
    'PassengerId': passenger_id,
    'Transported': predictions
})

# Save submission
submission.to_csv('submission.csv', index=False)

print("Shape of submission file:", submission.shape)
print("\nFirst few rows of submission file:")
print(submission.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CryoSleep'].fillna(False, inplace=True)
  df['CryoSleep'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['VIP'].fillna(False, inplace=True)
  df['VIP'].fillna(False, inplace=True)
The behavior will change in pandas 3.0. This inplace method will

Shape of submission file: (4277, 2)

First few rows of submission file:
  PassengerId  Transported
0     0013_01         True
1     0018_01        False
2     0019_01         True
3     0021_01         True
4     0023_01         True
