In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


## Loading Data

In [2]:
from sklearn.model_selection import train_test_split

# Load the raw competition data
train_raw = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
X_test_full = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Remove rows with a NULL target, then separate the target from the features.
# New frames are built instead of mutating in place, so `train_raw` stays intact.
train_labeled = train_raw.dropna(axis=0, subset=['Transported'])
y = train_labeled['Transported']
X_train_full = train_labeled.drop(columns=['Transported'])

# Break off a validation set from the training data (80/20, fixed seed)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype == "object" and X_train[cname].nunique() < 10
]

# Select numerical columns
numerical_cols = [
    cname for cname in X_train.columns
    if X_train[cname].dtype in ['int64', 'float64']
]

# Keep the selected columns only
my_cols = categorical_cols + numerical_cols


def _select_and_stringify(frame, keep_cols, cat_cols):
    """Return a copy of `frame[keep_cols]` with `cat_cols` cast to str, keeping NaN.

    A plain `astype(str)` on an object column turns missing values into the
    literal string 'nan', which hides them from any downstream imputer.
    Masking with `notna()` restores NaN for the originally missing entries.
    The cast itself is presumably there so object columns holding non-string
    values encode uniformly -- TODO confirm against the raw data.
    """
    selected = frame[keep_cols].copy()
    selected[cat_cols] = (
        selected[cat_cols].astype(str).where(selected[cat_cols].notna())
    )
    return selected


X_train_clean = _select_and_stringify(X_train, my_cols, categorical_cols)
X_valid_clean = _select_and_stringify(X_valid, my_cols, categorical_cols)
X_test_clean = _select_and_stringify(X_test_full, my_cols, categorical_cols)

In [3]:
# Summary statistics of the training split (before column selection);
# describe() reports only the numeric columns by default.
X_train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,6808.0,6803.0,6806.0,6782.0,6802.0,6808.0
mean,28.872944,232.87138,454.277255,179.915217,308.785798,302.280699
std,14.481302,691.501507,1587.206812,639.275102,1121.184428,1139.835194
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,54.0,75.0,28.0,59.0,46.0
max,79.0,14327.0,27723.0,23492.0,18572.0,24133.0


## Creating the Pipeline

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# `numerical_cols` and `categorical_cols` are the column lists built in the
# data-loading cell (In [2]).

# Preprocessing for numerical data: fill missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Preprocessing for categorical data: label missing entries explicitly, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fit from raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define model.
# The target is binary, so the matching metric is 'logloss' ('mlogloss' is the
# multiclass variant). The deprecated `use_label_encoder` flag is omitted.
xgb = XGBClassifier(eval_metric='logloss', random_state=0)

# Define pipeline: preprocessing runs inside each CV fold, so imputation and
# encoding are fitted on the training part of the fold only
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', xgb)
])

# Define parameter grid (note: keys are prefixed with 'model__' so they are
# routed to the 'model' step of the pipeline)
param_grid = {
    'model__n_estimators': [100, 250, 500],
    'model__max_depth': [5, 7, 10],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__subsample': [0.8, 1],
    'model__colsample_bytree': [0.8, 1]
}

# Grid search with 3-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                           cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

# Fit the model on the training split only; the validation split stays unseen
grid_search.fit(X_train_clean, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

# Predict on the held-out validation split using the best pipeline
best_model = grid_search.best_estimator_
preds = best_model.predict(X_valid_clean)

# Evaluate.
# NOTE: on two-class labels the "MAE" is simply the misclassification rate
# (1 - accuracy); it is printed only to keep the output format unchanged.
print("Accuracy:", accuracy_score(y_valid, preds))
print("MAE:", mean_absolute_error(y_valid, preds))


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best Parameters: {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.01, 'model__max_depth': 7, 'model__n_estimators': 250, 'model__subsample': 0.8}
Best CV Score: 0.7995398331895313
Accuracy: 0.7941345600920069
MAE: 0.2058654399079931


## Create Submission

In [5]:
# Predict on the competition test set with the refitted best pipeline and cast
# the predicted labels to bool, the value type shown in the sample submission
test_labels = grid_search.predict(X_test_clean)
preds = test_labels.astype(bool)

# Pair every PassengerId with its prediction and write the submission file
output = X_test_full[['PassengerId']].assign(Transported=preds)
output.to_csv('submission.csv', index=False)

In [6]:
# Read back the reference sample submission and the file written above so the
# two can be compared side by side
sample_sub, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/sample_submission.csv',
        'submission.csv',
    )
)

In [7]:
# Display the competition's sample submission as a format reference
sample_sub

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,False
4275,9273_01,False


In [8]:
# Display the generated submission (re-read from submission.csv) to check it
# by eye against the sample format
sub

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [9]:
# NOTE(review): everything in this cell is commented out. It looks like an
# earlier draft of the pipeline + grid search that now lives in In [4] and is
# kept for reference only. Known problems if it were re-enabled:
#   - the param_grid keys lack the 'model__' prefix even though the estimator
#     is the Pipeline (In [4] notes that the prefix is required);
#   - the final `clf` pipeline references `model`, which is not defined
#     anywhere in the visible notebook;
#   - it fits on X_train / X_valid rather than X_train_clean / X_valid_clean.
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import mean_absolute_error
# from xgboost import XGBClassifier
# from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import accuracy_score

# # Preprocessing for numerical data
# numerical_transformer = SimpleImputer(strategy='mean')

# # Preprocessing for categorical data
# categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='constant')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore'))
# ])

# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('cat', categorical_transformer, categorical_cols)
#     ])

# # Define model
# xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=0)

# #Define pipeline
# pipeline = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('model', xgb)])

# # Define parameter grid
# param_grid = {
#     'n_estimators': [100, 250, 500],
#     'max_depth': [5, 7, 10],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1],
#     'colsample_bytree': [0.8, 1]
# }

# # Grid search with cross-validation
# grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid,
#                            cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# print("Best Parameters:", grid_search.best_params_)
# print("Best CV Score:", grid_search.best_score_)

# # Evaluate on test data
# best_model = grid_search.best_estimator_

# # Bundle preprocessing and modeling code in a pipeline
# clf = Pipeline(steps=[('preprocessor', preprocessor),
#                       ('model', model)
#                      ])

# # Preprocessing of training data, fit model 
# clf.fit(X_train, y_train)

# # Preprocessing of validation data, get predictions
# preds = clf.predict(X_valid)

# print('MAE:', mean_absolute_error(y_valid, preds))

In [10]:

# NOTE(review): everything in this cell is commented out. It appears to be an
# even earlier draft that grid-searches the bare XGBClassifier with no
# preprocessing pipeline. If re-enabled as written it would:
#   - fit directly on X_train, which has not been through the imputer or
#     encoder (presumably still containing text columns -- confirm);
#   - reference X_test and y_test, neither of which is defined in the
#     visible notebook.

# # Define model
# xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)

# # Define parameter grid
# param_grid = {
#     'n_estimators': [50, 100],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.8, 1],
#     'colsample_bytree': [0.8, 1]
# }

# # Grid search with cross-validation
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid,
#                            cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

# # Fit the model
# grid_search.fit(X_train, y_train)

# # Best parameters and score
# print("Best Parameters:", grid_search.best_params_)
# print("Best CV Score:", grid_search.best_score_)

# # Evaluate on test data
# best_model = grid_search.best_estimator_
# y_pred = best_model.predict(X_test)
# print("Test Accuracy:", accuracy_score(y_test, y_pred))
