Import the necessary libraries and load the dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import warnings

# Install a blanket "ignore" filter: from here on every warning category is
# suppressed, including deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

In [None]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape
train_csv_path = '/kaggle/input/spaceship-titanic/train.csv'
dataset_df = pd.read_csv(train_csv_path)
shape_message = "Full train dataset shape is {}"
print(shape_message.format(dataset_df.shape))

In [None]:
# Preview the top of the training frame (head() returns 5 rows by default)
dataset_df.head()


In [None]:
# Plain-text preview of the first 5 rows
print(dataset_df.head(5))

# Dataset overview: summary statistics of the numeric columns
print(dataset_df.describe())
# info() writes its report (dtypes, non-null counts) straight to stdout and
# returns None, so it is called bare; wrapping it in print() would append a
# stray "None" line to the output.
dataset_df.info()

# Number of missing values in each column
print(dataset_df.isnull().sum())

Visualize the target variable distribution


In [None]:

# Bar chart counting the rows in each Transported class
target_ax = sns.countplot(data=dataset_df, x='Transported')
plt.show()


Visualize distributions of some features

In [None]:
# One 50-bin histogram per feature, stacked vertically in a single figure
hist_columns = ['Age', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
fig, ax = plt.subplots(len(hist_columns), 1, figsize=(10, 10))
# top=2 places the top edge of the subplot grid at twice the figure height
plt.subplots_adjust(top=2)

for panel, feature in zip(ax, hist_columns):
    sns.histplot(dataset_df[feature], color='b', bins=50, ax=panel)
plt.show()


Drop unnecessary columns

In [None]:
# Drop the PassengerId and Name columns, which are not used for modeling.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
dataset_df = dataset_df.drop(columns=['PassengerId', 'Name'], errors='ignore')


Handle missing values

In [None]:
# Fill missing values with a separate strategy per column group.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# Missing spend amounts are treated as zero.
dataset_df[spend_cols] = dataset_df[spend_cols].fillna(0)

# Categorical gaps are copied from the previous row. DataFrame.ffill() replaces
# the deprecated fillna(method='ffill'); the trailing bfill() covers gaps in the
# leading rows, which a forward fill alone cannot reach and would leave as NaN.
dataset_df[categorical_cols] = dataset_df[categorical_cols].ffill().bfill()

# Missing ages get the mean of the observed ages.
dataset_df['Age'] = dataset_df['Age'].fillna(dataset_df['Age'].mean())


Create new features from the Cabin column

In [None]:
# Break Cabin ("<deck>/<num>/<side>") into three columns, then discard it.
# The membership check lets the cell be skipped once Cabin has been removed.
if 'Cabin' in dataset_df.columns:
    cabin_parts = dataset_df["Cabin"].str.split("/", expand=True)
    dataset_df[["Deck", "Cabin_num", "Side"]] = cabin_parts
    dataset_df = dataset_df.drop(columns='Cabin')


Encode the new Cabin-derived columns (Deck, Cabin_num, Side) as numbers:

In [None]:
# Turn the Cabin-derived columns into numeric features
if 'Deck' in dataset_df.columns:
    # Fit and transform in two steps; le_deck keeps the learned label mapping
    le_deck = LabelEncoder().fit(dataset_df['Deck'])
    dataset_df['Deck'] = le_deck.transform(dataset_df['Deck'])

if 'Side' in dataset_df.columns:
    le_side = LabelEncoder().fit(dataset_df['Side'])
    dataset_df['Side'] = le_side.transform(dataset_df['Side'])

if 'Cabin_num' in dataset_df.columns:
    # Values that cannot be parsed as numbers become NaN and then 0
    cabin_numbers = pd.to_numeric(dataset_df['Cabin_num'], errors='coerce')
    dataset_df['Cabin_num'] = cabin_numbers.fillna(0).astype(int)


Convert categorical columns to numerical

In [None]:
# Label-encode the remaining categorical columns, keeping each fitted encoder
# in label_encoders under its column name
label_encoders = {}
categorical_columns = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
for column in (name for name in categorical_columns if name in dataset_df.columns):
    encoder = LabelEncoder()
    # Values are stringified first so every entry in the column has one type
    as_text = dataset_df[column].astype(str)
    dataset_df[column] = encoder.fit_transform(as_text)
    label_encoders[column] = encoder

# Cast the target and the two flag columns to integer dtype
for int_column in ('Transported', 'VIP', 'CryoSleep'):
    dataset_df[int_column] = dataset_df[int_column].astype(int)


Split the dataset

In [None]:
# Hold out 20% of the rows for validation (fixed seed for a repeatable split),
# then separate the Transported target from the features in each part
train_df, valid_df = train_test_split(dataset_df, random_state=42, test_size=0.2)
X_train, y_train = train_df.drop(columns='Transported'), train_df['Transported']
X_valid, y_valid = valid_df.drop(columns='Transported'), valid_df['Transported']


Train and evaluate the XGBoost model

In [None]:
# Fit an XGBoost classifier on the training split
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    enable_categorical=True,
)
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

# Score the held-out validation rows
y_pred = model.predict(X_valid)

# Compare predictions with the true validation labels
accuracy = accuracy_score(y_valid, y_pred)
conf_matrix = confusion_matrix(y_valid, y_pred)
class_report = classification_report(y_valid, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:', conf_matrix, sep='\n')
print('Classification Report:', class_report, sep='\n')


In [None]:
# Read the test CSV and remove the PassengerId and Name columns in one step
test_csv_path = '/kaggle/input/spaceship-titanic/test.csv'
test_df = pd.read_csv(test_csv_path).drop(columns=['PassengerId', 'Name'])


In [None]:

# Fill missing values in the test frame, one strategy per column group.
test_spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# Missing spend amounts are treated as zero.
test_df[test_spend_cols] = test_df[test_spend_cols].fillna(0)

# Categorical gaps are copied from the previous row. DataFrame.ffill() replaces
# the deprecated fillna(method='ffill'); the trailing bfill() covers gaps in the
# leading rows, which a forward fill alone would leave as NaN (and which would
# later be stringified to 'nan', a label the fitted encoders have not seen).
test_df[test_categorical_cols] = test_df[test_categorical_cols].ffill().bfill()

# Missing ages get the mean of the observed test ages.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())


In [None]:
# Create new features from Cabin ("<deck>/<num>/<side>") and drop the original.
# Guarded like the training-side cell so that re-running it after Cabin has
# already been removed is a no-op instead of a KeyError.
if 'Cabin' in test_df.columns:
    test_df[["Deck", "Cabin_num", "Side"]] = test_df["Cabin"].str.split("/", expand=True)
    test_df = test_df.drop('Cabin', axis=1)


In [None]:
# Encode Deck and Side with the encoders that were fitted earlier
for cabin_col, fitted_encoder in (('Deck', le_deck), ('Side', le_side)):
    if cabin_col in test_df.columns:
        test_df[cabin_col] = fitted_encoder.transform(test_df[cabin_col].astype(str))

# Cabin number: unparseable or missing entries become 0
if 'Cabin_num' in test_df.columns:
    test_cabin_numbers = pd.to_numeric(test_df['Cabin_num'], errors='coerce')
    test_df['Cabin_num'] = test_cabin_numbers.fillna(0).astype(int)

# Remaining categorical columns reuse the encoders stored in label_encoders
for column in ('HomePlanet', 'CryoSleep', 'Destination', 'VIP'):
    column_as_text = test_df[column].astype(str)
    test_df[column] = label_encoders[column].transform(column_as_text)

In [None]:
# Prepare test features for prediction
X_test = test_df

# Make predictions (the target was cast to int before fitting, so these are 0/1)
y_test_pred = model.predict(X_test)

# Prepare the submission file from the provided template
submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

# Predictions are assigned positionally, so the template must contain exactly
# one row per prediction; fail loudly rather than write a misaligned file.
assert len(submission) == len(y_test_pred), (
    f"sample_submission has {len(submission)} rows but {len(y_test_pred)} predictions were made"
)

# Write booleans (True/False) rather than the raw 0/1 integers.
# NOTE(review): the competition format is expected to use True/False for
# Transported - confirm against sample_submission.csv.
submission['Transported'] = y_test_pred.astype(bool)
submission.to_csv('submission.csv', index=False)

print("Predictions saved to submission.csv")