# Importing Libraries
We import pandas for data handling, numpy for math, and sklearn/xgboost for building our machine learning pipeline.


In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Loading Data
Reading the train and test CSV files into pandas DataFrames.


In [2]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Display first few rows of training data
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
train['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

# Dropping Columns
The 'Name' column is unique to passengers and not useful for prediction, so we drop it.


In [4]:
train.drop(columns='Name', axis=1,inplace=True)
test.drop(columns='Name', axis=1,inplace=True)


# Feature Engineering: Passenger Group
Extracting 'Group' and 'GroupMember' from the 'PassengerId' to see if travelling in a group affects the outcome.


In [5]:
train[['Group', 'GroupMember']] = train['PassengerId'].str.split('_', expand=True)

train['Group'] = train['Group'].astype(int)
train['GroupMember'] = train['GroupMember'].astype(int)




test[['Group', 'GroupMember']] = test['PassengerId'].str.split('_', expand=True)

test['Group'] = test['Group'].astype(int)
test['GroupMember'] = test['GroupMember'].astype(int)



In [6]:
train.drop(columns='PassengerId',axis=1,inplace=True)
test.drop(columns='PassengerId',axis=1,inplace=True)

# Feature Engineering: Cabin Details
Splitting the 'Cabin' 'B/0/P' format into 'Deck', 'Room', and 'Side' features.


In [7]:
train[['Deck', 'Room', 'Side']] = train['Cabin'].str.split('/', expand=True)
train['Room'] = train['Room'].astype(float)



test[['Deck', 'Room', 'Side']] = test['Cabin'].str.split('/', expand=True)
test['Room'] = test['Room'].astype(float)





In [8]:
train.drop(columns='Cabin',axis=1,inplace=True)

test.drop(columns='Cabin',axis=1,inplace=True)



In [9]:
train.isnull().sum()

HomePlanet      201
CryoSleep       217
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Group             0
GroupMember       0
Deck            199
Room            199
Side            199
dtype: int64

# Handling Missing Values (Categorical)
Filling missing categorical values (CryoSleep, VIP, etc.) with the mode (most frequent value).


In [10]:
train['CryoSleep'].fillna(train['CryoSleep'].mode()[0], inplace=True)
train['VIP'].fillna(train['VIP'].mode()[0], inplace=True)
train['Transported'].fillna(train['Transported'].mode()[0], inplace=True)


test['CryoSleep'].fillna(test['CryoSleep'].mode()[0], inplace=True)
test['VIP'].fillna(test['VIP'].mode()[0], inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['CryoSleep'].fillna(train['CryoSleep'].mode()[0], inplace=True)
  train['CryoSleep'].fillna(train['CryoSleep'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['VIP'].fillna(train['VIP'].mode()[0], inplace=True)
  train['VIP'].fillna(train['VIP

In [11]:
train['HomePlanet'].fillna(train['HomePlanet'].mode()[0], inplace=True)


test['HomePlanet'].fillna(test['HomePlanet'].mode()[0], inplace=True)




The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['HomePlanet'].fillna(train['HomePlanet'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['HomePlanet'].fillna(test['HomePlanet'].mode()[0], inplace=True)


In [12]:
train['Destination'].fillna(train['Destination'].mode()[0], inplace=True)

test['Destination'].fillna(test['Destination'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['Destination'].fillna(train['Destination'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['Destination'].fillna(test['Destination'].mode()[0], inplace=True)


In [13]:
train.isnull().sum()

HomePlanet        0
CryoSleep         0
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
Group             0
GroupMember       0
Deck            199
Room            199
Side            199
dtype: int64

# Handling Missing Values (Numerical)
Filling missing numerical values with the mode as well. (Mean or Median could also be used).


In [14]:
num_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(num_cols)

for col in num_cols:
    train[col].fillna(train[col].mode()[0], inplace=True)




num_cols = test.select_dtypes(include=['int64', 'float64']).columns.tolist()
print(num_cols)

for col in num_cols:
    test[col].fillna(test[col].mode()[0], inplace=True)


    



['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'GroupMember', 'Room']
['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Group', 'GroupMember', 'Room']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are 

In [15]:
y = train['Transported']

x = train.drop(columns='Transported', axis=1)


In [16]:
print(y)

0       False
1        True
2       False
3       False
4        True
        ...  
8688    False
8689    False
8690     True
8691    False
8692     True
Name: Transported, Length: 8693, dtype: bool


In [17]:
encode = LabelEncoder()

encode.fit_transform(y)

array([0, 1, 0, ..., 1, 0, 1], shape=(8693,))

# Label Encoding
Converting categorical text columns into numbers so the model can understand them. We fit on both columns to ensure the mapping is consistent.


In [18]:
cols = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP','Side','Deck' ]  

for col in cols:
    # Fit on combined data to ensure consistent encoding
    full_data = pd.concat([x[col], test[col]], axis=0).astype(str)
    encode.fit(full_data)
    x[col] = encode.transform(x[col].astype(str))
    test[col] = encode.transform(test[col].astype(str))


# Train-Test Split
Splitting our training data into a local train set and a validation set to check model performance before submitting.


In [19]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15,random_state=42)


In [20]:
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,GroupMember,Deck,Room,Side
4753,0,0,2,20.0,0,542.0,175.0,0.0,5.0,0.0,5070,1,5,966.0,1
4714,1,0,0,47.0,0,1904.0,18.0,0.0,89.0,4411.0,5028,1,0,61.0,1
8102,1,0,0,38.0,1,0.0,1298.0,0.0,1841.0,1547.0,8656,2,0,103.0,1
7716,0,0,2,19.0,0,5.0,666.0,0.0,123.0,440.0,8232,1,5,1574.0,1
6545,2,1,2,7.0,0,0.0,0.0,0.0,0.0,0.0,6906,3,5,1432.0,0


# Scaling Features
Scaling numerical columns to a standard range. This is important for many ML algorithms to perform well.


In [21]:
scaler = StandardScaler()

In [22]:
scale_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
x_train[scale_cols] = scaler.fit_transform(x_train[scale_cols])
x_test[scale_cols] = scaler.transform(x_test[scale_cols])
test[scale_cols] = scaler.transform(test[scale_cols])


In [23]:
x_train.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,GroupMember,Deck,Room,Side
4753,0,0,2,-0.605954,0,0.490326,-0.173974,-0.284191,-0.27029,-0.259507,5070,1,5,966.0,1
4714,1,0,0,1.274983,0,2.543023,-0.275047,-0.284191,-0.195614,3.573081,5028,1,0,61.0,1
8102,1,0,0,0.648004,1,-0.326533,0.548989,-0.284191,1.361924,1.084636,8656,2,0,103.0,1
7716,0,0,2,-0.675619,0,-0.318997,0.142121,-0.284191,-0.165388,0.122796,8232,1,5,1574.0,1
6545,2,1,2,-1.511591,0,-0.326533,-0.286635,-0.284191,-0.274735,-0.259507,6906,3,5,1432.0,0


# Model Initialization
Setting up the XGBoost classifier with specific hyperparameters.


In [24]:
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)


# Model Training
Training the model on our processing training data.


In [25]:
model.fit(x_train,y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [26]:
test_prediction = model.predict(x_test)

# Evaluation and Submission
Checking our validation accuracy and then generating predictions for the official test set to create the submission file.


In [27]:
accuracy = accuracy_score(y_test, test_prediction)
print("Accuracy:", accuracy)


# Predict on Kaggle test set
real_test_pred = model.predict(test[x_train.columns])

# Create submission
test_ids = pd.read_csv('test.csv')['PassengerId']
submission = pd.DataFrame({'PassengerId': test_ids, 'Transported': real_test_pred})
submission['Transported'] = submission['Transported'].map({1: True, 0: False})
submission.to_csv('submission5.csv', index=False)
print('submission5.csv saved!')


Accuracy: 0.7960122699386503
submission5.csv saved!


In [28]:
import joblib 

joblib.dump(model,'diabetes_model.pkl')

['diabetes_model.pkl']