In [18]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBClassifier

In [19]:
# Load the training and test data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [20]:
# Combine train and test data for consistent encoding
train_data['is_train'] = 1
test_data['is_train'] = 0
combined_data = pd.concat([train_data, test_data], sort=False)

In [21]:
# Extracting numerical features
num_features = combined_data.select_dtypes(exclude = ['object', 'bool']).columns.tolist()
combined_data[num_features].head()

# Extracting categorical features
cat_features = combined_data.select_dtypes(include = ['object']).columns.tolist()
combined_data[cat_features].head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,False,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,False,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,True,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,False,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,False,Willy Santantines,True


In [22]:
# Impute missing values with Median for each Numerical feature
for col in num_features:
    combined_data[col] = combined_data[col].fillna(combined_data[col].median())

# Impute missing values with Mode for each Categorical feature
for col in cat_features:
    combined_data[col] = combined_data[col].fillna(combined_data[col].mode()[0])


In [23]:
# Encode categorical features
encoder = OrdinalEncoder()
combined_data[['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']] = encoder.fit_transform(combined_data[['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']])

# Split the data back into train and test sets
train_data = combined_data[combined_data['is_train'] == 1].drop(['is_train'], axis=1)
test_data = combined_data[combined_data['is_train'] == 0].drop(['is_train'], axis=1)

# Select features and target
features = train_data.drop(['PassengerId', 'Name', 'Transported'], axis=1).columns
target = ['Transported']

X = train_data[features]
y = train_data[target]

In [24]:
# Define the objective function for Optuna
def objective(trial):
    xgb_params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1),
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    xgb = XGBClassifier(**xgb_params)
    score = cross_val_score(xgb, X, y.values.ravel(), cv=5, scoring='accuracy').mean()
    return score

In [25]:
# Optimize the hyperparameters using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2023-03-21 13:32:50,612][0m A new study created in memory with name: no-name-0fbd0947-1394-4cf6-ae51-f06e36a2585d[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1),
[32m[I 2023-03-21 13:32:58,980][0m Trial 0 finished with value: 0.7817854261969532 and parameters: {'max_depth': 8, 'learning_rate': 0.0454583697469847, 'n_estimators': 54, 'subsample': 0.8, 'colsample_bytree': 0.7}. Best is trial 0 with value: 0.7817854261969532.[0m
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.2),
  'subsample': trial.suggest_discrete_uniform('subsample', 0.5, 1.0, 0.1),
  'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.5, 1.0, 0.1),
[32m[I 2023-03-21 13:33:11,049][0m Trial 1 finished with value: 0.7906413550636551 and parameters: {'max_depth': 6, 'learning_

In [26]:
# Train the model with the best hyperparameters
best_params = study.best_params
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X, y.values.ravel())

# Predict Transported for test data
X_test = test_data[features]
y_test_pred = xgb_best.predict(X_test)

In [28]:
# Create submission file
submission = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Transported': y_test_pred})
submission.to_csv('XGBoost_submission.csv', index=False)

print("Submission file created: submission.csv")

Submission file created: submission.csv
