# Task 1

## Importing the required data

In [1]:
import pandas as pd
import matplotlib as plt
import numpy as np

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

## Imputing Values and Encoding Values

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

num_imp = SimpleImputer(strategy='constant', fill_value=0)
cat_imp = SimpleImputer(strategy='constant')
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

## Preprocessing Data

In [3]:
def preprocess(df, is_train=True):
    df = df.copy()

    #cabin split
    df[['Deck', 'CabinNum', 'Side']] = df['Cabin'].str.split('/', expand=True)
    df['Group'] = df['PassengerId'].str.split('_').str[0]
    
    # Map booleans
    df['CryoSleep'] = df['CryoSleep'].map({'True': 1, 'False': 0})
    df['VIP'] = df['VIP'].map({'True': 1, 'False': 0})

    if not is_train:
        passenger_ids = df['PassengerId']
    else:
        passenger_ids = None

    df.drop(columns=['PassengerId', 'Cabin', 'Name'], inplace=True)

    if is_train:
        y = df['Transported'].astype(int)
        df.drop(columns='Transported', inplace=True)
    else:
        y = None
    # Feature engineering
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    df['TotalSpend'] = df[spend_cols].sum(axis=1)
    df['SpentAny'] = (df['TotalSpend'] > 0).astype(int)
    df['CryoSpend'] = df['CryoSleep'] * df['SpentAny']

    # Numerical columns
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if is_train:
        df[num_cols] = num_imp.fit_transform(df[num_cols])
    else:
        df[num_cols] = num_imp.transform(df[num_cols])

    # Categorical columns
    cat_cols = df.select_dtypes(include=['object']).columns
    if is_train:
        df[cat_cols] = cat_imp.fit_transform(df[cat_cols])
        df[cat_cols] = encoder.fit_transform(df[cat_cols])
    else:
        df[cat_cols] = cat_imp.transform(df[cat_cols])
        df[cat_cols] = encoder.transform(df[cat_cols])

    return df, y, passenger_ids

## Specifying X and y for model and Splitting the same

In [4]:
X_all, y_all, _ = preprocess(train_df, is_train=True)
X_test, _, test_ids = preprocess(test_df, is_train=False)

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

## Class weights and Gridsearch

In [5]:
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

cw = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
cw_dict = {i: w for i, w in enumerate(cw)}

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt']
}

grid = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight=cw_dict),
    param_grid, cv=3, scoring='accuracy', n_jobs=-1
)
grid.fit(X_train, y_train)
best_rf = grid.best_estimator_

print("Best RF Params:", grid.best_params_)
print("RF Val Accuracy:", accuracy_score(y_val, best_rf.predict(X_val)))

Best RF Params: {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
RF Val Accuracy: 0.7918343875790684


## Using XGBoost/XGBClassifier

In [6]:
from xgboost import XGBClassifier

xgb = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05,
                    use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
print("XGB Val Accuracy:", accuracy_score(y_val, xgb.predict(X_val)))

model = xgb if accuracy_score(y_val, xgb.predict(X_val)) > accuracy_score(y_val, best_rf.predict(X_val)) else best_rf
print(" Using model:", type(model).__name__)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGB Val Accuracy: 0.8010350776308223
 Using model: XGBClassifier


## Getting Training values

In [7]:
final_train_acc = accuracy_score(y_train, model.predict(X_train))
final_val_acc = accuracy_score(y_val, model.predict(X_val))

print(f" Final Model: {type(model).__name__}")
print(f" Train Accuracy: {final_train_acc:.4f}")
print(f" Validation Accuracy: {final_val_acc:.4f}")

 Final Model: XGBClassifier
 Train Accuracy: 0.8759
 Validation Accuracy: 0.8010


### we can do better than this so we try better case with XGBoost

In [8]:
xgb_new_features = XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.05,
                                 use_label_encoder=False, eval_metric='logloss', random_state=42)

xgb_new_features.fit(X_train, y_train)

y_pred_val_new_features = xgb_new_features.predict(X_val)

accuracy_new_features = accuracy_score(y_val, y_pred_val_new_features)
print(f"XGB Val Accuracy with new features: {accuracy_new_features:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGB Val Accuracy with new features: 0.8010


### no change in accuracy so we shift to new model

## Using LightGBM and CatBosst

In [9]:
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

lgbm = LGBMClassifier(random_state=42)
lgbm.fit(X_train, y_train)

y_pred_proba_lgbm = lgbm.predict_proba(X_val)[:, 1]

roc_auc_lgbm = roc_auc_score(y_val, y_pred_proba_lgbm)
print(f"LightGBM Val ROC AUC: {roc_auc_lgbm:.4f}")

catboost = CatBoostClassifier(random_state=42, verbose=0)
catboost.fit(X_train, y_train)

y_pred_proba_catboost = catboost.predict_proba(X_val)[:, 1]

roc_auc_catboost = roc_auc_score(y_val, y_pred_proba_catboost)
print(f"CatBoost Val ROC AUC: {roc_auc_catboost:.4f}")

y_pred_proba_xgb = xgb_new_features.predict_proba(X_val)[:, 1]
roc_auc_xgb = roc_auc_score(y_val, y_pred_proba_xgb)
print(f"XGBoost Val ROC AUC: {roc_auc_xgb:.4f}")

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2140
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
LightGBM Val ROC AUC: 0.8930
CatBoost Val ROC AUC: 0.8967
XGBoost Val ROC AUC: 0.8913


## Trying new parameters

In [10]:
lgbm_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [0, 0.1, 0.5]
}

lgbm_grid_search = GridSearchCV(LGBMClassifier(random_state=42), lgbm_param_grid, cv=3, scoring='roc_auc', n_jobs=-1)

lgbm_grid_search.fit(X_train, y_train)

print("Best LightGBM Params:", lgbm_grid_search.best_params_)

catboost_param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [1, 3, 5]
}

catboost_grid_search = GridSearchCV(CatBoostClassifier(random_state=42, verbose=0), catboost_param_grid, cv=3, scoring='roc_auc', n_jobs=-1)

catboost_grid_search.fit(X_train, y_train)

print(" Best CatBoost Params:", catboost_grid_search.best_params_)

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2140
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Best LightGBM Params: {'learning_rate': 0.05, 'max_depth': 7, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0.1}
 Best CatBoost Params: {'depth': 7, 'iterations': 300, 'l2_leaf_reg': 5, 'learning_rate': 0.05}


In [11]:
best_lgbm = LGBMClassifier(**lgbm_grid_search.best_params_, random_state=42)
best_lgbm.fit(X_train, y_train)

y_pred_proba_best_lgbm = best_lgbm.predict_proba(X_val)[:, 1]

roc_auc_best_lgbm = roc_auc_score(y_val, y_pred_proba_best_lgbm)
print(f"Tuned LightGBM Val ROC AUC: {roc_auc_best_lgbm:.4f}")

best_catboost = CatBoostClassifier(**catboost_grid_search.best_params_, random_state=42, verbose=0)
best_catboost.fit(X_train, y_train)

y_pred_proba_best_catboost = best_catboost.predict_proba(X_val)[:, 1]

roc_auc_best_catboost = roc_auc_score(y_val, y_pred_proba_best_catboost)
print(f"Tuned CatBoost Val ROC AUC: {roc_auc_best_catboost:.4f}")

[LightGBM] [Info] Number of positive: 3500, number of negative: 3454
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004328 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2140
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503307 -> initscore=0.013230
[LightGBM] [Info] Start training from score 0.013230
Tuned LightGBM Val ROC AUC: 0.8907
Tuned CatBoost Val ROC AUC: 0.8949


## Out of Both, CATBoost is better so we go with that

In [13]:
from sklearn.model_selection import cross_val_score
selected_model = best_catboost

cv_scores = cross_val_score(selected_model, X_all, y_all, cv=5, scoring='roc_auc')

print("Cross-validation ROC AUC scores:", cv_scores)
print(f"Mean Cross-validation ROC AUC: {cv_scores.mean():.4f}")

Cross-validation ROC AUC scores: [0.83952589 0.70543183 0.85741837 0.91494653 0.88960901]
Mean Cross-validation ROC AUC: 0.8414


In [14]:
from sklearn.metrics import roc_auc_score

y_pred_proba_catboost_final = best_catboost.predict_proba(X_val)[:, 1]

roc_auc_catboost_final = roc_auc_score(y_val, y_pred_proba_catboost_final)

print(f"Final CatBoost Val ROC AUC: {roc_auc_catboost_final:.4f}") 

Final CatBoost Val ROC AUC: 0.8949


## Final Testing and Prediction

In [15]:
# Use the predict_proba() method of the best_catboost model to predict the probabilities
y_pred_proba_test = best_catboost.predict_proba(X_test)[:, 1]

# Create a pandas DataFrame named submission
submission = pd.DataFrame({
    "PassengerId": test_ids,
    "Transported": (y_pred_proba_test > 0.5).astype(bool)
})

# Save the submission DataFrame to a CSV file
submission.to_csv("submission_best.csv", index=False)

# Print a confirmation message
print("🎉 submission_best.csv saved!")

🎉 submission_best.csv saved!
