# بسم الله الرحمن الرحيم

# 📦 Imports


In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold


# 📁 Load data



In [44]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 🎯 Save target and PassengerId


In [45]:
y = train["Transported"]
test_ids = test["PassengerId"]

# 🧬 Feature Engineering: Extract from PassengerId (Group, Number)


In [46]:
for df in [train, test]:
    df[["Group", "Number"]] = df["PassengerId"].str.split("_", expand=True)
    df["Group"] = df["Group"].astype(int)
    df["Number"] = df["Number"].astype(int)


# 🧬 Feature Engineering: Extract from Cabin (Deck, Num, Side)


In [47]:
for df in [train, test]:
    df[["Deck", "CabinNum", "Side"]] = df["Cabin"].str.split("/", expand=True)


# 📤 Drop unused original columns


In [48]:
train.drop(columns=["PassengerId", "Name", "Cabin", "Transported"], inplace=True)
test.drop(columns=["PassengerId", "Name", "Cabin"], inplace=True)

# 🔁 Combine train/test for preprocessing


In [49]:
all_data = pd.concat([train, test], axis=0)


# 🔧 Fill missing values


In [50]:
# Impute booleans
for col in ["CryoSleep", "VIP"]:
    all_data[col] = all_data[col].fillna(False)

# Impute numerics
for col in ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]:
    all_data[col] = all_data[col].fillna(0)

# Impute cabin features
all_data["Deck"] = all_data["Deck"].fillna("Unknown")
all_data["Side"] = all_data["Side"].fillna("Unknown")
all_data["CabinNum"] = all_data["CabinNum"].fillna("0")

# One-hot encode categoricals
all_data = pd.get_dummies(all_data)

  all_data[col] = all_data[col].fillna(False)


# 🧠 Split back


In [51]:
X = all_data.iloc[:len(y), :]
X_test = all_data.iloc[len(y):, :]

# 🔀 Train/test split


In [52]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
final_preds = np.zeros(len(X_test))

# ⚙️ XGBoost model


In [53]:
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)

# ⚙️ Train LightGBM


In [54]:
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [55]:
for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # Train XGBoost
    xgb_model.fit(X_tr, y_tr)
    xgb_preds = xgb_model.predict_proba(X_test)[:, 1]

    # Train LGBM
    lgb_model.fit(X_tr, y_tr)
    lgb_preds = lgb_model.predict_proba(X_test)[:, 1]

    # Average predictions
    final_preds += (xgb_preds + lgb_preds) / 2

# Average over all folds
final_preds /= skf.get_n_splits()

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1659
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1661
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3502, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001890 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1660
[LightGBM] [Info] Number of data points in the train set: 6954, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503595 -> initscore=0.014380
[LightGBM] [Info] Start training from score 0.014380


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002118 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1660
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 3503, number of negative: 3452
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1659
[LightGBM] [Info] Number of data points in the train set: 6955, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.503666 -> initscore=0.014666
[LightGBM] [Info] Start training from score 0.014666


# 📈 Predict on validation set


In [56]:
xgb_val = xgb_model.predict_proba(X_val)[:, 1]
lgb_val = lgb_model.predict_proba(X_val)[:, 1]

# Ensemble (simple average)
val_preds = (xgb_val + lgb_val) / 2
val_class = val_preds > 0.5
val_acc = accuracy_score(y_val, val_class)
print(f"Ensemble Validation Accuracy: {val_acc:.4f}")

Ensemble Validation Accuracy: 0.7975


# 🔮 Predict on test set


In [57]:
xgb_test = xgb_model.predict_proba(X_test)[:, 1]
lgb_test = lgb_model.predict_proba(X_test)[:, 1]
test_preds = (xgb_test + lgb_test) / 2
test_class = test_preds > 0.5

# 🧾 Create submission


In [58]:
submission = pd.DataFrame({
    "PassengerId": test_ids,
    "Transported": test_class.astype(bool)
    
})
submission.to_csv("submission.csv", index=False)