In [None]:
import pandas as pd

# Read the already-cleaned train and test tables from disk in one pass.
train_df, test_df = map(
    pd.read_csv,
    ("data/train_cleaned.csv", "data/test_cleaned.csv"),
)

# Display the first five training rows as a quick sanity check of the load.
train_df.head(5)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Features (X): every column except the target. Target (y): the "survived" column.
X = train_df.drop(columns="survived")
y = train_df["survived"]

# Hold out 20% of the rows for validation.
# `stratify=y` keeps the proportion of each `survived` class the same in the
# training and validation parts, so the validation accuracy is not distorted by
# an unlucky draw of the minority class. The fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMClassifier

# Base estimator handed to the grid search.
# - subsample_freq=1: LightGBM applies row subsampling (`subsample`) only when
#   the bagging frequency is greater than 0. With the default of 0, every
#   `subsample` value in the grid below would train the same model, tripling
#   the search cost for no benefit.
# - verbose=-1: silences LightGBM's per-fit info/warning lines, which otherwise
#   flood the cell output across the thousands of fits performed here.
lgbm = LGBMClassifier(random_state=42, subsample_freq=1, verbose=-1)

# Exhaustive grid: 3 values for each of 7 parameters -> 3**7 = 2187 candidates,
# each fitted once per CV fold (cv=3 -> 6561 fits, plus the final refit).
param_grid = {
    'max_depth': [5, 7, 10],
    'num_leaves': [15, 31, 50],
    'min_child_samples': [5, 10, 20],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200, 400]
}

# verbose=1 prints only the search summary instead of one line per fit.
grid_search_lgbm = GridSearchCV(lgbm, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_lgbm.fit(X_train, y_train)

# Report the winning parameter combination and its mean cross-validated accuracy.
print("Best LightGBM params:", grid_search_lgbm.best_params_)
print("Best LightGBM CV score:", grid_search_lgbm.best_score_)


ng] No further splits with positive gain, best gain: -inf
















[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=5, min_child_samples=20, n_estimators=100, num_leaves=50, subsample=0.6; total time=   1.4s
[LightGBM] [Info] Number of positive: 179, number of negative: 296
[CV] END colsample_bytree=1.0, learning_rate=0.01, max_depth=5, min_child_samples=10, n_estimators=400, num_leaves=50, subsample=1.0; total time=   8.2s
[LightGBM] [Info] Number of positive: 179, number of negative: 296
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91
[LightGBM] [Info] Number of data points in the train set: 475, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.376842 -> initscore=-0.502974
[LightGBM] [Info] Start training from score -0.502974
[CV

In [None]:

# Models to compare, keyed by the display name used in the printed report.
# Every estimator that draws random numbers is seeded with the same value used
# for the train/validation split, so the accuracies below are repeatable across
# kernel restarts. LightGBM's own logging is silenced (verbose=-1) so that only
# the accuracy lines appear in the output.
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42, verbose=-1)
}

# Train each model on the training part and score it on the held-out validation
# part; `results` maps model name -> validation accuracy.
results = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    results[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")


In [None]:
# Free-form notes kept as a bare string literal; no other cell reads this value.
# It lists one accuracy per model name used in the comparison loop, in two
# groups labelled "before" and "after" feature engineering.
# NOTE(review): these figures are presumably copied by hand from earlier runs of
# the comparison loop - confirm before relying on them.
# The "Chosen model" entry has not been filled in yet.
'''
Results before feature engineering:
    Logistic Regression Accuracy: 0.7989
    Decision Tree Accuracy: 0.8101
    Random Forest Accuracy: 0.8324
    XGBoost Accuracy: 0.8380
    LightGBM Accuracy: 0.8268

Results after feature engineering:
    Logistic Regression Accuracy: 0.8101
    Decision Tree Accuracy: 0.8268
    Random Forest Accuracy: 0.8436
    XGBoost Accuracy: 0.8212
    LightGBM Accuracy: 0.8436

Chosen model: 

'''