In [1]:
# STEP 3 — Model Training
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# --- 1) Load processed data fresh ---
PROC_DIR = Path("../data/processed")
df = pd.read_csv(PROC_DIR / "train_processed.csv")

# --- 2) Separate features and target ---
X = df.drop(columns=["Survived"], errors="ignore")
y = df["Survived"]

# --- 3) Split into train/validation sets ---
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 4) Initialize models ---
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# --- 5) Train and evaluate each model ---
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    results.append((name, acc))
    print(f"{name}: {acc:.4f}")

# --- 6) Display leaderboard ---
results_df = pd.DataFrame(results, columns=["Model", "Validation_Accuracy"])
results_df = results_df.sort_values("Validation_Accuracy", ascending=False).reset_index(drop=True)

print("\n🏆 Model Leaderboard")
print(results_df)

# --- 7) Save best model (example: RandomForest) ---
best_model = RandomForestClassifier(n_estimators=200, random_state=42)
best_model.fit(X, y)

import joblib
Path("../outputs").mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, "../outputs/random_forest_model.pkl")

print("\n✅ Training complete. Best model saved to ../outputs/random_forest_model.pkl")


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression: 0.8045
DecisionTree: 0.7039
RandomForest: 0.8101

🏆 Model Leaderboard
                Model  Validation_Accuracy
0        RandomForest             0.810056
1  LogisticRegression             0.804469
2        DecisionTree             0.703911

✅ Training complete. Best model saved to ../outputs/random_forest_model.pkl
