In [1]:
# Diagnostic cell: investigate the "mostly zeros" predictions by refitting the
# baseline random forest and inspecting its behaviour on a held-out split.
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

PROC_DIR = Path("../data/processed")
df = pd.read_csv(PROC_DIR / "train_processed.csv")

# Target is the "Survived" column (cast to int); every other column is a feature.
y = df["Survived"].astype(int)
X = df.drop(columns=["Survived"])

# Fraction of rows per class, to compare against the predicted label counts below.
class_fractions = y.value_counts(normalize=True).rename("fraction")
print("Class balance in y:\n", class_fractions, "\n")

# Stratified 80/20 split so both parts keep the class ratio of y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Refit the current random forest and predict on the validation part.
rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
pred = rf.predict(X_val)

val_accuracy = accuracy_score(y_val, pred)
report_text = classification_report(y_val, pred, digits=3)
pred_counts = pd.Series(pred).value_counts()

print("Validation accuracy:", val_accuracy)
print("\nClassification report:\n", report_text)
print("Predicted label counts:", pred_counts, "\n")

# Fixing labels=[0, 1] pins the row/column order named in the header line.
print("Confusion matrix (rows=true [0,1], cols=pred [0,1])")
print(confusion_matrix(y_val, pred, labels=[0, 1]))


Class balance in y:
 Survived
0    0.616162
1    0.383838
Name: fraction, dtype: float64 

Validation accuracy: 0.8100558659217877

Classification report:
               precision    recall  f1-score   support

           0      0.828     0.873     0.850       110
           1      0.778     0.710     0.742        69

    accuracy                          0.810       179
   macro avg      0.803     0.791     0.796       179
weighted avg      0.808     0.810     0.808       179

Predicted label counts: 0    116
1     63
Name: count, dtype: int64 

Confusion matrix (rows=true [0,1], cols=pred [0,1])
[[96 14]
 [20 49]]


In [2]:
# Compare class-weighted LogisticRegression and RandomForest on a stratified
# validation split, then persist whichever has the higher validation ROC AUC.
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV

SEED = 42  # shared by the split and both estimators
PROC_DIR = Path("../data/processed")
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(PROC_DIR / "train_processed.csv")
X = df.drop(columns=["Survived"])
y = df["Survived"].astype(int)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)


def _report_validation(header, y_true, proba, threshold=0.5):
    """Print validation metrics for one model and return its labels and AUC.

    Parameters
    ----------
    header : str
        Line printed before the metrics (printed verbatim).
    y_true : array-like
        True binary labels of the validation rows.
    proba : numpy.ndarray
        Predicted probability of the positive class for the same rows.
    threshold : float, default 0.5
        Rows with ``proba >= threshold`` are labelled 1, the rest 0.

    Returns
    -------
    tuple
        ``(labels, auc)``: the thresholded integer predictions and the ROC AUC
        computed from ``proba``.
    """
    labels = (proba >= threshold).astype(int)
    auc = roc_auc_score(y_true, proba)
    print(header)
    print("Val ACC:", accuracy_score(y_true, labels))
    print("Val AUC:", auc)
    print("Pred counts:", pd.Series(labels).value_counts().to_dict())
    print(classification_report(y_true, labels, digits=3))
    return labels, auc


# 1) Logistic Regression with class weights
logreg = LogisticRegression(max_iter=2000, class_weight="balanced", random_state=SEED)
logreg.fit(X_train, y_train)
p_val = logreg.predict_proba(X_val)[:, 1]
pred_def, logreg_auc = _report_validation(
    "=== LogisticRegression (balanced) ===", y_val, p_val
)

# Find a better threshold (maximize F1) on a 0.20..0.80 grid in 0.05 steps.
# NOTE(review): the threshold is tuned on the same validation rows it is scored
# on, so the reported F1 is optimistic. It is only printed here, not stored
# with the saved model.
thresh_grid = np.linspace(0.2, 0.8, 13)
f1_by_threshold = [(f1_score(y_val, (p_val >= t).astype(int)), t) for t in thresh_grid]
# max() keeps the first of several equal scores, i.e. the lowest tied threshold.
best_f1, best_t = max(f1_by_threshold, key=lambda pair: pair[0])
print(f"Best threshold for F1 on val: {best_t:.2f} (F1={best_f1:.3f})")

# 2) RandomForest with class weights
rf_bal = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1,
)
rf_bal.fit(X_train, y_train)
rf_p = rf_bal.predict_proba(X_val)[:, 1]
rf_pred, rf_auc = _report_validation("\n=== RandomForest (balanced) ===", y_val, rf_p)

# Choose the better of the two by AUC (ties go to the random forest).
# NOTE(review): selection uses the same validation split as the report above,
# so the winner's score is not an unbiased estimate.
if rf_auc >= logreg_auc:
    use_model, chosen = rf_bal, "rf_balanced"
else:
    use_model, chosen = logreg, "logreg_balanced"
joblib.dump(use_model, OUT_DIR / f"{chosen}.pkl")
print(f"\n✅ Saved chosen model: {chosen}.pkl")


=== LogisticRegression (balanced) ===
Val ACC: 0.7932960893854749
Val AUC: 0.8561264822134387
Pred counts: {0: 107, 1: 72}
              precision    recall  f1-score   support

           0      0.841     0.818     0.829       110
           1      0.722     0.754     0.738        69

    accuracy                          0.793       179
   macro avg      0.782     0.786     0.784       179
weighted avg      0.795     0.793     0.794       179

Best threshold for F1 on val: 0.65 (F1=0.746)

=== RandomForest (balanced) ===
Val ACC: 0.8156424581005587
Val AUC: 0.8396574440052702
Pred counts: {0: 115, 1: 64}
              precision    recall  f1-score   support

           0      0.835     0.873     0.853       110
           1      0.781     0.725     0.752        69

    accuracy                          0.816       179
   macro avg      0.808     0.799     0.803       179
weighted avg      0.814     0.816     0.814       179


✅ Saved chosen model: logreg_balanced.pkl
