# Titanic — Clean Feature Engineering + CV Ensemble (CatBoost / LightGBM)

This notebook:
- Builds strong, compact features (Title, FamilySize, Deck, TicketPrefix, etc.)
- Uses Stratified K-Fold CV with OOF probabilities
- Tunes a classification threshold on OOF
- Blends CatBoost + LightGBM and outputs `/kaggle/working/submission.csv`


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix

from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

# Global configuration shared by every cell below.
SEED = 42       # seed passed to the CV splitters and both model constructors
N_SPLITS = 10   # number of stratified CV folds; also the divisor when averaging test predictions

# Kaggle input/output locations.
TRAIN_PATH = "/kaggle/input/titanic/train.csv"
TEST_PATH  = "/kaggle/input/titanic/test.csv"
WORKDIR = "/kaggle/working"

# NOTE(review): `rng` is not referenced by any of the visible cells.
rng = np.random.default_rng(SEED)

pd.set_option("display.max_columns", 200)
# Seed NumPy's legacy global RNG as well.
np.random.seed(SEED)


In [None]:
# Load the raw train/test CSVs; the trailing expression displays their shapes.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

train.shape, test.shape


In [None]:
# Preview the first rows of the raw training frame.
train.head()


In [None]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with engineered passenger features added.

    Reads the columns ``Pclass``, ``Name``, ``SibSp``, ``Parch``, ``Cabin``,
    ``Ticket`` and ``Fare``. The input frame is left untouched.

    Columns written:
        Pclass: cast to ``str`` so it can be treated as a category.
        Title: honorific parsed from ``Name``; synonyms are merged, and any
            title that is missing or occurs fewer than 10 times in ``df``
            becomes ``"Rare"``.
        FamilySize: ``SibSp + Parch + 1``.
        IsAlone: 1 when ``FamilySize == 1``, else 0.
        CabinKnown: 1 when ``Cabin`` is present, else 0.
        Deck: first character of ``Cabin``; ``"U"`` when unavailable.
        TicketPrefix: ``Ticket`` with digits and whitespace removed
            (punctuation is kept); ``"NUM"`` when nothing remains.
        FarePerPerson: ``Fare / FamilySize`` with infinities mapped to NaN.

    Note:
        The "fewer than 10" rule uses counts from ``df`` itself, so frames
        processed separately may fold different titles into ``"Rare"``.
    """
    df = df.copy()
    df["Pclass"] = df["Pclass"].astype(str)

    # Honorific = the word immediately preceding the first "." after a space.
    title = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    title = title.replace({
        "Mlle": "Miss",
        "Ms": "Miss",
        "Mme": "Mrs",
        "Lady": "Rare",
        "Countess": "Rare",
        "Capt": "Rare",
        "Col": "Rare",
        "Don": "Rare",
        "Dr": "Rare",
        "Major": "Rare",
        "Rev": "Rare",
        "Sir": "Rare",
        "Jonkheer": "Rare",
        "Dona": "Rare",
    })
    # Fold any remaining low-frequency title into "Rare" as well.
    counts = title.value_counts()
    rare_titles = counts[counts < 10].index
    title = title.mask(title.isin(rare_titles), "Rare")
    df["Title"] = title.fillna("Rare")

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    # Use an explicit missing-value mask rather than stringifying NaN to
    # "nan" and matching on "n": that trick would also wipe out a genuine
    # cabin code starting with a lowercase "n". ``astype(str)`` is kept so
    # the ``.str`` accessor works even when the whole column is NaN.
    cabin = df["Cabin"]
    has_cabin = cabin.notna()
    df["CabinKnown"] = has_cabin.astype(int)
    df["Deck"] = cabin.astype(str).str[0].where(has_cabin).fillna("U")

    ticket_prefix = (
        df["Ticket"]
        .astype(str)
        .str.replace(r"\d+", "", regex=True)
        .str.replace(r"\s+", "", regex=True)
        .str.strip()
    )
    # Purely numeric tickets have no prefix left; give them a shared label.
    df["TicketPrefix"] = ticket_prefix.replace("", "NUM")

    fare_per_person = df["Fare"] / df["FamilySize"]
    df["FarePerPerson"] = fare_per_person.replace([np.inf, -np.inf], np.nan)

    return df

def impute(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with Embarked, Fare, FarePerPerson and Age filled.

    Expects the output of ``add_features`` (needs ``Title``, ``Pclass``,
    ``Sex``, ``FarePerPerson`` and, ideally, ``FamilySize``). All fill
    statistics are computed from ``df`` itself.

    Fill rules:
        Embarked: most frequent value (left as-is if every value is missing).
        Fare: column median.
        FarePerPerson: recomputed as imputed ``Fare / FamilySize``; falls back
            to the ``FarePerPerson`` median when ``FamilySize`` is absent.
        Age: median within (Title, Pclass, Sex), then within (Pclass, Sex),
            then the overall median for anything still missing.
    """
    df = df.copy()

    embarked_mode = df["Embarked"].mode(dropna=True)
    if not embarked_mode.empty:
        df["Embarked"] = df["Embarked"].fillna(embarked_mode.iloc[0])

    df["Fare"] = df["Fare"].fillna(df["Fare"].median())

    # A missing per-person fare should stay consistent with the (now imputed)
    # fare and the passenger's family size, not be set to a whole-ticket fare.
    if "FamilySize" in df.columns:
        per_person_fill = df["Fare"] / df["FamilySize"]
    else:
        per_person_fill = df["FarePerPerson"].median()
    df["FarePerPerson"] = df["FarePerPerson"].fillna(per_person_fill)

    # Coarser grouping levels are only consulted for rows the finer level
    # could not fill (groups whose ages are all missing yield a NaN median).
    for keys in (["Title", "Pclass", "Sex"], ["Pclass", "Sex"]):
        if not df["Age"].isna().any():
            break
        group_median = df.groupby(keys)["Age"].transform("median")
        df["Age"] = df["Age"].fillna(group_median)

    df["Age"] = df["Age"].fillna(df["Age"].median())
    return df


In [None]:
# Engineer and impute each split independently.
# NOTE(review): rare-title folding and the imputation statistics are therefore
# derived from each frame on its own, not from train alone.
train_fe = impute(add_features(train))
test_fe = impute(add_features(test))

target = "Survived"
id_col = "PassengerId"

# Drop the label, the raw text columns that were turned into engineered
# features, and the passenger id: it is only a row identifier (used again
# when writing the submission) and should not be offered to the models.
drop_cols = [target, id_col, "Name", "Ticket", "Cabin"]
X = train_fe.drop(columns=drop_cols)
y = train_fe[target].astype(int).to_numpy()

# The test frame has no label column, so skip it when dropping.
X_test = test_fe.drop(columns=[c for c in drop_cols if c != target])

# Mark categorical columns so both model cells can treat them as categories.
cat_cols = ["Pclass", "Sex", "Embarked", "Title", "Deck", "TicketPrefix"]
for c in cat_cols:
    X[c] = X[c].astype("category")
    X_test[c] = X_test[c].astype("category")

X.shape, X_test.shape


## Quick EDA (compact visuals)


In [None]:
# Bar chart of the ten raw training columns with the highest share of NaNs.
miss = train.isna().mean().sort_values(ascending=False)
top_missing = miss.iloc[:10]

plt.figure(figsize=(8,4))
plt.bar(top_missing.index, top_missing.values)
plt.title("Top missing-rate columns (train)")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Side-by-side bar charts of the mean survival rate per category value.
tmp = train_fe.copy()
tmp["Pclass"] = tmp["Pclass"].astype(str)

fig, axes = plt.subplots(1, 3, figsize=(12, 3.5))

# One panel per grouping column, drawn left to right.
for ax, col in zip(axes, ("Sex", "Pclass", "Embarked")):
    rate = tmp.groupby(col)["Survived"].mean()
    ax.bar(rate.index, rate.values)
    ax.set_title(f"Survival rate by {col}")

plt.tight_layout()
plt.show()


In [None]:
def tune_threshold(y_true, proba):
    """Find the probability cut-off that maximises accuracy.

    Scans 181 evenly spaced thresholds from 0.05 to 0.95 (step 0.005); a
    sample is predicted positive when ``proba >= threshold``. When several
    thresholds tie for the best accuracy, the lowest one is returned.

    Args:
        y_true: 1-D array-like of 0/1 labels.
        proba: 1-D array-like of positive-class probabilities, same length.

    Returns:
        ``(best_threshold, best_accuracy)`` as plain Python floats.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    y_true = np.asarray(y_true).ravel()
    proba = np.asarray(proba, dtype=float).ravel()
    if y_true.size == 0 or y_true.shape != proba.shape:
        raise ValueError(
            "y_true and proba must be non-empty and have the same length"
        )

    ths = np.linspace(0.05, 0.95, 181)
    # Score every threshold in one shot: rows = thresholds, cols = samples.
    # This replaces 181 separate metric calls per invocation, which adds up
    # when the function is called repeatedly inside a blend-weight search.
    preds = (proba[None, :] >= ths[:, None]).astype(int)
    accs = (preds == y_true[None, :]).mean(axis=1)

    # argmax returns the first maximum, i.e. the lowest tied threshold.
    best = int(np.argmax(accs))
    return float(ths[best]), float(accs[best])


In [None]:
# CatBoost with stratified K-fold CV: collect out-of-fold (OOF) probabilities
# for the training rows and the fold-averaged probabilities for the test rows.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_cb = np.zeros(len(X), dtype=float)
test_cb = np.zeros(len(X_test), dtype=float)

# CatBoost is given the categorical columns by position.
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

# The test pool is the same for every fold, so build it once up front.
te_pool = Pool(X_test, cat_features=cat_idx)

for fold, (trn, val) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X.iloc[trn], X.iloc[val]
    y_tr, y_va = y[trn], y[val]

    tr_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
    va_pool = Pool(X_va, y_va, cat_features=cat_idx)

    m = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Accuracy",
        iterations=20000,
        learning_rate=0.03,
        depth=7,
        l2_leaf_reg=5.0,
        subsample=0.85,
        colsample_bylevel=0.85,
        random_seed=SEED,
        early_stopping_rounds=400,
        verbose=False
    )

    # NOTE(review): the validation fold drives early stopping / best-iteration
    # selection and is also what the OOF predictions are made on, so scores
    # computed from `oof_cb` are optimistic.
    m.fit(tr_pool, eval_set=va_pool, use_best_model=True)
    oof_cb[val] = m.predict_proba(va_pool)[:, 1]
    # Accumulate the mean of the per-fold test probabilities.
    test_cb += m.predict_proba(te_pool)[:, 1] / N_SPLITS

# Best accuracy threshold on the OOF probabilities; displayed as (acc, thr).
t_cb, a_cb = tune_threshold(y, oof_cb)
a_cb, t_cb


In [None]:
# Hard OOF labels at the tuned CatBoost threshold, then the confusion matrix
# of true labels (first argument) versus those predictions.
pred_cb = (oof_cb >= t_cb).astype(int)
cm = confusion_matrix(y, pred_cb)
cm


In [None]:
# Heat-map of the CatBoost OOF confusion matrix with the count in each cell.
plt.figure(figsize=(4,4))
plt.imshow(cm)
plt.title("CatBoost OOF Confusion Matrix")
plt.xticks([0,1], ["Pred 0", "Pred 1"])
plt.yticks([0,1], ["True 0", "True 1"])

# Matrix row i is the true class (y axis), column j the predicted class (x axis).
for (i, j), count in np.ndenumerate(cm):
    plt.text(j, i, str(count), ha="center", va="center")

plt.tight_layout()
plt.show()


In [None]:
# LightGBM with the same stratified K-fold setup (same seed and fold count as
# the CatBoost cell): OOF probabilities for train, fold-averaged for test.
skf2 = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_lgb = np.zeros(len(X), dtype=float)
test_lgb = np.zeros(len(X_test), dtype=float)

for fold, (trn, val) in enumerate(skf2.split(X, y), 1):
    X_tr, X_va = X.iloc[trn], X.iloc[val]
    y_tr, y_va = y[trn], y[val]

    m = lgb.LGBMClassifier(
        n_estimators=20000,
        learning_rate=0.02,
        num_leaves=64,
        max_depth=-1,
        min_child_samples=25,
        subsample=0.85,
        # Row subsampling only takes effect when the bagging frequency is
        # non-zero; with the default of 0 the `subsample` value is ignored.
        subsample_freq=1,
        colsample_bytree=0.85,
        reg_lambda=1.0,
        random_state=SEED
    )

    # NOTE(review): the validation fold drives early stopping and is also
    # what the OOF predictions are made on, so scores computed from
    # `oof_lgb` are optimistic.
    m.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(400, verbose=False)]
    )

    oof_lgb[val] = m.predict_proba(X_va)[:, 1]
    # Accumulate the mean of the per-fold test probabilities.
    test_lgb += m.predict_proba(X_test)[:, 1] / N_SPLITS

# Best accuracy threshold on the OOF probabilities; displayed as (acc, thr).
t_lgb, a_lgb = tune_threshold(y, oof_lgb)
a_lgb, t_lgb


In [None]:
# Grid-search the CatBoost weight (0.00 .. 1.00 in 0.01 steps); for each weight
# tune the threshold on the blended OOF probabilities and record the result.
candidates = []
for w in np.linspace(0, 1, 101):
    t, a = tune_threshold(y, w * oof_cb + (1 - w) * oof_lgb)
    candidates.append({"acc": a, "w": float(w), "t": float(t)})

# max() keeps the first of equally good candidates, i.e. the smallest weight.
best = max(candidates, key=lambda cand: cand["acc"])
best


In [None]:
# Apply the selected blend weight (w = CatBoost share) and threshold.
w = best["w"]
t = best["t"]

# Same linear blend for the OOF and the test probabilities.
oof_blend = w * oof_cb + (1 - w) * oof_lgb
test_blend = w * test_cb + (1 - w) * test_lgb

# NOTE(review): w and t were selected on these same OOF predictions, so this
# accuracy is an in-sample figure for the blend parameters.
acc_blend = accuracy_score(y, (oof_blend >= t).astype(int))
acc_blend, w, t


In [None]:
# Threshold the blended test probabilities into 0/1 labels.
pred_test = (test_blend >= t).astype(int)

# Passenger ids come from the raw test frame, in its original row order.
submission = pd.DataFrame({
    "PassengerId": test[id_col].values,
    "Survived": pred_test
})

# Write the CSV without the DataFrame index; the path is displayed last.
out_path = f"{WORKDIR}/submission.csv"
submission.to_csv(out_path, index=False)

out_path
