# Titanic — CV v3 (Pipeline + OHE + Ensemble, Dense-safe)

Writes `/kaggle/working/submission.csv`


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, log_loss

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

import lightgbm as lgb

# Single seed reused by the CV splitter, the models and the weight search.
SEED = 42
# Number of stratified folds used when producing out-of-fold predictions.
N_SPLITS = 10

# Kaggle input files and the writable output directory.
TRAIN_PATH = "/kaggle/input/titanic/train.csv"
TEST_PATH  = "/kaggle/input/titanic/test.csv"
WORKDIR = "/kaggle/working"

# Seeded Generator; the blend-weight search draws all its randomness from it.
rng = np.random.default_rng(SEED)
# Also seed NumPy's legacy global RNG for anything that still relies on it.
np.random.seed(SEED)
# Let wide frames display all of their columns when inspected.
pd.set_option("display.max_columns", 200)


In [None]:
# Read the train and test tables from the configured paths.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH))

# Notebook display: (rows, columns) of each frame.
train.shape, test.shape


In [None]:
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with engineered passenger features added.

    The input frame is left untouched. It must provide the columns
    ``Pclass``, ``Name``, ``Sex``, ``Age``, ``SibSp``, ``Parch``, ``Ticket``,
    ``Fare`` and ``Cabin``.

    Columns written:
      * ``Pclass``       - cast to string so it can be treated as a category.
      * ``Title``        - honorific taken from ``Name``; uncommon or missing
                           titles are collapsed into ``"Rare"``.
      * ``FamilySize``   - ``SibSp + Parch + 1``.
      * ``IsAlone``      - 1 when ``FamilySize == 1``.
      * ``CabinKnown``   - 1 when ``Cabin`` is not missing.
      * ``Deck``         - first character of ``Cabin``; ``"U"`` when missing.
      * ``TicketPrefix`` - ``Ticket`` with digits and whitespace removed;
                           ``"NUM"`` when nothing remains.
      * ``FareLog``      - ``log1p(Fare)``.
      * ``NameLen``      - length of ``Name``.
      * ``SexPclass``    - ``Sex`` and ``Pclass`` joined with ``"_"``.
      * ``IsChild``      - 1 when ``Age <= 14`` (missing ages yield 0).
    """
    df = df.copy()

    df["Pclass"] = df["Pclass"].astype(str)

    # The honorific is the first word that follows a space and ends in ".".
    title = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)
    rare_titles = (
        "Lady", "Countess", "Capt", "Col", "Don", "Dr",
        "Major", "Rev", "Sir", "Jonkheer", "Dona",
    )
    title_map = {"Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs"}
    title_map.update(dict.fromkeys(rare_titles, "Rare"))
    df["Title"] = title.replace(title_map).fillna("Rare")

    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1
    df["IsAlone"] = (df["FamilySize"] == 1).astype(int)

    cabin = df["Cabin"]
    has_cabin = cabin.notna()
    df["CabinKnown"] = has_cabin.astype(int)
    # Mask on the real missing-value flag instead of relying on NaN being
    # stringified to "nan": that trick breaks for pd.NA ("<NA>") and would
    # also wipe out a genuine value starting with "n". astype(str) is kept so
    # an all-missing (float) column still supports the .str accessor.
    df["Deck"] = cabin.astype(str).str[0].where(has_cabin).fillna("U")

    # One pass removes digits and whitespace alike, so no strip() is needed.
    ticket_prefix = df["Ticket"].astype(str).str.replace(r"[\d\s]+", "", regex=True)
    df["TicketPrefix"] = ticket_prefix.replace("", "NUM")

    df["FareLog"] = np.log1p(df["Fare"])

    df["NameLen"] = df["Name"].astype(str).str.len()
    df["SexPclass"] = df["Sex"].astype(str) + "_" + df["Pclass"].astype(str)

    # Comparisons against a missing Age are False, so unknown ages map to 0.
    df["IsChild"] = (df["Age"] <= 14).astype(int)

    return df

# Engineer features on both frames with the same function.
train_fe = add_features(train)
test_fe  = add_features(test)

# Binary target as a plain NumPy array (indexed positionally by the CV code).
y = train_fe["Survived"].astype(int).to_numpy()

# Columns that must not reach the models: the target, the raw text columns
# already summarised by engineered features, and PassengerId. The latter is
# only a row identifier (it is copied into the submission from `test`), so
# leaving it in would hand every model a meaningless numeric feature.
drop_cols = ["Survived", "PassengerId", "Name", "Ticket", "Cabin"]
X = train_fe.drop(columns=drop_cols)
# The test frame has no target column, so skip it there.
X_test = test_fe.drop(columns=[c for c in drop_cols if c != "Survived"])

X.shape, X_test.shape


In [None]:
# Categorical feature names; every other column of X is handled as numeric.
cat_cols = ["Pclass", "Sex", "Embarked", "Title", "Deck", "TicketPrefix", "SexPclass"]
num_cols = [col for col in X.columns if col not in cat_cols]

# Give the categoricals a plain object dtype in both frames.
for frame in (X, X_test):
    for col in cat_cols:
        frame[col] = frame[col].astype("object")

# Numeric branch: median imputation only.
num_pipe = Pipeline([("imp", SimpleImputer(strategy="median"))])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
cat_pipe = Pipeline([
    ("imp", SimpleImputer(strategy="most_frequent")),
    ("ohe", OneHotEncoder(handle_unknown="ignore")),
])

pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ],
    remainder="drop",
)

pre


In [None]:
class DenseTransformer:
    """Pipeline step that turns a possibly sparse matrix into a dense array.

    The step is stateless and has no hyper-parameters. ``get_params`` and
    ``set_params`` are implemented so that tools which copy or inspect
    estimators through that protocol (for example ``sklearn.base.clone``)
    accept pipelines containing this step.
    """

    def fit(self, X, y=None):
        """Learn nothing and return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Return ``X.toarray()`` if ``X`` offers it, else ``np.asarray(X)``."""
        return X.toarray() if hasattr(X, "toarray") else np.asarray(X)

    def get_params(self, deep=True):
        """Return the (empty) parameter mapping; ``deep`` is accepted and ignored."""
        return {}

    def set_params(self, **params):
        """Accept an empty update and return ``self``.

        Raises:
            ValueError: if any parameter is given, since none exist.
        """
        if params:
            unknown = ", ".join(sorted(params))
            raise ValueError(
                f"Invalid parameter(s) for DenseTransformer: {unknown}"
            )
        return self

def make_pipeline(model, needs_dense=False):
    """Build an independent preprocessing + model pipeline.

    Args:
        model: Unfitted estimator used as the final step. It is cloned, so
            the caller's instance is never fitted or mutated.
        needs_dense: When True, insert a ``DenseTransformer`` between the
            preprocessor and the model for estimators that need dense input.

    Returns:
        A ``Pipeline`` with steps ``"pre"``, optionally ``"dense"``, and
        ``"model"``.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of the file.
    from sklearn.base import clone

    # Clone the shared preprocessor and the model so that pipelines built for
    # different folds do not alias (and silently refit) the same objects.
    steps = [("pre", clone(pre))]
    if needs_dense:
        steps.append(("dense", DenseTransformer()))
    steps.append(("model", clone(model)))
    return Pipeline(steps)

def oof_predict_proba(model, X_all, y_all, X_te, needs_dense=False):
    """Compute out-of-fold and fold-averaged test probabilities for ``model``.

    A shuffled ``StratifiedKFold`` (``N_SPLITS`` folds, seeded with ``SEED``)
    is run over the training data. For each fold a pipeline from
    ``make_pipeline`` is fitted on the training part and used to predict the
    positive-class probability for the held-out rows and for ``X_te``.

    Args:
        model: Estimator passed to ``make_pipeline``.
        X_all: Training features; must support ``.iloc`` row selection.
        y_all: Training labels (any array-like; converted to a NumPy array).
        X_te: Test features scored by every fold's pipeline.
        needs_dense: Forwarded to ``make_pipeline``.

    Returns:
        ``(oof, te)``: ``oof[i]`` is the held-out probability for training
        row ``i``; ``te`` is the mean test probability across folds.
    """
    # Fold indices are positional; converting up front keeps `y_all[trn]`
    # positional even if a pandas Series with a custom index is passed.
    y_all = np.asarray(y_all)

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
    n_folds = skf.get_n_splits()
    oof = np.zeros(len(X_all), dtype=float)
    te = np.zeros(len(X_te), dtype=float)

    for trn, val in skf.split(X_all, y_all):
        clf = make_pipeline(model, needs_dense=needs_dense)
        clf.fit(X_all.iloc[trn], y_all[trn])

        oof[val] = clf.predict_proba(X_all.iloc[val])[:, 1]
        # Each fold contributes an equal share to the averaged test score.
        te += clf.predict_proba(X_te)[:, 1] / n_folds

    return oof, te

def tune_threshold(y_true, proba):
    """Scan cut-offs in [0.05, 0.95] and return the most accurate one.

    181 evenly spaced thresholds are tried; predictions are ``proba >= t``.
    On ties the lowest threshold wins because only strict improvements
    replace the current best.

    Returns:
        ``(threshold, accuracy)`` as Python floats.
    """
    # Start from the conventional 0.5 with an accuracy no scan can fail to beat.
    winner = (0.5, -1.0)
    for cutoff in np.linspace(0.05, 0.95, 181):
        labels = (proba >= cutoff).astype(int)
        acc = accuracy_score(y_true, labels)
        if acc > winner[1]:
            winner = (float(cutoff), acc)
    return winner[0], float(winner[1])

def safe_logloss(y_true, proba):
    """Return the log loss after clipping ``proba`` into [1e-6, 1 - 1e-6]."""
    eps = 1e-6
    clipped = np.clip(proba, eps, 1 - eps)
    return float(log_loss(y_true, clipped))


In [None]:
# Base model 1: L2 logistic regression on the one-hot encoded features.
m_lr = LogisticRegression(
    C=2.0,
    solver="liblinear",
    max_iter=2000,
    random_state=SEED
)

# Base model 2: scikit-learn histogram gradient boosting.
m_hgb = HistGradientBoostingClassifier(
    max_depth=6,
    learning_rate=0.05,
    max_iter=800,
    random_state=SEED
)

# Base model 3: LightGBM gradient boosting.
m_lgb = lgb.LGBMClassifier(
    n_estimators=5000,
    learning_rate=0.02,
    num_leaves=64,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_lambda=1.0,
    random_state=SEED,
    n_jobs=-1
)

# Out-of-fold and fold-averaged test probabilities per model. Only the
# histogram gradient boosting model is fed a densified matrix.
oof_lr, te_lr = oof_predict_proba(m_lr, X, y, X_test, needs_dense=False)
oof_hgb, te_hgb = oof_predict_proba(m_hgb, X, y, X_test, needs_dense=True)
oof_lgb, te_lgb = oof_predict_proba(m_lgb, X, y, X_test, needs_dense=False)

# Summarise each model on its OOF predictions. The threshold scan is run
# once per model and unpacked, instead of once for "acc" and again for "t".
scores = {}
for _name, _oof in (("lr", oof_lr), ("hgb", oof_hgb), ("lgb", oof_lgb)):
    _thr, _acc = tune_threshold(y, _oof)
    scores[_name] = {"acc": _acc, "t": _thr, "logloss": safe_logloss(y, _oof)}

scores


In [None]:
# Column-stack the per-model probabilities: one column per base model.
P = np.column_stack([oof_lr, oof_hgb, oof_lgb])
T = np.column_stack([te_lr, te_hgb, te_lgb])
# Derive the model count from the stack so adding or removing a model only
# requires editing the two lines above.
n_models = P.shape[1]

# Baseline: equal weights.
best_w = np.ones(n_models) / n_models
best_s = safe_logloss(y, P @ best_w)

# Stage 1: global random search over the weight simplex, minimising the
# OOF log loss of the blended probabilities.
for _ in range(15000):
    w = rng.dirichlet(np.ones(n_models))
    s = safe_logloss(y, P @ w)
    if s < best_s:
        best_s = s
        best_w = w

# Stage 2: local refinement. Shift a small amount of weight between two
# distinct models, clip at zero, renormalise, and keep improvements only.
step = 0.05
for _ in range(6000):
    i = int(rng.integers(0, n_models))
    j = int(rng.integers(0, n_models))
    if i == j:
        continue
    w = best_w.copy()
    delta = float(rng.uniform(-step, step))
    w[i] = max(0.0, w[i] + delta)
    w[j] = max(0.0, w[j] - delta)
    ssum = w.sum()
    if ssum <= 0:
        continue
    w /= ssum
    s = safe_logloss(y, P @ w)
    if s < best_s:
        best_s = s
        best_w = w

blend_oof = P @ best_w
blend_test = T @ best_w

# NOTE: the weights and the threshold are both tuned on the same OOF
# predictions, so a_blend is an optimistic estimate.
t_blend, a_blend = tune_threshold(y, blend_oof)
best_w, best_s, a_blend, t_blend


In [None]:
# Turn blended test probabilities into 0/1 labels with the tuned threshold.
pred_test = (blend_test >= t_blend).astype(int)

# Two-column submission: passenger ids in test-file order plus predictions.
submission_columns = {
    "PassengerId": test["PassengerId"].values,
    "Survived": pred_test,
}
submission = pd.DataFrame(submission_columns)

out_path = f"{WORKDIR}/submission.csv"
submission.to_csv(out_path, index=False)

out_path
