In [2]:


import re
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree


# Location of the training CSV and the name of the label column.
DATA_PATH = Path("/content/train.csv")
TARGET = "Survived"

# Read the table once, then separate the label vector from the feature frame.
df = pd.read_csv(DATA_PATH)
X_raw = df.drop(columns=[TARGET])
y = df[TARGET].values


class TitanicFeatureBuilder(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds ``Title``, ``FamilySize`` and ``IsAlone``.

    All original columns are kept; the downstream ColumnTransformer selects
    what it needs and drops the rest.

    Expects ``X`` to be a DataFrame with ``Name``, ``SibSp`` and ``Parch``
    columns (these are the only columns read here).
    """

    # Raw title -> coarse group. Titles not listed here are left unchanged.
    _TITLE_GROUPS = {
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
        "Lady": "Royal", "Countess": "Royal", "Sir": "Royal",
        "Jonkheer": "Royal", "Don": "Royal", "Dona": "Royal",
        "Dr": "Officer", "Rev": "Officer", "Col": "Officer",
        "Major": "Officer", "Capt": "Officer",
    }

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    @staticmethod
    def _extract_title(name):
        """Return the text between the first comma and the next period of ``name``.

        Returns the string ``"None"`` when the pattern is not found (this
        includes missing values, which are stringified before matching).
        """
        m = re.search(r",\s*([^\.]+)\.", str(name))
        if not m:
            return "None"
        title = m.group(1).strip()
        # A name such as "Rothes, the Countess. of (...)" yields "the Countess",
        # which would never match the "Countess" key below. Drop the article so
        # the title is grouped like the others.
        if title.lower().startswith("the "):
            title = title[4:].strip()
        return title

    def transform(self, X):
        """Return a copy of ``X`` with the three engineered columns appended.

        - ``Title``: title parsed from ``Name`` and collapsed via ``_TITLE_GROUPS``.
        - ``FamilySize``: ``SibSp + Parch + 1`` with missing counts treated as 0.
        - ``IsAlone``: 1 when ``FamilySize`` equals 1, else 0.
        """
        X = X.copy()  # never mutate the caller's frame

        title = X["Name"].map(self._extract_title)
        X["Title"] = title.replace(self._TITLE_GROUPS)

        X["FamilySize"] = X["SibSp"].fillna(0) + X["Parch"].fillna(0) + 1
        X["IsAlone"] = (X["FamilySize"] == 1).astype(int)

        return X

# Single feature-engineering step shared by the pipelines built below.
feat_builder = TitanicFeatureBuilder()

# Columns routed to the numeric and categorical branches of the preprocessor.
numeric_features = ["Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone"]
categorical_features = ["Pclass", "Sex", "Embarked", "Title"]


def _numeric_pipeline(imputer):
    """Chain ``imputer`` with a StandardScaler that scales without centering."""
    return Pipeline(
        [
            ("imputer", imputer),
            ("scaler", StandardScaler(with_mean=False)),
        ]
    )


# Two numeric variants that differ only in how missing values are filled.
numeric_transformer_median = _numeric_pipeline(SimpleImputer(strategy="median"))
numeric_transformer_knn = _numeric_pipeline(
    KNNImputer(n_neighbors=5, weights="uniform")
)

# Categoricals: fill with the mode, then one-hot encode into a dense array;
# categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

def make_preprocessor(numeric_transformer):
    """Build a ColumnTransformer around the given numeric transformer.

    ``numeric_transformer`` is applied to ``numeric_features``; the module-level
    ``categorical_transformer`` is applied to ``categorical_features``. Every
    other column is dropped.
    """
    branches = [
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
    return ColumnTransformer(transformers=branches, remainder="drop")

# Baseline pipeline. Every grid entry below supplies its own "preprocess"
# step, so the one given here only acts as the default.
pipe = Pipeline(
    [
        ("feat", feat_builder),
        ("preprocess", make_preprocessor(numeric_transformer_median)),
        ("clf", DecisionTreeClassifier(random_state=42)),
    ]
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Tree hyper-parameters searched identically under both imputation strategies.
_tree_search_space = {
    "clf__criterion": ["gini", "entropy", "log_loss"],
    "clf__max_depth": [3, 5, 6, 7, 9, 11],
    "clf__min_samples_split": [2, 5, 10],
    "clf__min_samples_leaf": [1, 2, 4],
}

# One sub-grid per numeric imputation variant (median first, then KNN).
param_grid = [
    {"preprocess": [make_preprocessor(numeric_variant)], **_tree_search_space}
    for numeric_variant in (numeric_transformer_median, numeric_transformer_knn)
]

grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="accuracy",
    cv=cv,
    refit=True,
    return_train_score=True,
    n_jobs=-1,
    verbose=1,
)

# Run the search over every candidate in param_grid.
grid.fit(X_raw, y)

# Keep handles on the winning pipeline, its parameters and its mean CV score.
best_model, best_params, best_cv = (
    grid.best_estimator_,
    grid.best_params_,
    grid.best_score_,
)

print("\n=== Decision Tree (tuned) ===")
print(f"Best 5-fold CV accuracy: {best_cv:.4f}")
print("Best params:")
for param_name, param_value in best_params.items():
    print(f"  {param_name}: {param_value}")

# Pull the fitted tree and its preprocessor out of the winning pipeline.
clf = best_model.named_steps["clf"]
preprocessor = best_model.named_steps["preprocess"]

try:
    feat_names = preprocessor.get_feature_names_out()
except AttributeError:
    # Fallback when get_feature_names_out is unavailable: numeric columns keep
    # their names, categorical names come from the fitted one-hot encoder.
    num_names = numeric_features
    cat_ohe = preprocessor.named_transformers_["cat"].named_steps["ohe"]
    cat_names = cat_ohe.get_feature_names_out(categorical_features).tolist()
    feat_names = np.array(list(num_names) + cat_names)

# Explicit figure/axes so the title, layout, save and close all target this
# figure rather than whatever pyplot currently considers "current".
fig, ax = plt.subplots(figsize=(22, 12))
plot_tree(
    clf,
    # Pass a plain list: some scikit-learn releases reject array inputs for
    # feature_names. feat_names itself is left as an array.
    feature_names=list(feat_names),
    class_names=["Not Survived", "Survived"],
    filled=True,
    rounded=True,
    fontsize=9,
    ax=ax,
)
ax.set_title("Titanic — Tuned Decision Tree")
fig.tight_layout()
fig.savefig("titanic_decision_tree.png", dpi=220)
plt.close(fig)  # figure is written to disk only, not shown inline
print("Saved tree plot to titanic_decision_tree.png")
print(f"Tree depth: {clf.get_depth()} | Leaves: {clf.get_n_leaves()}")


Fitting 5 folds for each of 324 candidates, totalling 1620 fits

=== Decision Tree (tuned) ===
Best 5-fold CV accuracy: 0.8283
Best params:
  clf__criterion: entropy
  clf__max_depth: 3
  clf__min_samples_leaf: 1
  clf__min_samples_split: 2
  preprocess: ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median')),
                                                 ('scaler',
                                                  StandardScaler(with_mean=False))]),
                                 ['Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
                                  'IsAlone']),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ohe',
                                   

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import randint, uniform

# Forest used during the search: 200 trees, bootstrap sampling, no OOB scoring.
base_rf = RandomForestClassifier(
    n_estimators=200,
    bootstrap=True,
    oob_score=False,
    random_state=42,
    n_jobs=-1,
)

# Same feature builder and the preprocessor chosen by the decision-tree grid
# search, with the forest as the final step.
eval_pipe_rf = Pipeline(
    steps=[
        ("feat", feat_builder),
        ("preprocess", best_model.named_steps["preprocess"]),
        ("clf", base_rf),
    ]
)

cv3 = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Search space: lists are sampled uniformly; randint(a, b) draws integers in [a, b).
param_dist = dict(
    clf__max_depth=[None, 10],
    clf__min_samples_split=randint(2, 11),
    clf__min_samples_leaf=randint(1, 4),
    clf__max_features=["sqrt", 0.5],
    clf__class_weight=[None, "balanced"],
)

rs = RandomizedSearchCV(
    eval_pipe_rf,
    param_distributions=param_dist,
    n_iter=40,
    scoring="accuracy",
    cv=cv3,
    refit=True,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
rs.fit(X_raw, y)

print("Fast tuned RF (3-fold) best:", rs.best_score_, rs.best_params_)

from sklearn.base import clone
from sklearn.model_selection import cross_val_score

# Scale the winning configuration up to 800 trees with OOB scoring enabled.
# This is done on an unfitted clone: calling set_params on rs.best_estimator_
# directly would leave a fitted 200-tree forest whose parameters claim 800
# trees and oob_score=True, without ever refitting it.
best_rf = clone(rs.best_estimator_).set_params(
    clf__n_estimators=800,
    clf__oob_score=True,
)

# NOTE(review): cv=5 is an integer here, not the shuffled StratifiedKFold used
# for the decision-tree search — confirm whether the two scores are meant to be
# computed on the same folds.
scores = cross_val_score(best_rf, X_raw, y, cv=5, scoring="accuracy", n_jobs=-1)
print(f"Final RF 5-fold: {scores.mean():.4f} ± {scores.std():.4f}")

# Fit on all rows so best_rf is a fitted pipeline whose forest actually has the
# configured 800 trees (and exposes oob_score_ on its "clf" step).
best_rf.fit(X_raw, y)


Fitting 3 folds for each of 40 candidates, totalling 120 fits
Fast tuned RF (3-fold) best: 0.8451178451178452 {'clf__class_weight': None, 'clf__max_depth': None, 'clf__max_features': 0.5, 'clf__min_samples_leaf': 3, 'clf__min_samples_split': 8}
Final RF 5-fold: 0.8406 ± 0.0240
