In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import yaml

from xgboost import XGBClassifier

from sklearn import set_config

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import make_scorer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

set_config(display="diagram")

  from pandas import MultiIndex, Int64Index


In [2]:
# Seed handed to every `random_state=` argument in this notebook
# (the train/test split and the classifier).
random_state = 5646

# Name of the label column in the raw table.
target = "Survived"

### 1. Import data

In [3]:
# Read the raw training table, then separate the predictors from the label.
df = pd.read_csv("../data/raw/train.csv")
X = df.drop(columns=target)  # every column except the label
y = df[[target]]  # double brackets: the label stays a one-column DataFrame

In [4]:
# Hold out 30% of the rows for testing. Stratifying on the label asks the
# splitter to preserve the label distribution in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=random_state,
)

### 2. Prepare data

In [5]:
def count_features(X, offset=0):
    """Add the columns of ``X`` together row by row, then add a constant.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Numeric columns to sum across.
    offset : int or float, default 0
        Constant added to every row total.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_rows, 1)`` holding ``row_sum + offset``.
    """
    # Vectorised row sum instead of a per-row Python lambda; wrapping in
    # DataFrame also lets plain 2-D arrays through unchanged.
    row_totals = pd.DataFrame(X).sum(axis=1) + offset
    return row_totals.to_numpy().reshape(-1, 1)


def discretize_feature(X, quantiles=4, labels=None):
    """Bucket one numeric feature into quantile-based bins.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        The values to bin; the input is flattened to one dimension first.
    quantiles : int or sequence of float, default 4
        Forwarded to ``pandas.qcut`` as ``q``.
    labels : sequence, optional
        Explicit bin labels. When omitted, bins are numbered ``0, 1, 2, ...``.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_values, 1)`` with the bin of each value.

    Notes
    -----
    The bin edges are computed from whatever ``X`` is passed in, on every
    call. Nothing is remembered between calls, so two different datasets
    (e.g. a training and a test split) are cut at different edges.
    """
    values = np.asarray(X).ravel()
    if labels is None:
        # Integer codes straight from qcut. duplicates="drop" merges
        # identical quantile edges, so a feature with many tied values no
        # longer raises "Bin edges must be unique"; it simply yields fewer bins.
        binned = pd.qcut(values, q=quantiles, labels=False, duplicates="drop")
    else:
        # Caller-supplied labels must match the bin count exactly, so
        # duplicate edges are still reported as an error here.
        binned = pd.qcut(values, q=quantiles, labels=labels)
    return np.asarray(binned).reshape(-1, 1)


def boolean_feature(X):
    """Flag the rows of ``X`` whose columns add up to exactly zero.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        Numeric columns to sum across.

    Returns
    -------
    numpy.ndarray
        Integer column vector of shape ``(n_rows, 1)``: ``1`` where the row
        sum equals zero, ``0`` otherwise.

    Notes
    -----
    pandas skips missing values when summing, so a row made up only of
    missing values sums to zero and is flagged with ``1``.
    """
    # Vectorised row sum instead of a per-row Python lambda; wrapping in
    # DataFrame also lets plain 2-D arrays through unchanged.
    sums_to_zero = pd.DataFrame(X).sum(axis=1).eq(0)
    return sums_to_zero.astype(int).to_numpy().reshape(-1, 1)


def extract_titles(X):
    """Pull the run of letters that sits between a space and a period.

    The pattern is applied to every column of ``X``; the first match in each
    string is kept and strings without a match become missing values.

    Parameters
    ----------
    X : pandas.DataFrame
        String column(s) to search.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(n_values, 1)`` with the extracted text.
    """
    # Raw string: the backslash reaches the regex engine as written, without
    # Python's "invalid escape sequence" warning for "\." in a normal string.
    title_pattern = r" ([A-Za-z]+)\."
    titles = X.apply(lambda column: column.str.extract(title_pattern, expand=False))
    return np.array(titles).reshape(-1, 1)


### 3. Create the pipeline

In [6]:
# Perform imputation
# Only the columns listed here are emitted by this transformer; with the
# default `remainder`, make_column_transformer drops everything else.
# NOTE(review): OneHotEncoder's `sparse` argument was renamed `sparse_output`
# in scikit-learn 1.2 - left as-is for the installed version; confirm before
# upgrading.
imputer = make_column_transformer(
    # Missing ages are replaced by the median age.
    (SimpleImputer(strategy="median"), ["Age"]),
    (
        # Missing ports are replaced by the most frequent one, then one-hot
        # encoded; categories unseen during fit encode as all zeros.
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown="ignore", sparse=False),
        ),
        ["Embarked"],
    ),
)


# Build new features
# Each step is named first, then paired with the raw columns it reads.
relatives_plus_one_step = FunctionTransformer(count_features, kw_args={"offset": 1})
fare_quartile_step = FunctionTransformer(discretize_feature, kw_args={"quantiles": 4})
no_relatives_step = FunctionTransformer(boolean_feature)
title_onehot_step = make_pipeline(
    FunctionTransformer(extract_titles),
    OneHotEncoder(handle_unknown="ignore", sparse=False),
)

new_features = make_column_transformer(
    (relatives_plus_one_step, ["SibSp", "Parch"]),  # SibSp + Parch + 1
    (fare_quartile_step, ["Fare"]),  # Fare cut into 4 quantile bins
    (no_relatives_step, ["SibSp", "Parch"]),  # 1 when SibSp + Parch == 0
    (title_onehot_step, ["Name"]),  # title parsed from Name, one-hot encoded
    verbose_feature_names_out=False,
)

# Place the imputed columns and the engineered columns side by side.
data_pipeline = make_union(imputer, new_features)

### 4. Create and evaluate a model

In [7]:
# Gradient-boosted classifier, seeded with the notebook-wide random_state
model = XGBClassifier(
    n_estimators=200,
    eval_metric="logloss",
    use_label_encoder=False,
    random_state=random_state,
)

# Feature preparation followed by the classifier, as one estimator
pipeline = make_pipeline(data_pipeline, model)

# Scorers computed on every cross-validation fold
scoring = ("accuracy", "average_precision", "neg_brier_score", "roc_auc", "f1")

# 10-fold cross-validation on the training partition only
scores = cross_validate(pipeline, X_train, y_train, scoring=scoring, cv=10)

# Report the across-fold average of every entry returned by cross_validate
# (fit/score timings as well as the metrics)
for metric_name, fold_values in scores.items():
    print(f"{metric_name}: {fold_values.mean()}")

fit_time: 0.4658080816268921
score_time: 0.0254866361618042
test_accuracy: 0.7816948284690219
test_average_precision: 0.7895978318642364
test_neg_brier_score: -0.1780402109998613
test_roc_auc: 0.8188906369183829
test_f1: 0.704129565654146
