In [92]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import yaml

from xgboost import XGBClassifier

from sklearn import set_config

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import make_scorer

from sklearn.base import BaseEstimator, TransformerMixin

from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

set_config(display="diagram")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [93]:
target = 'Survived'
random_state = 5646

### 1. Import data

In [94]:
df = pd.read_csv("../data/raw/train.csv")
X, y = (
    df.drop(target, axis=1),
    df[[target]],
)

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state, test_size=0.3, stratify=y)

### 2. Prepare data

In [96]:
class _BaseTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None

    def fit(self, X=None, y=None):
        self.feature_names_in_ = X.columns
        return self

    def transform(self, X=None):
        return self

class CountAcrossFeatures(_BaseTransformer):
    def __init__(self, name="Count", offset=0):
        self.name = name
        self.offset = offset

    def transform(self, X):
        X_out = X[self.feature_names_in_].apply(lambda x: x.sum(), axis=1) + self.offset
        return np.array(X_out).reshape(-1,1)

    def get_feature_names_out(self, input_variables=None):
        return [self.name.lower()]

class DiscretizeFeatures(_BaseTransformer):
    def __init__(self, name="Discretize", quantiles=4, labels=None):
        self.name = name
        self.quantiles = quantiles
        self.labels = labels

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns
        if self.labels is None:
            self.labels = np.arange(self.quantiles)
        return self

    def transform(self, X, y=None):
        flattened_X = X[self.feature_names_in_].values.flatten()
        X_out, self.bins = pd.qcut(flattened_X, self.quantiles, labels=self.labels, retbins=True)
        return np.array(X_out).reshape(-1,1)

    def get_feature_names_out(self, input_variables=None):
        return [self.name.lower()]

class BooleanFeatures(_BaseTransformer):
    def __init__(self, name="Boolean"):
        self.name = name

    def transform(self, X, y=None):
        X_out = X[self.feature_names_in_].apply(lambda x: x.sum() == 0, axis=1).astype(int)
        return np.array(X_out).reshape(-1,1)

    def get_feature_names_out(self, input_variables=None):
        return [self.name.lower()]

class ExtractTitles(_BaseTransformer):
    def __init__(self, name="Title"):
        self.name = name

    def transform(self, X, y=None):
        X_out = X[self.feature_names_in_].apply(lambda x: x.str.extract(' ([A-Za-z]+)\.', expand=False))
        return np.array(X_out)
    
    def get_feature_names_out(self, input_variables=None):        
        return [(feature + "_" + self.name).lower() for feature in self.feature_names_in_]
       

In [97]:
# Perform imputation
imputer = make_column_transformer(
    (SimpleImputer(strategy="median"), ["Age"]),
    (
        make_pipeline(
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown="ignore", sparse=False),
        ),
        ["Embarked"],
    ),
)

# Build new features
new_features = make_column_transformer(
    (CountAcrossFeatures("FamilySize", offset=1), ["SibSp", "Parch"]),
    (DiscretizeFeatures("FareBand"), ["Fare"]),
    (BooleanFeatures("IsAlone"), ["SibSp", "Parch"]),
    (
        make_pipeline(ExtractTitles("Title"), OneHotEncoder(handle_unknown="ignore", sparse=False)),
        ["Name"],
    ),
    verbose_feature_names_out=False,
)

# Build pipeline
data_pipeline = make_union(imputer, new_features)
data_pipeline.fit(X_train)

In [98]:
# The model
model = XGBClassifier(n_estimators=200, eval_metric="logloss", random_state=random_state, use_label_encoder=False)

# The final pipeline
pipeline = make_pipeline(data_pipeline, model)

# Metrics to evaluate the model
scoring = ('accuracy', 'average_precision', 'neg_brier_score','roc_auc')

# Cross validation
scores = cross_validate(pipeline, X_train, y_train, cv=10, scoring=scoring)

# Mean value of the metrics
for k, v in scores.items():
    print(f"{k}: {v.mean()}")

fit_time: 0.5291356563568115
score_time: 0.028127336502075197
test_accuracy: 0.7816948284690219
test_average_precision: 0.7895978318642364
test_neg_brier_score: -0.1780402109998613
test_roc_auc: 0.8188906369183829
