### This notebook is meant to explore and compare models before selecting the one that will be used for the inference API.

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

import sys
import os

# Make the parent of the notebook's working directory importable so that the
# `src` package can be imported from this notebook.
parent_dir = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(parent_dir)
already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

from src.loaders import load_required_attributes_from_raw
from src.column_transformers import TitanicInputTransformer

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides any warnings raised while the
# grid searches run (presumably convergence / failed-fit warnings) — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold



# Load X and y from the raw training CSV through the project loader
X, y = load_required_attributes_from_raw('../data/train.csv')

# Only these columns are standardised; every other column passes through untouched
numeric_features = ['Age', 'Fare']
numeric_scaler = ColumnTransformer(
    [('num', StandardScaler(), numeric_features)],
    remainder='passthrough',
)

# Dummy levels per categorical attribute; the first level of each group is
# collected into `columns_to_drop`
dummy_groups = dict(
    Sex=['female', 'male'],
    Embarked=['C', 'Q', 'S'],
    Pclass=['Class_1', 'Class_2', 'Class_3'],
)
columns_to_drop = [levels[0] for levels in dummy_groups.values()]


def _build_pipeline(clf, scale=False):
    """
    Assemble one candidate pipeline: input transform -> optional scaler -> classifier.

    Parameters:
    - clf: unfitted classifier placed in the final 'clf' step
    - scale: when True, add a 'scaler' step holding an independent copy of `numeric_scaler`

    Returns:
    - an unfitted sklearn Pipeline
    """
    # Local import: scikit-learn is already a dependency of this notebook.
    from sklearn.base import clone

    steps = [('input_transform', TitanicInputTransformer())]
    if scale:
        # clone() gives each pipeline its own scaler, so fitting one pipeline
        # directly can never overwrite the fitted state used by another.
        steps.append(('scaler', clone(numeric_scaler)))
    steps.append(('clf', clf))
    return Pipeline(steps)


# The 'provided ...' entries are searched with an empty grid (library defaults);
# the remaining entries are tuned through `param_grids`.
pipelines = {
    'provided LinearSVC': _build_pipeline(LinearSVC(random_state=42), scale=True),
    'provided RandomForest': _build_pipeline(RandomForestClassifier(random_state=42)),
    'LinearSVC': _build_pipeline(LinearSVC(max_iter=10000, random_state=42), scale=True),
    'RandomForest': _build_pipeline(RandomForestClassifier(random_state=42)),
    'LogisticRegression': _build_pipeline(
        LogisticRegression(max_iter=1000, random_state=42), scale=True
    ),
    'XGBoost': _build_pipeline(xgb.XGBClassifier(eval_metric='logloss', random_state=42)),
}

# Hyper-parameter grids keyed by pipeline name. Classifier parameters carry the
# 'clf__' prefix because the classifier is the 'clf' step of each pipeline.
# An empty grid means "evaluate the pipeline exactly as constructed".
param_grids = {
    'provided LinearSVC': {},
    'provided RandomForest': {},
    # LinearSVC rejects loss='hinge' together with dual=False (with the default
    # l2 penalty), so the grid is split into sub-grids that contain only valid
    # combinations instead of spending fits on candidates that always fail.
    'LinearSVC': [
        {
            'clf__loss': ['hinge'],
            'clf__dual': [True],
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__fit_intercept': [True, False],
            'clf__tol': [1e-4, 1e-3, 1e-2]
        },
        {
            'clf__loss': ['squared_hinge'],
            'clf__dual': [True, False],
            'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'clf__fit_intercept': [True, False],
            'clf__tol': [1e-4, 1e-3, 1e-2]
        }
    ],
    'RandomForest': {
        'clf__n_estimators': [100, 200, 300],
        'clf__max_depth': [3, 5, 7, 10, None],
        'clf__min_samples_split': [2, 5, 10],
        'clf__min_samples_leaf': [1, 2, 4],
        'clf__max_features': ['sqrt', 'log2'],
        'clf__bootstrap': [True, False]
    },
    # One sub-grid per penalty so each penalty is only paired with solvers listed for it.
    # NOTE(review): the string 'none' is presumably only accepted by older
    # scikit-learn releases (newer ones expect None) — confirm against the
    # installed version, otherwise the last sub-grid fails on every fold.
    'LogisticRegression': [
        {'clf__penalty': ['l1'], 'clf__C': [0.01, 0.1, 1, 10], 'clf__solver': ['liblinear']},
        {'clf__penalty': ['l2'], 'clf__C': [0.01, 0.1, 1, 10], 'clf__solver': ['liblinear', 'saga', 'lbfgs']},
        {'clf__penalty': ['elasticnet'], 'clf__C': [0.01, 0.1, 1, 10], 'clf__solver': ['saga'], 'clf__l1_ratio': [0.0, 0.5, 1.0]},
        {'clf__penalty': ['none'], 'clf__solver': ['saga', 'lbfgs']}
    ],
    'XGBoost': {
        'clf__n_estimators': [100, 200],
        'clf__max_depth': [3, 5, 7],
        'clf__learning_rate': [0.01, 0.1, 0.2],
        'clf__subsample': [0.8, 1],
        'clf__colsample_bytree': [0.8, 1],
        'clf__gamma': [0, 1],
        'clf__min_child_weight': [1, 5]
    }
}

# Metrics reported for every candidate; each key is also the sklearn scorer name
scoring = {metric: metric for metric in ('accuracy', 'precision', 'recall', 'f1')}

# Shuffled, stratified 5-fold splitter with a fixed seed
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

results = []
# Every fitted search is kept by name so that later steps never depend on
# whichever model the loop happened to visit last.
fitted_searches = {}
best_f1_by_model = {}

for name, model in pipelines.items():
    print(f"Training {name} with GridSearchCV...")
    search = GridSearchCV(
        model,
        param_grids[name],
        cv=cv,
        scoring=scoring,
        refit='f1',  # the candidate with the best mean F1 is refitted on all of X, y
        n_jobs=-1,
        return_train_score=True
    )
    search.fit(X, y)
    fitted_searches[name] = search

    # Report the cross-validated metrics of the winning candidate only.
    best_idx = search.best_index_
    cv_scores = search.cv_results_
    best_f1_by_model[name] = cv_scores['mean_test_f1'][best_idx]
    results.append({
        'Model': name,
        'CV F1': cv_scores['mean_test_f1'][best_idx],
        'CV Precision': cv_scores['mean_test_precision'][best_idx],
        'CV Recall': cv_scores['mean_test_recall'][best_idx],
        'CV Accuracy': cv_scores['mean_test_accuracy'][best_idx],
        'Best Params': search.best_params_
    })

results_df = pd.DataFrame(results).round(3)
display(results_df)

# Bind `grid` to the search with the highest cross-validated F1. The notebook
# pickles `grid` as best_model.pkl, so it must be the best search rather than
# simply the last entry of `pipelines`.
best_model_name = max(best_f1_by_model, key=best_f1_by_model.get)
grid = fitted_searches[best_model_name]
print(f"Best model by CV F1: {best_model_name}")

Training provided LinearSVC with GridSearchCV...
Training provided RandomForest with GridSearchCV...
Training LinearSVC with GridSearchCV...
Training RandomForest with GridSearchCV...
Training LogisticRegression with GridSearchCV...
Training XGBoost with GridSearchCV...


Unnamed: 0,Model,CV F1,CV Precision,CV Recall,CV Accuracy,Best Params
0,provided LinearSVC,0.722,0.758,0.693,0.796,{}
1,provided RandomForest,0.768,0.789,0.749,0.826,{}
2,LinearSVC,0.728,0.805,0.667,0.809,"{'clf__C': 0.01, 'clf__dual': False, 'clf__fit..."
3,RandomForest,0.79,0.831,0.754,0.846,"{'clf__bootstrap': True, 'clf__max_depth': Non..."
4,LogisticRegression,0.733,0.796,0.681,0.81,"{'clf__C': 0.1, 'clf__penalty': 'l2', 'clf__so..."
5,XGBoost,0.797,0.849,0.751,0.853,"{'clf__colsample_bytree': 0.8, 'clf__gamma': 1..."


In [5]:
# Persist the grid-search object currently bound to `grid` so it can be
# reloaded later without re-running the searches.
import pickle

# open() does not create missing directories, so make sure the target folder
# exists before writing.
os.makedirs("../models", exist_ok=True)
with open("../models/best_model.pkl", "wb") as f:
    pickle.dump(grid, f)


In [None]:
import json
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.compose import ColumnTransformer

def _column_transformer_feature_names(ct: "ColumnTransformer"):
    """
    Return the output column names of a fitted ColumnTransformer, in output order.

    Uses the transformer's own `get_feature_names_out` when available and strips
    the '<transformer name>__' prefixes it may add, so passthrough columns are
    reported by name rather than by integer position. Falls back to walking
    `transformers_` when that method does not exist.
    """
    if hasattr(ct, "get_feature_names_out"):
        prefixes = tuple(f"{step_name}__" for step_name, _, _ in ct.transformers_)
        cleaned = []
        for raw_name in ct.get_feature_names_out():
            raw_name = str(raw_name)
            for prefix in prefixes:
                if raw_name.startswith(prefix):
                    raw_name = raw_name[len(prefix):]
                    break
            cleaned.append(raw_name)
        return cleaned

    # Fallback: collect names transformer by transformer.
    feature_names = []
    for _, transformer, cols in ct.transformers_:
        if transformer is None or (isinstance(transformer, str) and transformer == 'drop'):
            continue
        if hasattr(transformer, 'get_feature_names_out'):
            names = transformer.get_feature_names_out(cols)
        else:
            names = cols
        feature_names.extend(names)
    return feature_names


def save_importances_from_xgboost_model(model, X_train, importances_path):
    """
    Extract and save grouped feature importances from a fitted XGBoost pipeline.

    The fitted pipeline is only inspected, never re-fitted: the input transformer
    is applied with `transform` and the scaler's existing fitted state is read.

    Parameters:
    - model: fitted best_estimator_ (Pipeline with input_transform, optional scaler, and XGBClassifier)
    - X_train: data in the same format the pipeline was trained on; used only to
      recover the transformed column names when the pipeline has no scaler
    - importances_path: path to save grouped importances as JSON

    Raises:
    - ValueError: if the number of recovered feature names differs from the
      number of importances reported by the classifier
    """

    # Unpack pipeline
    input_transform = model.named_steps["input_transform"]
    scaler = model.named_steps.get("scaler")
    clf = model.named_steps["clf"]

    # Recover the names of the columns the classifier was trained on
    if scaler is not None:
        feature_names = _column_transformer_feature_names(scaler)
    else:
        X_transformed = input_transform.transform(X_train)
        feature_names = (
            X_transformed.columns.tolist()
            if isinstance(X_transformed, pd.DataFrame)
            else X_train.columns.tolist()
        )

    # Get importances from the classifier
    importances = clf.feature_importances_

    # zip() would silently truncate on a mismatch and attribute importances to
    # the wrong features, so fail loudly instead.
    if len(feature_names) != len(importances):
        raise ValueError(
            f"Recovered {len(feature_names)} feature names but the classifier "
            f"reports {len(importances)} importances"
        )

    # Map feature names to original user input keys
    reverse_mapping = {
        'Age': 'Age',
        'Fare': 'Fare',
        'SibSp': 'SibSp',
        'Parch': 'Parch',
        'male': 'Sex',
        'female': 'Sex',
        'Class_1': 'Pclass',
        'Class_2': 'Pclass',
        'Class_3': 'Pclass',
        'C': 'Embarked',
        'Q': 'Embarked',
        'S': 'Embarked'
    }

    # Features without a mapping keep their own name as the key
    grouped_importances = defaultdict(list)
    for feat, imp in zip(feature_names, importances):
        grouped_importances[reverse_mapping.get(feat, feat)].append(imp)

    # Build and export as JSON; explicit columns keep the sort valid even for an empty table
    df = pd.DataFrame(
        [
            {"User Input Key": key, "Total Importance": round(float(np.sum(imps)), 4)}
            for key, imps in grouped_importances.items()
        ],
        columns=["User Input Key", "Total Importance"],
    )

    df = df.sort_values("Total Importance", ascending=False).reset_index(drop=True)

    df.to_json(importances_path, orient="records", indent=2)



import pickle

# Reload the persisted grid-search object and the training features, then
# export the grouped importances of its best_estimator_.
with open("../models/best_model.pkl", "rb") as model_file:
    grid = pickle.load(model_file)

X, _ = load_required_attributes_from_raw('../data/train.csv')

save_importances_from_xgboost_model(
    model=grid.best_estimator_,
    X_train=X,
    importances_path="../models/feature_importance.json",
)