# Setup and global variables

In [None]:
from pathlib import Path

import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from IPython.display import display, HTML
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

from src.prioritization import *

In [None]:
# Choose the configuration environment. The variable is set before `config`
# is imported, matching the original statement order (presumably load_config
# reads CONFIG_ENV -- TODO confirm). Flip USE_PRODUCTION to switch environments.
USE_PRODUCTION = False
os.environ["CONFIG_ENV"] = "production" if USE_PRODUCTION else "debug"

from config import load_config
config = load_config()

_defaults = config['DEFAULTS']
RESOLUTION = _defaults['resolution']
SEED = _defaults['random_seed']

_paths = config['PATHS']

# input data
BENCHMARK_PATH = _paths['benchmark_dataset']
STORAGE_PATH = _paths['storage']

# output data
IMAGE_DIR = _paths['images'] / 'modelling'

# Create the image output directory up front; a no-op when it already exists.
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

In [None]:
# Lookup tables; the first CSV column becomes the index so rows can be
# fetched by label with .loc further down.
items = pd.read_csv(STORAGE_PATH / 'items.csv', index_col=0)
defects = pd.read_csv(STORAGE_PATH / 'defects.csv', index_col=0)

# Benchmark rows; read without index_col, so the frame keeps a default RangeIndex.
df = pd.read_csv(BENCHMARK_PATH / 'benchmark_dataset.csv')

In [None]:
def _columns_tagged(tag):
    """List the columns of `df` whose name ends with ``(<tag>)``."""
    return [name for name in df.columns if name.endswith(f'({tag})')]


left_discrete_features = _columns_tagged('Left Discrete')
right_discrete_features = _columns_tagged('Right Discrete')
left_continuous_features = _columns_tagged('Left Continuous')
right_continuous_features = _columns_tagged('Right Continuous')

# Fail fast if the benchmark file is missing any of the four column families.
_feature_sets = (
    left_discrete_features,
    right_discrete_features,
    left_continuous_features,
    right_continuous_features,
)
if not all(_feature_sets):
    raise ValueError('Some of the feature sets are empty')

***

# Feature engineering

In [None]:
def remove_suffix(col):
    """Return the original column name with its trailing `` (...)`` tag removed.

    Only the last `` (`` occurrence is treated as the start of the tag, so a
    base name that itself contains parentheses is preserved. A name without
    any `` (`` is returned unchanged.

    Args:
        col: Column name such as ``'Severity (Left Discrete)'``.

    Returns:
        The column name without the trailing tag, e.g. ``'Severity'``.
    """
    base, separator, _ = col.rpartition(' (')
    # rpartition yields an empty separator when ' (' is absent; slicing with
    # str.find's -1 in that case would silently drop the last character.
    return base if separator else col

In [None]:
# Strip the side/type tag from each column family so that the left and right
# frames share column names and can be combined column by column.
(
    left_discrete_values,
    right_discrete_values,
    left_continuous_values,
    right_continuous_values,
) = (
    df[feature_names].rename(columns=remove_suffix)
    for feature_names in (
        left_discrete_features,
        right_discrete_features,
        left_continuous_features,
        right_continuous_features,
    )
)

## Difference features

In [None]:
# Left-minus-right difference for each shared column, tagged by feature type.
discrete_diff = (left_discrete_values - right_discrete_values).add_suffix(' (Discrete Diff)')
continuous_diff = (left_continuous_values - right_continuous_values).add_suffix(' (Continuous Diff)')

## Binary flags

In [None]:
# Boolean flags: True where the left value is strictly greater than the right one.
discrete_is_larger = (left_discrete_values > right_discrete_values).add_suffix(' (Discrete >)')
continuous_is_larger = (left_continuous_values > right_continuous_values).add_suffix(' (Continuous >)')

In [None]:
# Boolean flags for left discrete values equal to 5 or 1
# (presumably the two ends of the rating scale -- TODO confirm).
left_is_extreme_max = left_discrete_values.eq(5).add_suffix(' (Left Max)')
left_is_extreme_min = left_discrete_values.eq(1).add_suffix(' (Left Min)')

## Item and defect metadata

In [None]:
# Look up the defect type of each row's left and right entries. The index is
# reset so the resulting series line up positionally with the rows of df.
_defect_types = defects['defect type']
left_type = _defect_types.loc[df['left']].rename('left').reset_index(drop=True)
right_type = _defect_types.loc[df['right']].rename('right').reset_index(drop=True)

# Topic of the item referenced by each row, aligned the same way.
item_topic = items['topic'].loc[df['item']].rename('item').reset_index(drop=True)

In [None]:
# One-hot encode the three categorical metadata columns side by side.
metadata_encoder = OneHotEncoder()

_categorical_frame = pd.concat([left_type, right_type, item_topic], axis=1)
_encoded = metadata_encoder.fit_transform(_categorical_frame)

# The encoder output is densified with .toarray() and labelled with the
# encoder's generated feature names.
metadata = pd.DataFrame(
    _encoded.toarray(),
    columns=metadata_encoder.get_feature_names_out(),
)

# Feature groups and combined dataframe

In [None]:
# Raw inputs followed by every engineered block, concatenated column-wise.
_feature_blocks = [
    df[left_discrete_features],
    df[right_discrete_features],
    discrete_diff,
    df[left_continuous_features],
    df[right_continuous_features],
    continuous_diff,
    discrete_is_larger,
    continuous_is_larger,
    left_is_extreme_max,
    left_is_extreme_min,
    metadata,
]
engineered_df = pd.concat(_feature_blocks, axis=1)

print("Final engineered dataframe shape:", engineered_df.shape)

In [None]:
# Named column subsets of engineered_df; each one is evaluated on its own in
# the ablation, with "All Features" covering every column.
feature_groups = {
    "Left Discrete": left_discrete_features,
    "Right Discrete": right_discrete_features,
    "Discrete Diff": list(discrete_diff.columns),
    "Left+Right Continuous": [*left_continuous_features, *right_continuous_features],
    "Continuous Diff": list(continuous_diff.columns),
    "Derived Rules": [*discrete_is_larger.columns, *continuous_is_larger.columns],
    "Additional Rules": [*left_is_extreme_max.columns, *left_is_extreme_min.columns],
    "Metadata": list(metadata.columns),
    "All Features": list(engineered_df.columns),
}

***

# Training loop

In [None]:
# Classifiers compared in the ablation; all are seeded with SEED.
models = {
    "Random Forest": RandomForestClassifier(random_state=SEED, max_depth=3),
    "Gradient Boosting": GradientBoostingClassifier(max_depth=3, random_state=SEED),
    "Logistic Regression": LogisticRegression(random_state=SEED),
}

# Target: the 'left won' column of the benchmark dataset.
y = df['left won']

# Leave-one-group-out CV: each fold holds out every row of one submission id.
groups = df['submission id']
logo = LeaveOneGroupOut()

results = []           # one metrics row per (fold, model, feature group)
fold_predictions = {}  # (fold, model, feature group) -> per-row predictions

for fold_idx, (train_idx, test_idx) in tqdm(enumerate(logo.split(engineered_df, y, groups=groups)), desc="Iterating over folds", total=groups.nunique()):
    X_train_full, X_test_full = engineered_df.iloc[train_idx], engineered_df.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # ROC AUC is undefined when the held-out group contains a single class.
    # Record NaN for such folds instead of letting roc_auc_score abort the
    # whole run; the mean/std aggregation downstream skips NaN by default.
    auc_is_defined = y_test.nunique() > 1

    for model_name, model in models.items():
        for group_name, cols in feature_groups.items():
            X_train = X_train_full[cols]
            X_test = X_test_full[cols]

            # fit() retrains the estimator from scratch, so reusing the same
            # instance across folds and feature groups carries no state over.
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            if hasattr(model, "predict_proba"):
                y_proba = model.predict_proba(X_test)[:, 1]
            else:
                # fallback for models without predict_proba
                y_proba = y_pred

            acc = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_proba) if auc_is_defined else np.nan

            results.append({
                "fold": fold_idx,
                "model": model_name,
                "group": group_name,
                "accuracy": acc,
                "auc": auc
            })

            # Keep the raw predictions, indexed like the held-out rows of df.
            fold_predictions[(fold_idx, model_name, group_name)] = pd.DataFrame({
                "y_true": y_test.values,
                "y_pred": y_pred,
                "y_proba": y_proba
            }, index=y_test.index)

results_df = pd.DataFrame(results)

In [None]:
# Rebuild the per-fold results table from the collected rows.
results_df = pd.DataFrame(results)

# Mean and standard deviation of each metric per (model, feature group) pair.
_metric_columns = ["accuracy", "auc"]
_per_combination = results_df.groupby(["model", "group"])[_metric_columns]
summary = _per_combination.agg(["mean", "std"]).reset_index()
display(summary)


***

# Results

In [None]:
# ***
# Plot results
# ***

# One bar chart per metric. Bars show the mean over all rows of results_df for
# a feature group (every fold and every model); error bars show the standard
# deviation. `errorbar='sd'` replaces the deprecated `ci='sd'` (seaborn >= 0.12).
for _metric, _label in (("accuracy", "Accuracy"), ("auc", "AUC")):
    plt.figure(figsize=(8, 5))
    sns.barplot(data=results_df, x='group', y=_metric, errorbar='sd')
    plt.title(f"Ablation Study: {_label} by Feature Group")
    plt.ylabel(_label)
    plt.xlabel("Feature Group")
    # Group names are long; rotate them so neighbouring tick labels do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
# ***
# Optional: Inspect feature importances per group (for interpretability)
# ***

for group_name, cols in feature_groups.items():
    print(f"\n=== Feature importances: {group_name} ===")

    # Retrain on the full dataset for interpretability. Features are taken
    # from engineered_df: most groups contain derived columns (differences,
    # flags, one-hot metadata) that do not exist in the raw df, so indexing
    # df with them raises KeyError.
    X_full = engineered_df[cols]
    # Seeded with SEED, like the models in the training loop.
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
    model.fit(X_full, y)

    importances = pd.DataFrame({
        "feature": cols,
        "importance": model.feature_importances_
    }).sort_values("importance", ascending=False)

    # Show the ten most important features of the group.
    display(importances.head(10))
