# Setup and global variables

In [None]:
from pathlib import Path

import os
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from IPython.display import display, HTML
from tqdm import tqdm

from src.prioritization import *

In [None]:
# Select the "debug" configuration profile. The variable is set before the
# `config` module is imported (presumably load_config reads it — confirm).
os.environ["CONFIG_ENV"] = "debug"

from config import load_config

config = load_config()

# Look each config section up once and pull the individual settings from it.
_defaults = config['DEFAULTS']
_paths = config['PATHS']

SEED = _defaults['random_seed']
RESOLUTION = _defaults['resolution']

# input data
STORAGE_PATH = _paths['storage']
BENCHMARK_PATH = _paths['benchmark_dataset']

# output data
IMAGE_DIR = _paths['images']

# Make sure the image directory exists; an existing directory is not an error.
os.makedirs(IMAGE_DIR, exist_ok=True)

***

# Loading data

In [None]:
# Item and defect tables from the storage directory; the first CSV column of
# each file becomes the frame's index.
# Path(...) is a no-op when the configured value is already a Path and keeps the
# `/` join working when the config hands back a plain string.
items = pd.read_csv(Path(STORAGE_PATH) / 'items.csv', index_col=0)
defects = pd.read_csv(Path(STORAGE_PATH) / 'defects.csv', index_col=0)

# Benchmark dataset: source of the feature columns and the 'left won' target.
df = pd.read_csv(Path(BENCHMARK_PATH) / 'benchmark_dataset.csv')

In [None]:
def _columns_ending_with(suffix):
    """Return the names of the `df` columns whose name ends with `suffix`."""
    return [name for name in df.columns if name.endswith(suffix)]


# Feature columns are identified purely by the suffix of their column name.
left_discrete_features = _columns_ending_with('(Left Discrete)')
right_discrete_features = _columns_ending_with('(Right Discrete)')
left_continuous_features = _columns_ending_with('(Left Continuous)')
right_continuous_features = _columns_ending_with('(Right Continuous)')

# Fail early if any of the four suffixes matched no column at all.
_feature_sets = (
    left_discrete_features,
    right_discrete_features,
    left_continuous_features,
    right_continuous_features,
)
if not all(_feature_sets):
    raise ValueError('Some of the feature sets are empty')

***

# Feature engineering

## Difference features

## Binary flags

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Inputs are the discrete features of both sides; the label is 'left won'.
X = df[left_discrete_features + right_discrete_features]
y = df['left won']


def _row_to_itemset(row):
    """Map one row to a list of tokens, one per column of `X`.

    A column contributes "<column>>" when its value is greater than zero and
    "<column><=" otherwise.
    """
    tokens = []
    for col in X.columns:
        marker = ">" if row[col] > 0 else "<="
        tokens.append(f"{col}{marker}")
    return tokens


# convert into binary itemsets (one token list per row)
itemsets = X.apply(_row_to_itemset, axis=1)

In [None]:
# encode the token lists into a boolean item matrix
te = TransactionEncoder()
te.fit(itemsets)
te_ary = te.transform(itemsets)
encoded = pd.DataFrame(te_ary, columns=te.columns_)

# the outcome becomes one more item so that rules can have it as a consequent
encoded['winner'] = y.values.astype(bool)

# run apriori, then derive rules with confidence of at least 0.6
frequent_itemsets = apriori(encoded, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

# filter rules
# keep only rules predicting the output variable
predicts_winner = rules['consequents'].apply(lambda consequent: 'winner' in consequent)
rules = rules[predicts_winner]

# keep only rules with sufficient confidence and support
is_confident = rules['confidence'].gt(0.7)
is_supported = rules['support'].gt(0.15)
rules = rules[is_confident & is_supported]

# strongest lift first, so de-duplication keeps the best rule per antecedent
rules = (
    rules
    .sort_values(by='lift', ascending=False)
    .drop_duplicates(subset=['antecedents'])
)

# Feature groups and combined dataframe

In [None]:
# ***
# Step 4: Combine all engineered features into final dataframe
# ***

def _as_frame(features):
    """Return `features` as a pandas object that `pd.concat` accepts.

    DataFrames and Series pass through untouched; anything else is treated as
    a collection of column names and resolved against `df`.
    """
    if isinstance(features, (pd.DataFrame, pd.Series)):
        return features
    return df[list(features)]


# `left_discrete_features` / `right_discrete_features` are lists of column
# names, and pd.concat rejects plain lists, so every part is normalised first.
# NOTE(review): discrete_diff_features, left_continuous, right_continuous,
# continuous_diff, derived_df and interaction_df are not defined in the visible
# cells — they must be produced by earlier feature-engineering cells.
engineered_df = pd.concat([
    _as_frame(left_discrete_features),
    _as_frame(right_discrete_features),
    _as_frame(discrete_diff_features),
    _as_frame(left_continuous),
    _as_frame(right_continuous),
    _as_frame(continuous_diff),
    _as_frame(derived_df),
    _as_frame(interaction_df),
], axis=1)

print("Final engineered dataframe shape:", engineered_df.shape)

In [None]:
def _column_names(features):
    """Return the column names described by `features` as a plain list.

    Accepts a DataFrame (its columns), a Series (its name) or an iterable of
    column names. The discrete feature sets are plain lists of names, which
    have no `.columns` attribute.
    """
    if isinstance(features, pd.DataFrame):
        return features.columns.tolist()
    if isinstance(features, pd.Series):
        return [features.name]
    return list(features)


# Ablation groups: display name -> columns to train on for that group.
feature_groups = {
    "Left Discrete": _column_names(left_discrete_features),
    "Right Discrete": _column_names(right_discrete_features),
    "Discrete Diff": _column_names(discrete_diff_features),
    "Left+Right Continuous": _column_names(left_continuous) + _column_names(right_continuous),
    "Continuous Diff": _column_names(continuous_diff),
    "Derived Rules": _column_names(derived_df),
    "Interactions": _column_names(interaction_df),
    "All Features": engineered_df.columns.tolist(),
}

***

# Cross-validation folds

In [None]:
# ***
# Prepare target and folds
# ***

from sklearn.model_selection import KFold

# number of cross-validation folds
NUM_FOLDS = 5

# target
y = df['left won']

# shuffled K-fold splitter, seeded with the configured SEED
kf = KFold(shuffle=True, n_splits=NUM_FOLDS, random_state=SEED)

# ***
# Prepare a place to store results
# ***

# fold-level predictions, filled in by the training loop
fold_predictions = dict()

# metric records, filled in by the training loop
results = list()

***

# Training loop

In [None]:
# ***
# Ablation loop (Option A: retrain per fold per group)
# ***

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Features are sliced from `engineered_df`, not the raw `df`: the columns listed
# in `feature_groups` come from the frames concatenated into `engineered_df`,
# and columns that exist only there would raise a KeyError on `df`.
# assumes engineered_df is row-aligned with df / y — TODO confirm
for fold_idx, (train_idx, test_idx) in enumerate(kf.split(engineered_df)):
    print(f"\n=== Fold {fold_idx+1}/{NUM_FOLDS} ===")

    X_train_full = engineered_df.iloc[train_idx]
    X_test_full = engineered_df.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    for group_name, cols in feature_groups.items():
        print(f"Training ablation group: {group_name}")

        X_train = X_train_full[cols]
        X_test = X_test_full[cols]

        # Example model; could swap with Logistic, Tree, etc.
        # Seeded with the configured SEED (same seed as the fold split)
        # instead of a hard-coded constant.
        model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # column 1 of predict_proba is the probability of the second class
        y_proba = model.predict_proba(X_test)[:,1]

        acc = accuracy_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_proba)

        # one record per (fold, group); aggregated in a later cell
        results.append({
            "fold": fold_idx,
            "group": group_name,
            "accuracy": acc,
            "auc": auc
        })

        # Save predictions for later analysis if needed, keyed by
        # (fold index, group name) and indexed like the test rows
        fold_predictions[(fold_idx, group_name)] = pd.DataFrame({
            "y_true": y_test.values,
            "y_pred": y_pred,
            "y_proba": y_proba
        }, index=y_test.index)


In [None]:
# ***
# Aggregate results
# ***

# one row per (fold, group) record collected by the training loop
results_df = pd.DataFrame(results)

# mean and standard deviation of both metrics for every feature group
group_summary = results_df.groupby("group")[["accuracy","auc"]].agg(["mean","std"])
display(group_summary)


***

# Results

In [None]:
# ***
# Plot results
# ***

import seaborn as sns
import matplotlib.pyplot as plt

# One bar chart per metric: bars are per feature group, error bars show the
# standard deviation.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favour of
# `errorbar='sd'` — switch once the minimum seaborn version is confirmed.
for metric, label in (("accuracy", "Accuracy"), ("auc", "AUC")):
    plt.figure(figsize=(8,5))
    sns.barplot(data=results_df, x='group', y=metric, ci='sd')
    plt.title(f"Ablation Study: {label} by Feature Group")
    plt.ylabel(label)
    plt.xlabel("Feature Group")
    plt.tight_layout()
    plt.show()


In [None]:
# ***
# Optional: Inspect feature importances per group (for interpretability)
# ***

for group_name, cols in feature_groups.items():
    print(f"\n=== Feature importances: {group_name} ===")

    # retrain on full dataset for interpretability
    # Columns are taken from `engineered_df`, not the raw `df`: the groups in
    # `feature_groups` list columns of the frames concatenated into it, and
    # columns that exist only there would raise a KeyError on `df`.
    X_full = engineered_df[cols]
    # same configured seed as the fold split, instead of a hard-coded constant
    model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=SEED)
    model.fit(X_full, y)

    # pair each column with its importance, most important first
    importances = pd.DataFrame({
        "feature": cols,
        "importance": model.feature_importances_
    }).sort_values("importance", ascending=False)

    display(importances.head(10))


In [None]:
# ***
# Optional: Save fold predictions and aggregated results
# ***

# NOTE(review): DATASET_PATH is not defined in any visible cell (only
# BENCHMARK_PATH, STORAGE_PATH and IMAGE_DIR are) — confirm it is provided by
# the star import or replace it with a configured path.
OUTPUT_DIR = DATASET_PATH / "ablation_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# per-(fold, group) metrics in one file
summary_file = OUTPUT_DIR / "ablation_summary.csv"
results_df.to_csv(summary_file, index=False)

# one predictions file per (fold, group); spaces in the group name become
# underscores in the file name
for (fold_idx, group_name), df_pred in fold_predictions.items():
    safe_group = group_name.replace(' ','_')
    df_pred.to_csv(OUTPUT_DIR / f"fold{fold_idx}_{safe_group}_predictions.csv")
