# Setup and global variables

In [None]:
from pathlib import Path
from copy import deepcopy

import os
import pickle
import json
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from IPython.display import display, HTML
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.pipeline import Pipeline
from scipy.stats import wilcoxon
from sklearn.calibration import calibration_curve

from src.prioritization import *
from src.feature_engineering import select_features, BASE_VALUES

# Switch read by the training cells below: when True the models are re-fitted and the
# results/models are written to disk; when False the cells that have an `else` branch
# load the previously saved results and models instead.
RETRAIN_MODELS = True

In [None]:
# CONFIG_ENV is set before the config module is imported and loaded.
os.environ["CONFIG_ENV"] = "debug"
if False:  # manual switch: change to True to select the "production" environment
    os.environ["CONFIG_ENV"] = "production"

from config import load_config

config = load_config()

DEBUG = config["DEBUG"]

# Defaults
RESOLUTION = config['DEFAULTS']['resolution']
SEED = config['DEFAULTS']['random_seed']

# Input locations
BENCHMARK_PATH = config['PATHS']['benchmark_dataset']
STORAGE_PATH = config['PATHS']['storage']

# Output locations
IMAGE_DIR = config['PATHS']['images'] / 'modelling'
FINAL_MODEL_PATH = BENCHMARK_PATH / "final_teacher_model.pkl"
FINAL_MODEL_FEATURES_PATH = BENCHMARK_PATH / "final_selected_features.pkl"
FINAL_BASELINE_PATH = BENCHMARK_PATH / "baseline_models"
FINAL_BASELINE_FEATURES_PATH = BENCHMARK_PATH / "baseline_features"
BENCHMARK_CACHE_PATH = BENCHMARK_PATH / "benchmark_cache"

# Make sure every output directory exists before anything is written to it
for _output_dir in (
    IMAGE_DIR,
    FINAL_BASELINE_PATH,
    FINAL_BASELINE_FEATURES_PATH,
    BENCHMARK_CACHE_PATH,
):
    os.makedirs(_output_dir, exist_ok=True)

***

# Loading data

In [None]:
# Tables from the storage directory; the first CSV column is used as the index
items, defects = (
    pd.read_csv(STORAGE_PATH / _file_name, index_col=0)
    for _file_name in ('items.csv', 'defects.csv')
)

# Benchmark dataset and the feature catalog stored next to it
df = pd.read_csv(BENCHMARK_PATH / 'benchmark_dataset.csv')
catalog = joblib.load(BENCHMARK_PATH / 'feature_catalog.joblib')

***

# Experiment

In [None]:
# Columns that are excluded from the feature matrix
_non_feature_columns = ['left won', 'submission id', 'weight', 'left', 'right', 'item']

X = df.drop(columns=_non_feature_columns)
y = df['left won']
groups = df['submission id']
sample_weights = df['weight']

# Leave-one-group-out splits keyed by `groups`; materialised as (fold index, (train, test))
# pairs so the same folds can be iterated by several cells
logo = LeaveOneGroupOut()
folds = [*enumerate(logo.split(df, y, groups))]

## Single feature models

In [None]:
# One entry per "<base> - <value type>" combination; the 'Metadata' base is skipped.
heuristic_groups = {}
for _base in BASE_VALUES:
    if _base == 'Metadata':
        continue
    for _value_type in ('Discrete', 'Continuous'):
        heuristic_groups[f"{_base} - {_value_type}"] = select_features(
            catalog, base=_base, kind='Original', dtype=_value_type
        )

In [None]:
# Template pipeline for the single-feature models: standardise, then logistic regression.
# It is deep-copied before every fit, so the template itself is never fitted.
_scaler_step = ("scaler", StandardScaler())
_classifier_step = ("clf", LogisticRegression(solver="lbfgs", max_iter=5000, random_state=SEED))
base_model = Pipeline([_scaler_step, _classifier_step])

In [None]:
# Leave-one-group-out evaluation of the logistic-regression pipeline on every heuristic
# feature group. With RETRAIN_MODELS the per-fold metrics and predictions are recomputed
# and written to the benchmark cache; otherwise they are read back from that cache.
_single_feature_results_file = BENCHMARK_CACHE_PATH / 'single_feature_results.csv'
_single_feature_predictions_file = BENCHMARK_CACHE_PATH / 'single_feature_fold_predictions.pkl'

if RETRAIN_MODELS:
    single_feature_results = []
    single_feature_fold_predictions = {}

    # Every single-feature model uses the same pipeline, so the label is constant
    model_name = "LogisticRegression"

    for fold_idx, (train_idx, test_idx) in tqdm(folds, desc="Iterating over folds", total=groups.nunique()):

        # Select rows of this fold
        X_train_full, X_test_full = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        train_weights, test_weights = sample_weights.iloc[train_idx], sample_weights.iloc[test_idx]

        for group_name, cols in heuristic_groups.items():
            # Select columns of this heuristic group
            X_train = X_train_full[cols]
            X_test = X_test_full[cols]

            # Fit a fresh copy of the template; the weights are routed to the "clf" step
            model = deepcopy(base_model)
            model.fit(X_train, y_train, clf__sample_weight=train_weights)

            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1]

            # Evaluate
            acc = accuracy_score(y_test, y_pred, sample_weight=test_weights)
            try:
                auc_score = roc_auc_score(y_test, y_proba, sample_weight=test_weights)
            except ValueError:
                # y_test might only contain one class
                auc_score = np.nan

            # Store results
            single_feature_results.append({
                "fold": fold_idx,
                "model": model_name,
                "group": group_name,
                "accuracy": acc,
                "auc": auc_score
            })

            single_feature_fold_predictions[(fold_idx, model_name, group_name)] = pd.DataFrame({
                "y_true": y_test.values,
                "y_pred": y_pred,
                "y_proba": y_proba
            }, index=y_test.index)

    single_feature_results = pd.DataFrame(single_feature_results)
    single_feature_results.to_csv(_single_feature_results_file, index=False)
    # The context manager closes the file even if pickling fails
    with open(_single_feature_predictions_file, 'wb') as _cache_file:
        pickle.dump(single_feature_fold_predictions, _cache_file)
else:
    single_feature_results = pd.read_csv(_single_feature_results_file)
    # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
    with open(_single_feature_predictions_file, 'rb') as _cache_file:
        single_feature_fold_predictions = pickle.load(_cache_file)

In [None]:
# Mean and standard deviation of accuracy / AUC over the folds of every (model, group) pair
_single_feature_stats = (
    single_feature_results
    .groupby(["model", "group"])[["accuracy", "auc"]]
    .agg(["mean", "std"])
)
single_feature_summary = _single_feature_stats.reset_index()

# Collapse the two-level column index into single names such as "auc mean"
single_feature_summary.columns = [
    " ".join(_levels).strip() for _levels in single_feature_summary.columns.values
]

# Score = mean AUC minus half of its standard deviation across folds
single_feature_summary['ranking_stability_score'] = (
    single_feature_summary['auc mean'] - 0.5 * single_feature_summary['auc std']
)
single_feature_summary = single_feature_summary.sort_values(
    'ranking_stability_score', ascending=False
)

single_feature_summary.to_html(IMAGE_DIR / "single_feature_summary.html", index=False)
display(single_feature_summary)

## Complex models

In [None]:
# Keyword filters passed to `select_features` for each named feature group
_feature_group_filters = {
    "Original (Left Only)": {"kind": "Original", "side": "Left"},
    "Original (Right Only)": {"kind": "Original", "side": "Right"},
    "Original (Left+Right)": {"kind": "Original"},
    "Diff Only": {"kind": "Difference"},
    "Binary Comparisons": {"kind": "Binary"},
    "Metadata": {"kind": "Metadata"},
    "All Heuristic-Derived": {"kind": None},
}

# Group name -> columns selected from the catalog with that group's filters
feature_groups = {
    _group_name: select_features(catalog, **_filters)
    for _group_name, _filters in _feature_group_filters.items()
}

In [None]:
def _scaled_logistic(**clf_params):
    """Return a pipeline that standardises the features and then fits a logistic regression.

    `clf_params` are forwarded to `LogisticRegression` in addition to the shared
    `random_state=SEED` and `max_iter=5000`.
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(random_state=SEED, max_iter=5000, **clf_params)),
    ])


# Candidate estimators, keyed by the label used in the result tables
models = {
    "Random Forest": RandomForestClassifier(max_depth=3, random_state=SEED),
    "Gradient Boosting": GradientBoostingClassifier(max_depth=3, random_state=SEED),
    "Logistic Regression": _scaled_logistic(solver="lbfgs"),
    "LASSO Logistic": _scaled_logistic(solver="liblinear", penalty="l1"),
}

In [None]:
# Leave-one-group-out ablation study: every candidate model is evaluated on every feature
# group. With RETRAIN_MODELS the per-fold metrics and predictions are recomputed and written
# to the benchmark cache; otherwise they are read back from that cache.
_ablation_results_file = BENCHMARK_CACHE_PATH / 'ablation_study_results.csv'
_ablation_predictions_file = BENCHMARK_CACHE_PATH / 'ablation_study_fold_predictions.pkl'

if RETRAIN_MODELS:
    results = []
    fold_predictions = {}

    for fold_idx, (train_idx, test_idx) in tqdm(folds, desc="Iterating over folds", total=groups.nunique()):

        # Select rows of this fold
        X_train_full, X_test_full = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        train_weights, test_weights = sample_weights.iloc[train_idx], sample_weights.iloc[test_idx]

        # Iterate over models and feature groups
        for model_name, model in models.items():
            for group_name, cols in feature_groups.items():
                # Select columns
                X_train = X_train_full[cols]
                X_test = X_test_full[cols]

                # Always copy the unfitted template; a separate name keeps the loop
                # variable `model` from being replaced by a fitted estimator
                fitted_model = deepcopy(model)

                # Pipelines need the weights routed to their "clf" step
                if isinstance(fitted_model, Pipeline):
                    fitted_model.fit(X_train, y_train, clf__sample_weight=train_weights)
                else:
                    fitted_model.fit(X_train, y_train, sample_weight=train_weights)

                y_pred = fitted_model.predict(X_test)

                # Fallback for models without predict_proba
                if hasattr(fitted_model, "predict_proba"):
                    y_proba = fitted_model.predict_proba(X_test)[:, 1]
                else:
                    y_proba = np.nan

                # Evaluate
                acc = accuracy_score(y_test, y_pred, sample_weight=test_weights)
                try:
                    auc_score = roc_auc_score(y_test, y_proba, sample_weight=test_weights)
                except ValueError:
                    # y_test might only contain one class; record a missing AUC for this
                    # fold (assigning to `auc` here would shadow sklearn.metrics.auc)
                    auc_score = np.nan

                # Store results
                results.append({
                    "fold": fold_idx,
                    "model": model_name,
                    "group": group_name,
                    "accuracy": acc,
                    "auc": auc_score
                })

                fold_predictions[(fold_idx, model_name, group_name)] = pd.DataFrame({
                    "y_true": y_test.values,
                    "y_pred": y_pred,
                    "y_proba": y_proba
                }, index=y_test.index)

    ablation_study_results = pd.DataFrame(results)
    ablation_study_results.to_csv(_ablation_results_file, index=False)
    # The context manager closes the file even if pickling fails
    with open(_ablation_predictions_file, 'wb') as _cache_file:
        pickle.dump(fold_predictions, _cache_file)
else:
    ablation_study_results = pd.read_csv(_ablation_results_file)
    # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
    with open(_ablation_predictions_file, 'rb') as _cache_file:
        fold_predictions = pickle.load(_cache_file)

In [None]:
# Mean and standard deviation of accuracy / AUC over the folds of every (model, group) pair
_ablation_stats = (
    ablation_study_results
    .groupby(["model", "group"])[["accuracy", "auc"]]
    .agg(["mean", "std"])
)
ablation_study_summary = _ablation_stats.reset_index()

# Collapse the two-level column index into single names such as "auc mean"
ablation_study_summary.columns = [
    " ".join(_levels).strip() for _levels in ablation_study_summary.columns.values
]

# Score = mean AUC minus half of its standard deviation across folds
ablation_study_summary['ranking_stability_score'] = (
    ablation_study_summary['auc mean'] - 0.5 * ablation_study_summary['auc std']
)
ablation_study_summary = ablation_study_summary.sort_values(
    'ranking_stability_score', ascending=False
)

ablation_study_summary.to_html(IMAGE_DIR / "ablation_study_summary.html", index=False)
display(ablation_study_summary)


***

# Train the final model with the best performing parameters

In [None]:
# Take the first row of the summary; in debug mode only "LASSO Logistic" rows are considered.
_candidates = ablation_study_summary
if DEBUG:
    _candidates = _candidates[_candidates['model'] == "LASSO Logistic"]
best_hyperparams = _candidates.iloc[0]

best_model_name = best_hyperparams["model"]
best_feature_group = best_hyperparams["group"]
print(f"Selected model: {best_model_name}; feature group: {best_feature_group}")

In [None]:
# Fit the selected model on the full dataset and persist it together with its feature list,
# or load both from disk when retraining is disabled.
if RETRAIN_MODELS:
    best_model = deepcopy(models[best_model_name])
    # Direct indexing raises a clear KeyError if the group name is unknown
    selected_cols = feature_groups[best_feature_group]

    X_full = X[selected_cols]
    y_full = y

    # Use the same sample weights as in cross-validation so the saved model is trained the
    # same way as the one that was evaluated; pipelines need them routed to the "clf" step
    if isinstance(best_model, Pipeline):
        best_model.fit(X_full, y_full, clf__sample_weight=sample_weights)
    else:
        best_model.fit(X_full, y_full, sample_weight=sample_weights)

    joblib.dump(best_model, FINAL_MODEL_PATH)
    joblib.dump(selected_cols, FINAL_MODEL_FEATURES_PATH)
else:
    best_model = joblib.load(FINAL_MODEL_PATH)
    selected_cols = joblib.load(FINAL_MODEL_FEATURES_PATH)

In [None]:
# First summary row of each of the two baseline heuristic groups
_summary_group = single_feature_summary['group']
primary_heuristic = single_feature_summary.loc[_summary_group == 'Naive Severity - Discrete'].iloc[0]
secondary_heuristic = single_feature_summary.loc[_summary_group == 'Task Common - Continuous'].iloc[0]

In [None]:
# Fit the two single-heuristic baseline pipelines on the full dataset and persist the
# models together with the columns they were trained on.
if RETRAIN_MODELS:
    primary_model = deepcopy(base_model)
    secondary_model = deepcopy(base_model)

    primary_cols = heuristic_groups['Naive Severity - Discrete']
    secondary_cols = heuristic_groups['Task Common - Continuous']

    # Same sample weights as in cross-validation (routed to the pipeline's "clf" step);
    # `y` is used directly so this cell does not depend on `y_full` from another cell
    primary_model.fit(X[primary_cols], y, clf__sample_weight=sample_weights)
    secondary_model.fit(X[secondary_cols], y, clf__sample_weight=sample_weights)

    joblib.dump(primary_model, FINAL_BASELINE_PATH / 'primary_model.joblib')
    joblib.dump(secondary_model, FINAL_BASELINE_PATH / 'secondary_model.joblib')
    joblib.dump(primary_cols, FINAL_BASELINE_FEATURES_PATH / 'primary_features.joblib')
    joblib.dump(secondary_cols, FINAL_BASELINE_FEATURES_PATH / 'secondary_features.joblib')

***

# Results

## Single feature models

In [None]:
# Bar chart of the stability score of every single-feature model
_fig, _ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=single_feature_summary,
    x="group",
    y="ranking_stability_score",
    hue="group",
    ax=_ax,
)
plt.xticks(rotation=45, ha="right")
_ax.set_title("Single feature models by stability score")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "single_feature_ranking_stability_score.png", dpi=RESOLUTION)
plt.close(_fig)

In [None]:
# Per-fold results ordered by the mean accuracy of their group (highest first)
sorted_df = single_feature_results.assign(
    mean_accuracy=single_feature_results.groupby('group')['accuracy'].transform('mean')
).sort_values('mean_accuracy', ascending=False)

_fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=sorted_df,
    x="group",
    y="accuracy",
    errorbar=("ci", 95),
    hue="group",
    ax=ax,
)

# Dashed reference line at an accuracy of 0.5
ax.axhline(0.5, linestyle="--", color="red", linewidth=1.2, alpha=0.8)

plt.xticks(rotation=90)
ax.set_title("Single-Feature Heuristics vs Full Model (Accuracy)")
ax.set_xlabel("Heuristic (Single Feature)")
ax.set_ylabel("Accuracy")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "single_feature_accuracy.png", dpi=RESOLUTION)
plt.close(_fig)


### Discrete vs continuous

In [None]:
_DISCRETE_SUFFIX = ' - Discrete'
_CONTINUOUS_SUFFIX = ' - Continuous'


def base_name(group_name):
    """Return the group name with the value-type suffix removed."""
    return group_name.replace(_DISCRETE_SUFFIX, '').replace(_CONTINUOUS_SUFFIX, '')


def _rows_with_suffix(suffix):
    """Return the summary rows whose group ends with `suffix`, plus a `feature` column."""
    has_suffix = single_feature_summary['group'].map(lambda name: name.endswith(suffix))
    rows = single_feature_summary[has_suffix]
    return rows.assign(feature=rows['group'].map(base_name))


discrete_rows = _rows_with_suffix(_DISCRETE_SUFFIX)
continuous_rows = _rows_with_suffix(_CONTINUOUS_SUFFIX)

# Put the discrete and continuous variant of each base feature side by side
_compared_columns = ['feature', 'auc mean', 'accuracy mean']
merged = discrete_rows[_compared_columns].merge(
    continuous_rows[_compared_columns],
    on='feature',
    suffixes=('_discrete', '_continuous'),
)

# Differences are discrete minus continuous
for _metric in ('accuracy', 'auc'):
    merged[f'delta_{_metric}'] = merged[f'{_metric} mean_discrete'] - merged[f'{_metric} mean_continuous']

merged = merged.sort_values('delta_accuracy', ascending=False)
merged.to_html(IMAGE_DIR / "continuous_discrete_delta.html", index=False)
display(merged)


In [None]:
# Mean of each delta column over all base features
_delta_means = [merged[_column].mean() for _column in ('delta_accuracy', 'delta_auc')]
print("Average delta in accuracy and AUC (Discrete - Continuous):", *_delta_means)

### Wilcoxon vs best model

In [None]:
# Per-fold results of all single-feature models plus those of the selected best model
_is_best_model = (
    (ablation_study_results["model"] == best_model_name)
    & (ablation_study_results["group"] == best_feature_group)
)
combined_df = pd.concat([single_feature_results, ablation_study_results[_is_best_model]], axis=0)

# Label of the form "<group> (<model>)"
combined_df['model+group'] = [
    f"{_group} ({_model})"
    for _group, _model in zip(combined_df['group'], combined_df['model'])
]

In [None]:
# Wilcoxon signed-rank test of the per-fold accuracy of every single-feature model against
# the per-fold accuracy of the selected best model.
pairwise_tests = []

# Sorted by fold, like the heuristic results below, so that the paired test compares the
# two models on the same fold
best_model_df = ablation_study_results[
    (ablation_study_results["model"] == best_model_name)
    & (ablation_study_results["group"] == best_feature_group)
].sort_values("fold")
_best_model_accuracy = best_model_df["accuracy"].to_numpy()

for heuristic_group in single_feature_results["group"].unique():
    heuristic_df = single_feature_results[
        (single_feature_results['group'] == heuristic_group)
    ].sort_values("fold")

    # Plain arrays make the pairing purely positional, independent of the index labels
    stat, p = wilcoxon(heuristic_df["accuracy"].to_numpy(), _best_model_accuracy)
    pairwise_tests.append({
        "model_1": heuristic_group,
        "model_2": f"{best_model_name} ({best_feature_group})",
        "wilcoxon_stat": stat,
        "p_value": p
    })

pairwise_tests = pd.DataFrame(pairwise_tests).sort_values("p_value")
pairwise_tests.to_html(IMAGE_DIR / "single_feature_vs_best_wilcoxon.html", index=False)
display(pairwise_tests)

## Top model-feature pairs

In [None]:
# First ten rows of the ablation summary
top10 = ablation_study_summary.head(10)

_fig, _ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=top10,
    x="group",
    y="ranking_stability_score",
    hue="model",
    ax=_ax,
)
plt.xticks(rotation=45, ha="right")
_ax.set_title("Top model+feature combinations by stability score")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "top_models.png", dpi=RESOLUTION)
plt.close(_fig)

In [None]:
# Summary row(s) of the single-feature group used as the severity baseline
group_to_compare = "Naive Severity - Discrete"
_is_severity_group = single_feature_summary["group"] == group_to_compare
severity_summary_row = single_feature_summary.loc[_is_severity_group]

In [None]:
# First ten ablation rows followed by the severity baseline row
_frames_to_plot = [ablation_study_summary.head(10), severity_summary_row]
with_severity = pd.concat(_frames_to_plot, axis=0)

_fig, _ax = plt.subplots(figsize=(10, 5))
sns.barplot(
    data=with_severity,
    x="group",
    y="ranking_stability_score",
    hue="model",
    ax=_ax,
)
plt.xticks(rotation=45, ha="right")
_ax.set_title("Top model+feature combinations and the severity baseline by stability score")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "top_models_with_severity.png", dpi=RESOLUTION)
plt.close(_fig)

In [None]:
# Wilcoxon test of the per-fold accuracy of each top-10 combination against the
# per-fold accuracy of the severity baseline.
_severity_row = severity_summary_row.iloc[0]
severity_model, severity_group = _severity_row["model"], _severity_row["group"]

_is_severity = (
    (single_feature_results["group"] == severity_group)
    & (single_feature_results["model"] == severity_model)
)
severity_df = single_feature_results[_is_severity].sort_values("fold")

pairwise_tests = []
for model_name, model_group in zip(top10["model"], top10["group"]):
    _is_candidate = (
        (ablation_study_results["model"] == model_name)
        & (ablation_study_results["group"] == model_group)
    )
    model_df = ablation_study_results[_is_candidate].sort_values("fold")

    stat, p = wilcoxon(model_df["accuracy"], severity_df["accuracy"])
    pairwise_tests.append({
        "model_1": f"{model_name} ({model_group})",
        "model_2": f"{severity_model} ({severity_group})",
        "wilcoxon_stat": stat,
        "p_value": p,
    })

pairwise_tests = pd.DataFrame(pairwise_tests)
pairwise_tests.to_html(IMAGE_DIR / "top_models_vs_severity_wilcoxon.html", index=False)
display(pairwise_tests)

## Model comparison

### Wilcoxon

In [None]:
# Wilcoxon test of per-fold accuracy for every unordered pair of models, restricted to one
# feature group.
feature_group_to_compare = "All Heuristic-Derived"

models_list = ablation_study_results["model"].unique()
_group_results = ablation_study_results[ablation_study_results["group"] == feature_group_to_compare]

pairwise_tests = []
for i, m1 in enumerate(models_list):
    df1 = _group_results[_group_results["model"] == m1].sort_values("fold")

    for m2 in models_list[i+1:]:
        df2 = _group_results[_group_results["model"] == m2].sort_values("fold")

        # Pairs with no results or a different number of folds are skipped
        if len(df1) != len(df2) or len(df1) == 0:
            continue

        stat, p = wilcoxon(df1["accuracy"], df2["accuracy"])
        pairwise_tests.append({
            "model_1": m1,
            "model_2": m2,
            "wilcoxon_stat": stat,
            "p_value": p,
        })

pairwise_tests = pd.DataFrame(pairwise_tests)
pairwise_tests.to_html(IMAGE_DIR / "ml_model_wilcoxon.html", index=False)
display(pairwise_tests)

## Diagnostic plots

In [None]:
# One ROC curve per model, built from the predictions of all folds concatenated
# (feature group "All Heuristic-Derived").
plt.figure(figsize=(6, 6))

for model_name in ablation_study_results["model"].unique():
    _model_frames = [
        pred_df
        for (_, mname, group), pred_df in fold_predictions.items()
        if mname == model_name and group == "All Heuristic-Derived"
    ]
    y_true_all = [_value for _frame in _model_frames for _value in _frame["y_true"]]
    y_proba_all = [_value for _frame in _model_frames for _value in _frame["y_proba"]]

    fpr, tpr, _ = roc_curve(y_true_all, y_proba_all)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, lw=2, label=f"{model_name} (AUC={roc_auc:.3f})")

# Diagonal reference line
plt.plot([0, 1], [0, 1], "k--")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve (Aggregated Across Folds)")
plt.legend()
plt.grid(True)
plt.savefig(IMAGE_DIR / "roc_curve.png", dpi=RESOLUTION)
plt.close()

In [None]:
# One calibration curve per model, built from the predictions of all folds concatenated
# (feature group "All Heuristic-Derived").
plt.figure(figsize=(6, 6))

for model_name in ablation_study_results["model"].unique():
    _model_frames = [
        pred_df
        for (_, mname, group), pred_df in fold_predictions.items()
        if mname == model_name and group == "All Heuristic-Derived"
    ]
    y_true_all = [_value for _frame in _model_frames for _value in _frame["y_true"]]
    y_proba_all = [_value for _frame in _model_frames for _value in _frame["y_proba"]]

    # Ten quantile bins
    prob_true, prob_pred = calibration_curve(
        y_true_all, y_proba_all, n_bins=10, strategy="quantile"
    )

    plt.plot(prob_pred, prob_true, marker="o", label=model_name)

# Diagonal reference line
plt.plot([0, 1], [0, 1], "k--", label="Perfect")
plt.xlabel("Predicted Probability")
plt.ylabel("Empirical Win Rate")
plt.title("Calibration Plot")
plt.legend()
plt.grid(True)
plt.savefig(IMAGE_DIR / "calibration_curve.png", dpi=RESOLUTION)
plt.close()

In [None]:
# Kernel density estimate of the predicted probabilities of each model over all folds
# (feature group "All Heuristic-Derived").
plt.figure(figsize=(8, 5))

for model_name in ablation_study_results["model"].unique():
    y_proba_all = [
        _value
        for (_, mname, group), pred_df in fold_predictions.items()
        if mname == model_name and group == "All Heuristic-Derived"
        for _value in pred_df["y_proba"]
    ]

    sns.kdeplot(y_proba_all, label=model_name, fill=True, alpha=0.3)

plt.title("Distribution of Predicted Probabilities")
plt.xlabel("p(left wins)")
plt.legend()
plt.savefig(IMAGE_DIR / "predicted_probabilities_hist.png", dpi=RESOLUTION)
plt.close()


## Feature group comparison

In [None]:
# Side-by-side bar charts of accuracy and AUC per feature group; the error bars show the
# standard deviation.
fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharex=False)

_panels = (("accuracy", "Accuracy"), ("auc", "AUC"))
for _axis, (_column, _label) in zip(axes, _panels):
    sns.barplot(
        data=ablation_study_results,
        x="group",
        y=_column,
        errorbar="sd",
        ax=_axis,
    )
    _axis.set_title(f"Ablation Study: {_label} by Feature Group")
    _axis.set_ylabel(_label)
    _axis.set_xlabel("Feature Group")
    _axis.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(IMAGE_DIR / "feature_group_stats.png", dpi=RESOLUTION)
plt.close()


## Tree feature importance

In [None]:
# First summary row that belongs to a tree-based model. The names must match the keys of
# `models` ("Gradient Boosting", not the class name), otherwise that model is never selected.
_tree_model_names = ["Random Forest", "Gradient Boosting"]
best_tree_params = ablation_study_summary[ablation_study_summary["model"].isin(_tree_model_names)].iloc[0]

In [None]:
# Fit a fresh copy of the selected tree model on every feature group (full dataset) and
# collect its feature importances in one long table.
_tree_template = models[best_tree_params["model"]]

all_importances = []
for group_name, cols in feature_groups.items():
    X_full = X[cols]
    model = deepcopy(_tree_template)
    model.fit(X_full, y)

    _group_importances = pd.DataFrame({
        "group": group_name,
        "feature": cols,
        "importance": model.feature_importances_,
    })
    all_importances.append(_group_importances)

importance_df = pd.concat(all_importances, ignore_index=True)

In [None]:
# Feature x group matrix of importances; missing combinations are filled with 0
heatmap_df = importance_df.pivot_table(
    index="feature",
    columns="group",
    values="importance",
    fill_value=0,
)

# Figure height grows with the number of features
_n_features = heatmap_df.shape[0]
_fig, _ax = plt.subplots(figsize=(12, _n_features/5))
sns.heatmap(heatmap_df, cmap="viridis", linewidths=0.5, ax=_ax)
_ax.set_title("Feature Importance Heatmap Across Groups")
_ax.set_xlabel("Feature Group")
_ax.set_ylabel("Feature")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "feature_importance.png", dpi=RESOLUTION)
plt.close(_fig)

In [None]:
# Twenty features with the highest importance averaged over the feature groups
_mean_importance = importance_df.groupby("feature")["importance"].mean()
global_ranking = _mean_importance.sort_values(ascending=False).head(20)

_fig, _ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x=global_ranking.values,
    y=global_ranking.index,
    hue=global_ranking.index,
    palette="viridis",
    ax=_ax,
)
_ax.set_title("Top 20 Features Overall")
_ax.set_xlabel("Mean Importance")
_ax.set_ylabel("Feature")
_fig.tight_layout()
_fig.savefig(IMAGE_DIR / "top_20_features.png", dpi=RESOLUTION)
plt.close(_fig)

## Permutation importance

TODO