In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import pyrootutils

# Resolve the project root, starting the search from the current working directory and
# using a `.git` entry or a `pyproject.toml` file as the marker of the root folder.
# NOTE(review): `pythonpath=True` presumably puts the root on sys.path (the later
# `import src.utils.plotting` appears to depend on it) and `dotenv=True` presumably loads
# a `.env` file — confirm against the pyrootutils documentation.
root = pyrootutils.setup_root(
    search_from=os.getcwd(),
    indicator=[".git", "pyproject.toml"],
    pythonpath=True,
    dotenv=True,
)

In [None]:
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline
import itertools

import mlflow
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.graphics.factorplots import interaction_plot

import src.utils.plotting

# Inspect Input Covariate Experiment Results
This notebook helps inspect the results of the input-covariate experiments (the `inputs_test` MLflow runs) and collect the material needed for the paper: it gathers the figures and outputs the LaTeX table data.

In [None]:
def search_mlflow(
    search_experiment_name,
    mlflow_tracking_uri=None,
    mlflow_file_path=None,
):
    """Fetch the runs of one or more MLflow experiments as a DataFrame.

    Args:
        search_experiment_name: Experiment name, or list of experiment names, forwarded
            to ``mlflow.search_runs``.
        mlflow_tracking_uri: Tracking URI to query. When omitted, a ``file:///`` URI is
            built from ``mlflow_file_path``.
        mlflow_file_path: Folder of a local MLflow store. Only used when no tracking URI
            is given; defaults to ``<root>/logs/mlflow/mlruns``.

    Returns:
        The DataFrame returned by ``mlflow.search_runs``, with the ``tags.model`` column
        rewritten to display names (e.g. ``RNNModel`` -> ``LSTM``).

    Raises:
        ValueError: If the search returns no runs.

    Note:
        Calls ``mlflow.set_tracking_uri`` with the resolved URI.
    """
    # Keys are the logged model tag with every "Model" substring removed.
    tags_model_to_name = dict(
        XGB="XGBoost",
        TCN="TCN",
        RNN="LSTM",
        Regression="ElasticNet",
        NaiveSeasonal="BaselineSeasonal",
        TCNNoTarget="TCN",
        RNNNoTarget="LSTM",
    )

    def _display_name(tag):
        # Runs logged without a model tag show up as NaN; leave those untouched
        # instead of failing on `.replace`.
        if not isinstance(tag, str):
            return tag
        stripped = tag.replace("Model", "")
        return tags_model_to_name.get(stripped, stripped)

    if isinstance(search_experiment_name, str):
        search_experiment_name = [search_experiment_name]

    if mlflow_tracking_uri is None:
        # The local store location is only needed when no URI was supplied.
        if mlflow_file_path is None:
            mlflow_file_path = os.path.join(root, "logs", "mlflow", "mlruns")
        mlflow_tracking_uri = f"file:///{mlflow_file_path}"

    mlflow.set_tracking_uri(mlflow_tracking_uri)
    df = mlflow.search_runs(experiment_names=search_experiment_name)
    if df.empty:
        raise ValueError(
            f"Did not find experiment with name: {search_experiment_name} on tracking uri: {mlflow_tracking_uri}"
        )

    df["tags.model"] = df["tags.model"].apply(_display_name)

    return df


def check_completeness(df, input_names, n_missing=0):
    """Print whether ``df`` holds every on/off combination of the inputs exactly once.

    Each input is assumed to be binary, so ``2 ** len(input_names)`` combinations exist.

    Args:
        df: DataFrame with one row per run and one column per entry of ``input_names``.
        input_names: List of column names; the row-wise tuple of their values identifies
            a combination.
        n_missing: Number of combinations that may legitimately be absent. For
            nowcasting, the combination with all inputs disabled is not valid and is
            missing from some datasets.

    Returns:
        None. The verdict is printed.
    """
    # Plain tuples compare and hash by value, which also works for an empty frame.
    combinations = list(df[input_names].itertuples(index=False, name=None))

    max_combinations = 2 ** len(input_names)
    # Smallest number of distinct combinations that still counts as complete.
    n_combinations = max_combinations - n_missing

    unique_combinations = len(set(combinations))
    total_combinations = len(combinations)
    # Any row beyond the distinct ones is a duplicate, regardless of n_missing.
    n_duplicates = total_combinations - unique_combinations

    is_complete = n_combinations <= unique_combinations <= max_combinations

    if is_complete and n_duplicates == 0:
        print(
            f"All {unique_combinations} unique combinations are present and there are no duplicates."
        )
    else:
        print(f"Expected number of combinations: {n_combinations}")
        print(f"Number of unique combinations: {unique_combinations}")
        print(f"Total number of combinations: {total_combinations}")
        if unique_combinations < n_combinations:
            print("Some combinations are missing.")
        if unique_combinations > max_combinations:
            print("There are more unique combinations than binary inputs allow.")
        if n_duplicates > 0:
            print("There are duplicates in the dataframe.")

## Performance Metrics

### Mean metric score by input

In [None]:
# Experiment selection: edit these values to inspect another dataset / model / task.
dataset = "veas_pilot"
model = "rnn"
metric_name = "test_mse"
task = "nowcast"
fig_folder = os.path.join(root, "figures", dataset, "input_analysis", task)


def _parse_covariate_names(value):
    """Split a logged covariate list such as "['a', 'b']" into a set of names.

    Anything that is not a string (e.g. a missing value) and the empty list "[]" both
    give an empty set, so no empty input name is ever produced.
    """
    if not isinstance(value, str):
        return frozenset()
    cleaned = value.strip().strip("[]").replace("'", "").replace('"', "")
    return frozenset(name.strip() for name in cleaned.split(",") if name.strip())


metric_column = f"metrics.{metric_name}"
# Not referenced by the cells visible here; kept in case other cells rely on it.
tags_model_to_name = dict(
    XGBModel="XGBoost", TCNModel="TCN", RNNModel="LSTM", RegressionModel="ElasticNet"
)

# MLflow experiment name per model; only the rnn entry carries the "hopt-" prefix.
search_experiment_name = {
    "xgboost": f"{dataset}-inputs_test-{task}-{dataset}_{model}_{task}",
    "tcn": f"{dataset}-inputs_test-{task}-{dataset}_{model}_{task}",
    "elastic_net": f"{dataset}-inputs_test-{task}-{dataset}_{model}_{task}",
    "rnn": f"hopt-{dataset}-inputs_test-{task}-{dataset}_{model}_{task}",
}

df = search_mlflow(search_experiment_name[model])

# "hopt" experiments are read through one params.use_inputs/* column per input; all other
# experiments are read through a single column holding the list of covariates.
input_data_method = (
    "use_inputs" if search_experiment_name[model].startswith("hopt") else "data_variables"
)

if input_data_method == "use_inputs":
    input_columns = [col for col in df.columns if "use_inputs" in col]
    input_names_clean = {col: col.split("/")[-1] for col in input_columns}

    for col in input_columns:
        # The parameter is compared as text; anything other than "True" becomes False.
        df[col] = df[col] == "True"
elif input_data_method == "data_variables":
    # in this case use_input is not a dictionary with True False for each input. Therefore we create these columns
    # on the format that the rest of the notebook expects
    if model == "tcn":
        data_variables_column = "params.datamodule/data_variables/past_covariates"
    else:
        data_variables_column = "params.datamodule/data_variables/future_covariates"

    # Parse every row once into a set of names, so membership is an exact match rather
    # than a substring test on the raw string.
    used_inputs = df[data_variables_column].apply(_parse_covariate_names)
    # Sorted so the column order is the same on every kernel start.
    all_inputs = sorted(set().union(*used_inputs))

    input_columns = [f"params.use_inputs/{input_name}" for input_name in all_inputs]
    input_names_clean = dict(zip(input_columns, all_inputs))

    for column, input_name in input_names_clean.items():
        df[column] = used_inputs.apply(lambda names, input_name=input_name: input_name in names)
else:
    raise ValueError(f"Unknown input_data_method: {input_data_method}")

# Replace the raw input names by their plot labels; a name without a label raises KeyError.
plot_input_names = src.utils.plotting.get_covariate_plot_names()
input_names_clean = {col: plot_input_names[col_v] for col, col_v in input_names_clean.items()}
df = df.rename(columns=input_names_clean)
input_columns = list(input_names_clean.values())

# Keep one run per input combination (the first in the order returned by the search).
# NOTE(review): for "hopt" experiments that first run is not necessarily the best trial — confirm.
df = df.drop_duplicates(subset=input_columns)
check_completeness(df, input_columns, n_missing=1 if task == "nowcast" else 0)

In [None]:
# Figure settings for this cell.
column_span = "double"
height = 8
# When True, each mean is plotted as its difference from the mean over all runs.
mean_relative = False
src.utils.plotting.set_matplotlib_attributes(font_size=8)

plot_data = []
mean_all = df[metric_column].mean()
# Positional lookup: an index label 0 is not guaranteed to exist on a filtered frame.
model_display_name = df["tags.model"].iloc[0]

# Mean of the metric over the runs that used / did not use each input.
for col in input_columns:
    true_mean = df[df[col]][metric_column].mean()
    false_mean = df[~df[col]][metric_column].mean()
    if mean_relative:
        true_mean -= mean_all
        false_mean -= mean_all
    plot_data.append({"input": col, "used": "True", "mean_val_mse": true_mean})
    plot_data.append({"input": col, "used": "False", "mean_val_mse": false_mean})

# Create a DataFrame from the plot data
plot_df = pd.DataFrame(plot_data)

# Plotting
plt.figure()
sns.barplot(
    data=plot_df,
    x="input",
    y="mean_val_mse",
    hue="used",
    # Every input occurs twice in plot_df (used / not used); list each category once.
    order=sorted(plot_df["input"].unique()),
)
plt.xticks(rotation=30)
plt.xlabel("Covariate")

if mean_relative:
    plt.title(f"Change in MSE by Covariate Usage for {model_display_name} {task.capitalize()}")
    # Raw string: "\D" is not a valid escape sequence in a normal string literal.
    plt.ylabel(r"$\Delta$ Mean MSE")
else:
    plt.title(f"MSE by Input Covariate for {model_display_name} {task.capitalize()}")
    plt.ylabel("Mean MSE")

# Only the elastic_net figure carries the legend; the others get an empty one.
if model == "elastic_net":
    plt.legend(title="Covariate Used")
else:
    plt.legend([])

plt.tight_layout()
if mean_relative:
    fig_path = os.path.join(fig_folder, f"relative_mean_mse_by_input_usage_{model}")
else:
    fig_path = os.path.join(fig_folder, f"mean_mse_by_input_usage_{model}")
fig = plt.gcf()
src.utils.plotting.set_figure_size(fig, column_span, height=height)
src.utils.plotting.save_figure(fig, fig_path)
plt.show()

In [None]:
column_span = "double"
top_x = 50  # Number of top models to consider
height = 8

# Select the runs with the lowest metric value.
top_models = df.nsmallest(top_x, metric_column)
# Fewer than top_x runs may exist; percentages must refer to the runs actually selected.
n_top = len(top_models)
if n_top == 0:
    raise ValueError("No runs available to compute the input usage of the top models.")

# Calculate the usage percentage of each input among the top models
input_usage_counts = top_models[input_columns].sum()
input_usage_percentages = (input_usage_counts / n_top) * 100

# Create a DataFrame for plotting
plot_data = [
    {
        "input": input_col.replace("params.use_inputs/", ""),  # Clean input names
        "percentage": percentage,
    }
    for input_col, percentage in input_usage_percentages.items()
]
plot_df = pd.DataFrame(plot_data)

# Plotting using seaborn
plt.figure(figsize=(12, 8))
sns.barplot(
    data=plot_df, x="input", y="percentage", color="skyblue", order=plot_df["input"].sort_values()
)
# Positional lookup: an index label 0 is not guaranteed to exist on a filtered frame.
model_display_name = df["tags.model"].iloc[0]
plt.title(
    f"Percentage of Top {n_top} Models Using Each Covariate for {model_display_name} {task.capitalize()}"
)
plt.xlabel("Covariate")
plt.ylabel("Percentage (%)")
plt.yticks(range(0, 120, 20))
plt.xticks(rotation=30)  # Rotate labels for better readability

fig = plt.gcf()
src.utils.plotting.set_figure_size(fig, column_span=column_span, height=height)
# The file name keeps the requested top_x so the output path does not depend on the data.
src.utils.plotting.save_figure(fig, os.path.join(fig_folder, f"top_{top_x}_models_{model}"))
plt.show()

In [None]:
# Generate all possible combinations of input conditions
input_conditions = list(itertools.product([False, True], repeat=len(input_columns)))

# Mean metric of the runs matching each combination; NaN when no run matches.
combinations = []
for condition in input_conditions:
    mask = (df[input_columns] == condition).all(axis=1)
    mean_metric = df.loc[mask, metric_column].mean()
    combinations.append(list(condition) + [mean_metric])

# Columns for the new DataFrame
columns = input_columns + ["mean_val_mse"]
combination_df = pd.DataFrame(combinations, columns=columns)

# Long format: one row per (combination, input) holding whether the input was used.
heatmap_data = combination_df.melt(id_vars="mean_val_mse", var_name="input", value_name="used")

# Average over the combinations for each input and used / not-used state.
heatmap_data = heatmap_data.pivot_table(index="input", columns="used", values="mean_val_mse")

# Plotting
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(heatmap_data, annot=True, cmap="viridis", ax=ax)
# Label with the configured metric instead of a fixed "Val MSE".
ax.set_title(f"Heatmap of Mean {metric_name} Across Input Combinations")
ax.set_xlabel("Input Used")
ax.set_ylabel("Input Type")
plt.show()

In [None]:
# One interaction plot per pair of inputs, laid out as an upper-triangular grid.
# Relies on df, input_columns and metric_column from the loading cell.
n_inputs = len(input_columns)

if n_inputs < 2:
    print("At least two inputs are needed to draw interaction plots.")
else:
    n_panels = n_inputs - 1
    # squeeze=False keeps axs two-dimensional even when there is a single panel.
    fig, axs = plt.subplots(n_panels, n_panels, figsize=(20, 20), squeeze=False)
    fig.subplots_adjust(hspace=0.6, wspace=0.6)  # Adjust space between plots

    for i in range(n_inputs):
        for j in range(i + 1, n_inputs):
            # Pair (i, j) is drawn at row i, column j - 1, i.e. on or above the diagonal.
            ax = axs[i, j - 1]
            # input_columns already holds the display names used as df column labels.
            x_name = input_columns[i]
            trace_name = input_columns[j]
            interaction_plot(
                df[x_name].astype(int),
                df[trace_name].astype(int),
                df[metric_column],
                ax=ax,
                colors=["red", "blue"],
                markers=["D", "o"],
                ms=10,
            )
            ax.set_title(f"{x_name} x {trace_name}", fontsize=9)
            ax.tick_params(axis="both", which="major", labelsize=8)
            ax.legend(title="Condition", title_fontsize="8", fontsize="7", loc="upper right")
            ax.set_ylabel("")
            ax.set_xlabel(x_name)
            # Explicitly set axis labels and ticks visibility
            ax.xaxis.set_tick_params(labelbottom=True)
            ax.yaxis.set_tick_params(labelleft=True)

    # Only the panels strictly below the diagonal are unused; the diagonal holds the
    # (i, i + 1) pairs and must stay visible.
    for row in range(n_panels):
        for col in range(row):
            axs[row, col].axis("off")

    plt.tight_layout()  # Adjust layout to fit the plot and labels better
    plt.show()

In [None]:
# Project the standardized 0/1 input-usage matrix onto its first two principal components
# and colour every run by its metric value.
X = df[input_columns].astype(int)
y = df[metric_column]
X_scaled = StandardScaler().fit_transform(X)

# PCA Transformation
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot
fig, ax = plt.subplots(figsize=(8, 6))
scatter = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap="viridis")
# Label with the configured metric instead of a fixed "Val MSE".
fig.colorbar(scatter, ax=ax, label=metric_name)
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
ax.set_title(f"PCA of Input Combinations Colored by {metric_name}")
plt.show()

In [None]:
# Ordinary least squares of the metric on the 0/1 input-usage columns.
X = df[input_columns].astype(int)
y = df[metric_column]

# Main effects plus an intercept; no interaction terms are included in this model.
X_interaction = sm.tools.add_constant(X)
regression_model = sm.OLS(y, X_interaction)
results = regression_model.fit()

print(results.summary())

# Drop the intercept by name: when no "const" column was added, a positional slice
# would silently discard the first real coefficient.
results_params = results.params.drop("const", errors="ignore")

# Plotting the coefficients
plt.figure(figsize=(10, 6))
plt.bar(range(len(results_params)), results_params)
plt.xlabel("Coefficients Index")
plt.ylabel("Coefficient Value")
plt.title(f"Impact of Inputs on {metric_name}")
plt.xticks(
    ticks=range(len(results_params)),
    # The coefficient index already holds the display names (the columns of X).
    labels=list(results_params.index),
    rotation=30,
)
src.utils.plotting.save_figure(
    plt.gcf(), os.path.join(fig_folder, f"input_impact_regression_{model}")
)
plt.show()