# Statistical Analysis: NHST

In [31]:
import os
import json
import pandas as pd
import re

import analysis_utils as utils

Load data from all `../res` sub-directories (treatments):

In [32]:
# Directory generated by "eval.py"; contains the .json files with the experiment data
experiment_data_dir = '../res'

# Flag to set the directory structure of the input data
iteration_structure = True

# Load all the experiment data. Both names are bound directly from the loader
# (no empty placeholder dicts first), so a failed load cannot leave empty
# stand-ins behind for later cells to trip over.
#   summary_dfs  -> statistical summary data (<session_name>.json)
#   all_data_dfs -> all individual measurements (all_data_<session_name>.json)
summary_dfs, all_data_dfs = utils.load_experiment_data(experiment_data_dir, iteration_structure)

In [33]:
# Quick sanity check: show the first rows of the individual measurements for one treatment.
# Swap the key (e.g. "MIS_AWQ_ENCO") or index `summary_dfs` instead to inspect other data.
print(all_data_dfs["MIS_AWQ_BTHS"].head(5))

                                   data_path  prevalence    n  tp   tn  fp  \
0  ./out/MIS_AWQ_BTHS/2025-04-28/22_58_14/01    0.166667  120   2  100   0   
1  ./out/MIS_AWQ_BTHS/2025-04-28/22_58_14/02    0.166667  120   2  100   0   
2  ./out/MIS_AWQ_BTHS/2025-04-28/22_58_14/03    0.166667  120   2  100   0   
3  ./out/MIS_AWQ_BTHS/2025-04-28/22_58_14/04    0.166667  120   1  100   0   
4  ./out/MIS_AWQ_BTHS/2025-04-28/22_58_14/05    0.166667  120   1  100   0   

   fn  accuracy  balanced_accuracy        f1  recall  precision  specificity  \
0  18  0.850000           0.923729  0.181818    0.10        1.0     0.847458   
1  18  0.850000           0.923729  0.181818    0.10        1.0     0.847458   
2  18  0.850000           0.923729  0.181818    0.10        1.0     0.847458   
3  19  0.841667           0.920168  0.095238    0.05        1.0     0.840336   
4  19  0.841667           0.920168  0.095238    0.05        1.0     0.840336   

   err  time_to_analyze                           

### Flatten GPU and VRAM columns 
These columns contain nested objects and values.
We are only interested in a subset of the contained data.

In [34]:
def flatten_gpu_vram_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Extract selected metrics from the nested "GPU" and "VRAM" columns.

    Each cell of "GPU" / "VRAM" is expected to be a dict; cells that are not
    dicts (e.g. missing values) produce None for every derived column.

    Derived columns (appended in this order):
        gpu_util_mean, gpu_util_max            <- GPU["utilization"]["avg" / "max"]
        vram_util_mean, vram_util_max          <- VRAM["utilization"]["avg" / "max"]
        vram_max_usage_mib                     <- VRAM["max_consumed_MiB"]

    Args:
        df: Frame containing the nested "GPU" and "VRAM" columns.

    Returns:
        A new DataFrame with the derived columns added and the original
        "GPU" and "VRAM" columns removed. The input frame is left untouched.

    Raises:
        KeyError: If "GPU"/"VRAM" is missing from `df`, or a dict cell lacks
            one of the expected nested keys.
    """
    # TODO: fix the key for extracting "vram_max_usage_mib" from "max_consumed_MiB" -> new format (check repo)

    def _nested_value(cell, *keys):
        # Walk `keys` into a nested dict; anything that is not a dict yields None.
        if not isinstance(cell, dict):
            return None
        for key in keys:
            cell = cell[key]
        return cell

    gpu_cells = df["GPU"]
    vram_cells = df["VRAM"]

    # `assign` works on a copy, so the caller's frame does not silently gain columns.
    flattened = df.assign(
        # GPU columns
        gpu_util_mean=gpu_cells.apply(lambda cell: _nested_value(cell, "utilization", "avg")),
        gpu_util_max=gpu_cells.apply(lambda cell: _nested_value(cell, "utilization", "max")),
        # VRAM columns
        vram_util_mean=vram_cells.apply(lambda cell: _nested_value(cell, "utilization", "avg")),
        vram_util_max=vram_cells.apply(lambda cell: _nested_value(cell, "utilization", "max")),
        vram_max_usage_mib=vram_cells.apply(lambda cell: _nested_value(cell, "max_consumed_MiB")),  # TODO: FIX KEY
    )

    # Drop the now-redundant nested columns
    return flattened.drop(columns=["GPU", "VRAM"])

### Convert data to long format

In [None]:
def convert_dict_dfs_to_long(dictionary_dataframes: dict[str, pd.DataFrame], metrics_of_interest: list[str]) -> pd.DataFrame:
    """Stack per-treatment dataframes into a single long-format dataframe.

    Every input row becomes one output row holding the treatment name, the
    treatment factors parsed from that name, the iteration number and the
    requested metric values.

    Args:
        dictionary_dataframes: Mapping of treatment name to its dataframe.
            Names must have the form "<model>_<quantization>_<dataset>".
            Each dataframe needs a "data_path" column and every requested metric.
        metrics_of_interest: Column names to copy into the long dataframe.

    Returns:
        A dataframe with the columns "treatment", "model", "quantization",
        "dataset", "iteration" followed by the metrics, in the given order.

    Raises:
        ValueError: If a treatment name does not consist of exactly three
            "_"-separated parts.
        KeyError: If "data_path" or a requested metric is missing from a row.
    """
    rows = []  # Temporary list to hold all constructed rows of data

    # Iterate through each treatment and corresponding data
    for treatment_name, df in dictionary_dataframes.items():

        # Extract the treatment factors (dataset is a blocked variable)
        factors = treatment_name.split("_")
        if len(factors) != 3:
            raise ValueError(
                f"Treatment name {treatment_name!r} must have the form "
                "'<model>_<quantization>_<dataset>'"
            )
        model, quant, dataset = factors

        # `position` is the 1-based row position within this treatment. It is used
        # as the fallback iteration instead of the index label, which may be
        # non-numeric or not start at zero.
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            # Extract iteration from last dir in data_path (e.g. "01" from ".../01", etc.)
            match = re.search(r"/(\d{2})$", row["data_path"])
            iteration = int(match.group(1)) if match else position # TODO: should we include the fallback or let it fail?

            # Construct row dictionary
            row_data = {
                "treatment": treatment_name,
                "model": model,
                "quantization": quant,
                "dataset": dataset,
                "iteration": iteration
            }

            # Add relevant metrics
            row_data.update({metric: row[metric] for metric in metrics_of_interest})

            rows.append(row_data)

    # Combine all the rows of data into a single long-format dataframe
    return pd.DataFrame(rows)

### Flatten columns and convert to long format to prepare for NHST analysis

In [48]:
# Metrics of interest, grouped by research question
efficacy_metrics = [
    "accuracy",
    "recall",
    "precision",
    "f1",
    "balanced_accuracy",
    "specificity",
]

efficiency_metrics = [
    "time_to_analyze",
    "gpu_util_mean",
    "gpu_util_max",
    "vram_util_mean",
    "vram_util_max",
    "vram_max_usage_mib",
]

# Flatten and extract the relevant GPU and VRAM metrics into distinct columns
# (they are initially nested objects)
flattened_dfs = {
    treatment: flatten_gpu_vram_columns(frame)
    for treatment, frame in all_data_dfs.items()
}

# Convert data to long format; one long dataframe per research question
long_df_efficacy, long_df_efficiency = (
    convert_dict_dfs_to_long(flattened_dfs, metrics)
    for metrics in (efficacy_metrics, efficiency_metrics)
)

# Custom ordering schemes for sorting the long dataframes.
# Note: we can apply custom schemes to all the sorting columns if needed.
# The sorting itself is currently disabled (see the commented-out lines below).
sorting_order = ["dataset", "iteration", "model", "quantization"]
quant_order = ["NONE", "AWQ", "GPTQ", "AQLM"]
#long_df_efficacy["quantization"] = pd.Categorical(long_df_efficacy["quantization"], categories=quant_order, ordered=True)
#long_df_efficacy = long_df_efficacy.sort_values(by=sorting_order).reset_index(drop=True)

# Preview both long-format dataframes
print(long_df_efficacy.head(), long_df_efficiency.head(), sep="\n")


      treatment model quantization dataset  iteration  accuracy  recall  \
0  MIS_AWQ_BTHS   MIS          AWQ    BTHS          1  0.850000    0.10   
1  MIS_AWQ_BTHS   MIS          AWQ    BTHS          2  0.850000    0.10   
2  MIS_AWQ_BTHS   MIS          AWQ    BTHS          3  0.850000    0.10   
3  MIS_AWQ_BTHS   MIS          AWQ    BTHS          4  0.841667    0.05   
4  MIS_AWQ_BTHS   MIS          AWQ    BTHS          5  0.841667    0.05   

   precision        f1  balanced_accuracy  specificity  
0        1.0  0.181818           0.923729     0.847458  
1        1.0  0.181818           0.923729     0.847458  
2        1.0  0.181818           0.923729     0.847458  
3        1.0  0.095238           0.920168     0.840336  
4        1.0  0.095238           0.920168     0.840336  
      treatment model quantization dataset  iteration  time_to_analyze  \
0  MIS_AWQ_BTHS   MIS          AWQ    BTHS          1         7.775839   
1  MIS_AWQ_BTHS   MIS          AWQ    BTHS          2      

---

## Testing ANOVA assumptions

In [50]:
# Re-inspect the efficacy data that the ANOVA assumption checks operate on
print(long_df_efficacy.head(5))


      treatment model quantization dataset  iteration  accuracy  recall  \
0  MIS_AWQ_BTHS   MIS          AWQ    BTHS          1  0.850000    0.10   
1  MIS_AWQ_BTHS   MIS          AWQ    BTHS          2  0.850000    0.10   
2  MIS_AWQ_BTHS   MIS          AWQ    BTHS          3  0.850000    0.10   
3  MIS_AWQ_BTHS   MIS          AWQ    BTHS          4  0.841667    0.05   
4  MIS_AWQ_BTHS   MIS          AWQ    BTHS          5  0.841667    0.05   

   precision        f1  balanced_accuracy  specificity  
0        1.0  0.181818           0.923729     0.847458  
1        1.0  0.181818           0.923729     0.847458  
2        1.0  0.181818           0.923729     0.847458  
3        1.0  0.095238           0.920168     0.840336  
4        1.0  0.095238           0.920168     0.840336  
