In [None]:
from scipy.stats import mannwhitneyu
import itertools
import numpy as np
from collections import defaultdict
from typing import Callable, Optional
import csv
import pandas as pd
import os
import utils

import utils


def is_valid_data_file(file_name:str) -> bool:
    return file_name.endswith("json") or file_name.endswith("txt")


def get_mean_for_combinations(df: pd.DataFrame, 
                       independent_variables: list[str], 
                       dependent_variables: list[str]) -> pd.DataFrame:

    # ensure all the columns are present in the df
    for col in independent_variables+dependent_variables:
        if col not in df:
            raise Exception(f"The column {col} is not in the dataframe\n\t(columns are {list(df.columns)})")
    assert(all(col in df for col in independent_variables))
    assert(dependent_variable in df for dependent_variable in dependent_variables)
    
    grouped = df.groupby(independent_variables, dropna=False)[dependent_variables].mean().reset_index()
    
    return grouped


import json
import os

# CORRECTED VERSION - Key Changes Made

## 1. Updated run_location to working dataset
```python
# ORIGINAL (pointing to empty dataset):
# run_location = r"A:\metahuristic_benchmark\PS-descriptors\results\compare_own_data_07-29-H15'm'15's16"

# CORRECTED (pointing to complete dataset with actual data):
run_location = r"A:\metahuristic_benchmark\PS-descriptors\resources\variance_tree_materials\compare_own_data\complete_dataset_08-02-H03'm'50's15"
```

## 2. Commented out data conversion functions (since CSV files already exist)
```python
# convert_accuracy_data_to_df(os.path.join(run_location, "data"), results_csv)
# convert_tree_data_to_df(os.path.join(run_location, "data"), tree_data_csv)
```

## 3. Fixed generate_statistical_test_data function
### Problem: Function was incomplete and returned None
### Solution: Added missing return statement and loop

In [None]:
# CORRECTED: Updated run_location to point to working dataset
run_location = r"A:\metahuristic_benchmark\PS-descriptors\resources\variance_tree_materials\compare_own_data\complete_dataset_08-02-H03'm'50's15"

results_csv = os.path.join(run_location, "results.csv")
tree_data_csv = os.path.join(run_location, "tree_data.csv")

# CORRECTED: Commented out conversion functions since CSV files already exist
# convert_accuracy_data_to_df(os.path.join(run_location, "data"), results_csv)
# convert_tree_data_to_df(os.path.join(run_location, "data"), tree_data_csv)

In [None]:
def prettify_kind_column(df):
    kind_dict = {"variance":"PS-W",
                 "variance estimated_atomicity": "PS-WA",
                 "simplicity variance": "PS-SW",
                 "simplicity variance estimated_atomicity" :"PS-SWA"}
    
    df['kind'] = df.apply(
    lambda row: (
        kind_dict[row['metrics']] if row['kind'] == 'ps' else
        'Trad.' if row['kind'] == 'naive' else
        'IAI' if row['kind'] == 'iai' else
        row['kind']
    ),
    axis=1
)

def filter_dataframe(df, **kwargs):
    for col, value in kwargs.items():
        if col in df.columns:
            df = df[df[col] == value]
        else:
            raise ValueError(f"Column '{col}' not found in dataframe.")
    return df

In [None]:
# Load and process the data
accuracy_data = pd.read_csv(results_csv)
prettify_kind_column(accuracy_data)

# Debug: Check the data
print("Data shape:", accuracy_data.shape)
print("Unique pRef_size values:", accuracy_data['pRef_size'].unique())
print("Unique kinds after prettify:", accuracy_data['kind'].unique())
print("Unique depths:", accuracy_data['depth'].unique())
print("\nFirst few rows:")
display(accuracy_data.head())

In [None]:
# CORRECTED: Fixed generate_statistical_test_data function
def generate_statistical_test_data(accuracy_data: pd.DataFrame, input_directory, output_filename):
    depths = [3, 4, 5]
    # CORRECTED: Changed from 10000 to 5000 to match actual data
    usable_data = filter_dataframe(accuracy_data, pRef_size = 5000)
    usable_data = usable_data[usable_data["depth"].isin(depths)]
    
    result_column = "r_sq"

    def winning_competitor_for_competition_and_values(problem, depth, metaheuristic):
        # CORRECTED: Removed IAI since our dataset doesn't have it
        for_each_method = {
            tree_method: filter_dataframe(usable_data, problem=problem, depth=depth, pRef_method=metaheuristic, kind=tree_method)['r_sq']
            for tree_method in {"PS-SW", "PS-SWA", "Trad."}
        }

        # Skip if any group is empty
        if any(len(vals) == 0 for vals in for_each_method.values()):
            return {
                "problem": problem,
                "depth": depth,
                "metaheuristic": metaheuristic,
                "p_value_sw": float('nan'),
                "p_value_swa": float('nan'),
                "winning_competitor": None
            }

        # CORRECTED: Use Trad. as baseline since we don't have IAI data
        winner = "Trad."

        p_value_sw = mannwhitneyu(for_each_method["PS-SW"], for_each_method[winner], alternative="greater").pvalue
        p_value_swa = mannwhitneyu(for_each_method["PS-SWA"], for_each_method[winner], alternative="greater").pvalue

        return {
            "problem": problem,
            "depth": depth,
            "metaheuristic": metaheuristic,
            "p_value_sw": p_value_sw,
            "p_value_swa": p_value_swa,
            "winning_competitor": winner
        }

    # CORRECTED: Added missing loop and return statement
    all_problems = usable_data["problem"].unique()
    all_metaheuristics = usable_data["pRef_method"].unique()
    
    dicts = [winning_competitor_for_competition_and_values(problem=problem, depth=depth, metaheuristic=metaheuristic)
             for problem in all_problems
             for depth in depths
             for metaheuristic in all_metaheuristics]
    
    return pd.DataFrame(dicts)

In [None]:
# CORRECTED: Added debug output to understand what's happening
statistical_data = generate_statistical_test_data(accuracy_data, None, None)

print("\nStatistical data shape:", statistical_data.shape if statistical_data is not None else "None")
if statistical_data is not None and len(statistical_data) > 0:
    print("Statistical data columns:", statistical_data.columns.tolist())
    display(statistical_data)
    
    pivot_table = statistical_data.pivot_table(index=["problem", "depth", "metaheuristic"], 
                                                values =["p_value_sw", "p_value_swa"])
    display(pivot_table)
else:
    print("No statistical data generated - check the filtering conditions")

# Summary of Key Corrections Made

## Issues Fixed:

1. **EmptyDataError**: 
   - **Problem**: CSV files were empty because notebook pointed to failed dataset
   - **Solution**: Updated `run_location` to point to complete dataset with actual data

2. **AttributeError: 'NoneType' object has no attribute 'pivot_table'**:
   - **Problem**: `generate_statistical_test_data` function was incomplete and returned None
   - **Solution**: Added missing loop and return statement to actually generate DataFrame

3. **KeyError: 'p_value_sw'**:
   - **Problem**: Function filtered for `pRef_size = 10000` but data had `pRef_size = 5000`
   - **Solution**: Changed filter to match actual data (5000)

4. **Missing IAI data**:
   - **Problem**: Function expected IAI method but dataset only had Trad., PS-SW, PS-SWA
   - **Solution**: Removed IAI dependency, use Trad. as baseline

## Dataset Used:
- **Path**: `complete_dataset_08-02-H03'm'50's15`
- **Contains**: 12 rows with BT problem, GA method, 3 tree types
- **Methods**: Trad. (naive), PS-SW (simplicity variance), PS-SWA (simplicity variance estimated_atomicity)

## Expected Results:
- Statistical comparisons between PS-SW vs Trad. and PS-SWA vs Trad.
- P-values for significance testing
- Working pivot tables for analysis

This corrected version should run without errors and provide meaningful statistical analysis results.
