In [13]:
import json
import numpy as np
import scipy.stats as stats
import os
import pandas as pd

In [14]:
def calculate_mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its Student-t confidence interval.

    Parameters
    ----------
    data : array-like of numbers
        Observations to summarise; treated as a single flat sample.
    confidence : float, default 0.95
        Two-sided confidence level; must lie strictly between 0 and 1.

    Returns
    -------
    tuple of (float, float)
        ``(mean, h)`` such that the interval is ``mean ± h``. ``h`` is NaN
        when fewer than two observations are given (the standard error is
        undefined), and both values are NaN for empty input.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.
    """
    if not 0.0 < confidence < 1.0:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence!r}"
        )

    values = np.asarray(data, dtype=float).ravel()
    n = values.size
    nan = float("nan")

    # Handle degenerate samples explicitly instead of letting numpy/scipy
    # produce NaN as a by-product of warning-emitting computations.
    if n == 0:
        return nan, nan
    mean = float(np.mean(values))
    if n < 2:
        return mean, nan

    stderr = stats.sem(values)
    # Two-sided interval: use the (1 + confidence) / 2 quantile of the
    # t distribution with n - 1 degrees of freedom.
    h = stderr * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean, float(h)

In [15]:
# Model variant whose per-fold inference results are loaded; it selects the
# input file name below.
model_size = 'large'
input_path = f'final_inference_results_{model_size}.json'
# Decode explicitly as UTF-8 rather than relying on the platform's
# locale-dependent default text encoding.
with open(input_path, 'r', encoding='utf-8') as f:
    original_results = json.load(f)

In [16]:
def _summarise_model_folds(folds):
    """Collapse one model's per-fold metrics into a mean and CI half-width per metric.

    `folds` maps a fold name to a dict of metric name -> value. The result
    maps each metric name to {"mean": ..., "95%_CI": ...}, where the second
    entry is the half-width returned by calculate_mean_confidence_interval
    at its default confidence level.
    """
    # Gather every fold's value for a metric into one list, keyed by metric.
    per_metric = {}
    for fold_metrics in folds.values():
        for name, score in fold_metrics.items():
            per_metric.setdefault(name, []).append(score)

    summary = {}
    for name, scores in per_metric.items():
        centre, half_width = calculate_mean_confidence_interval(scores)
        summary[name] = {"mean": centre, "95%_CI": half_width}
    return summary


# Same dataset -> model nesting as the loaded results, with the fold level
# replaced by per-metric summaries.
aggregated_results = {
    dataset_name: {
        model_name: _summarise_model_folds(folds)
        for model_name, folds in models.items()
    }
    for dataset_name, models in original_results.items()
}

# Persist the summaries as JSON (adjust the path if needed).
output_path = 'aggregated_inference_results.json'
with open(output_path, 'w') as f:
    json.dump(aggregated_results, f, indent=4)

In [17]:
def _format_mean_ci(summary):
    """Render one metric summary as 'mean ± half-width' with four decimals."""
    return f"{summary['mean']:.4f} ± {summary['95%_CI']:.4f}"


# One record per (dataset, model) pair: two identifying columns followed by
# one formatted string column per metric.
rows = [
    {
        "Dataset": dataset_name,
        "Model": model_name,
        **{
            str(metric_name): _format_mean_ci(summary)
            for metric_name, summary in metrics.items()
        },
    }
    for dataset_name, models in aggregated_results.items()
    for model_name, metrics in models.items()
]

df = pd.DataFrame(rows)

In [18]:
# Keep only the rows whose Dataset column equals 'isaid' and print them.
is_isaid = df['Dataset'] == 'isaid'
df_isaid = df[is_isaid]

print(df_isaid)

  Dataset         Model              IoU             Dice   Pixel Accuracy  \
0   isaid  UNet_vanilla  0.4821 ± 0.0175  0.5373 ± 0.0186  0.9763 ± 0.0027   
1   isaid    UNet_e2cnn  0.4307 ± 0.0160  0.4821 ± 0.0180  0.9747 ± 0.0024   
2   isaid            C8  0.4457 ± 0.0144  0.4980 ± 0.0155  0.9756 ± 0.0026   
3   isaid            D4              NaN              NaN              NaN   

     Mean Accuracy Frequency Weighted IoU           Recall        Precision  
0  0.5268 ± 0.0165        0.9578 ± 0.0043  0.5268 ± 0.0165  0.5924 ± 0.0200  
1  0.4714 ± 0.0174        0.9550 ± 0.0041  0.4714 ± 0.0174  0.5355 ± 0.0201  
2  0.4876 ± 0.0163        0.9567 ± 0.0043  0.4876 ± 0.0163  0.5516 ± 0.0147  
3              NaN                    NaN              NaN              NaN  


In [19]:
# Keep only the rows whose Dataset column equals 'coco' and print them.
is_coco = df['Dataset'] == 'coco'
df_coco = df[is_coco]

print(df_coco)

  Dataset         Model              IoU             Dice   Pixel Accuracy  \
4    coco  UNet_vanilla  0.1789 ± 0.0013  0.2276 ± 0.0017  0.6888 ± 0.0019   
5    coco    UNet_e2cnn  0.1822 ± 0.0022  0.2277 ± 0.0027  0.6781 ± 0.0021   
6    coco            C8  0.1829 ± 0.0042  0.2276 ± 0.0052  0.6780 ± 0.0024   
7    coco            D4              NaN              NaN              NaN   

     Mean Accuracy Frequency Weighted IoU           Recall        Precision  
4  0.2380 ± 0.0025        0.5629 ± 0.0056  0.2380 ± 0.0025  0.2834 ± 0.0048  
5  0.2413 ± 0.0023        0.5484 ± 0.0027  0.2413 ± 0.0023  0.2810 ± 0.0041  
6  0.2416 ± 0.0060        0.5479 ± 0.0069  0.2416 ± 0.0060  0.2808 ± 0.0037  
7              NaN                    NaN              NaN              NaN  
