In [1]:
import json
import numpy as np
import scipy.stats as stats
import os
import pandas as pd

In [2]:
def calculate_mean_confidence_interval(data, confidence=0.95):
    """Return the sample mean and the half-width of its Student-t confidence interval.

    Parameters
    ----------
    data : sequence of numbers
        The observations to summarise.
    confidence : float, optional
        Two-sided confidence level of the interval (default 0.95).

    Returns
    -------
    tuple
        ``(mean, h)`` such that the interval is ``mean ± h``. ``h`` is the
        standard error of the mean times the t critical value with ``n - 1``
        degrees of freedom. With a single observation the interval is
        undefined and ``h`` is NaN; for empty input both values are NaN.
    """
    n = len(data)
    # Small samples: return the undefined results explicitly instead of letting
    # numpy/scipy produce NaN as a side effect of a division by zero (which
    # also emits runtime warnings).
    if n == 0:
        return float("nan"), float("nan")
    mean = np.mean(data)
    if n == 1:
        return mean, float("nan")

    stderr = stats.sem(data)
    # (1 + confidence) / 2 is the upper-tail quantile of a two-sided interval.
    h = stderr * stats.t.ppf((1 + confidence) / 2., n - 1)
    return mean, h

In [3]:
# Which model-size variant of the results to aggregate ('small' or 'large').
model_size = 'large'
input_path = f'final_inference_results_ten_percent_{model_size}.json'

# Pin the encoding so the read does not depend on the platform's locale default.
with open(input_path, 'r', encoding='utf-8') as f:
    original_results = json.load(f)

In [4]:
# Collapse the per-fold metrics of every (dataset, model) pair into a mean
# and a 95% confidence half-width.
aggregated_results = {}

for dataset_name, models in original_results.items():
    per_model = {}
    for model_name, folds in models.items():
        # Collect each metric's values across folds; the fold names are not needed.
        values_by_metric = {}
        for fold_metrics in folds.values():
            for metric_name, value in fold_metrics.items():
                values_by_metric.setdefault(metric_name, []).append(value)

        # One {"mean", "95%_CI"} entry per metric, in first-seen metric order.
        summary = {}
        for metric_name, values in values_by_metric.items():
            mean, ci = calculate_mean_confidence_interval(values)
            summary[metric_name] = {"mean": mean, "95%_CI": ci}

        per_model[model_name] = summary
    aggregated_results[dataset_name] = per_model

# Persist the aggregated summary as JSON.
# NOTE(review): there is no separator between 'results' and {model_size} in the
# file name below — confirm this is the intended name before relying on it.
output_path = f'aggregated_inference_results{model_size}.json'
with open(output_path, 'w') as out_file:
    json.dump(aggregated_results, out_file, indent=4)

In [5]:
# Flatten the nested summary into one table row per (dataset, model), with
# every metric rendered as the string "mean ± half-width" to 4 decimals.
rows = [
    {
        "Dataset": dataset_name,
        "Model": model_name,
        **{
            f"{metric_name}": f"{summary['mean']:.4f} ± {summary['95%_CI']:.4f}"
            for metric_name, summary in metrics.items()
        },
    }
    for dataset_name, models in aggregated_results.items()
    for model_name, metrics in models.items()
]

df = pd.DataFrame(rows)

In [6]:
# Rows of the summary table that belong to the 'isaid' dataset.
df_isaid = df[df['Dataset'].eq('isaid')]

print(df_isaid)

  Dataset         Model              IoU             Dice   Pixel Accuracy  \
4   isaid    UNet_e2cnn  0.3205 ± 0.0101  0.3599 ± 0.0108  0.9633 ± 0.0032   
5   isaid  UNet_vanilla  0.3624 ± 0.0200  0.4074 ± 0.0227  0.9689 ± 0.0025   
6   isaid            C8  0.3234 ± 0.0098  0.3628 ± 0.0107  0.9652 ± 0.0029   
7   isaid            D4  0.2634 ± 0.0064  0.2963 ± 0.0077  0.9606 ± 0.0031   

     Mean Accuracy Frequency Weighted IoU           Recall        Precision  
4  0.3493 ± 0.0107        0.9360 ± 0.0052  0.3493 ± 0.0107  0.4162 ± 0.0117  
5  0.3972 ± 0.0204        0.9461 ± 0.0038  0.3972 ± 0.0204  0.4654 ± 0.0280  
6  0.3522 ± 0.0103        0.9391 ± 0.0047  0.3522 ± 0.0103  0.4196 ± 0.0146  
7  0.2871 ± 0.0075        0.9315 ± 0.0050  0.2871 ± 0.0075  0.3455 ± 0.0074  


In [7]:
# Rows of the summary table that belong to the 'coco' dataset.
df_coco = df[df['Dataset'].eq('coco')]

print(df_coco)

  Dataset         Model              IoU             Dice   Pixel Accuracy  \
0    coco    UNet_e2cnn  0.1057 ± 0.0020  0.1388 ± 0.0024  0.6040 ± 0.0033   
1    coco  UNet_vanilla  0.1086 ± 0.0026  0.1447 ± 0.0033  0.6094 ± 0.0043   
2    coco            C8  0.1039 ± 0.0012  0.1365 ± 0.0015  0.6029 ± 0.0032   
3    coco            D4  0.0906 ± 0.0016  0.1207 ± 0.0020  0.5872 ± 0.0038   

     Mean Accuracy Frequency Weighted IoU           Recall        Precision  
0  0.1473 ± 0.0026        0.4722 ± 0.0023  0.1473 ± 0.0026  0.1740 ± 0.0037  
1  0.1518 ± 0.0042        0.4774 ± 0.0046  0.1518 ± 0.0042  0.1838 ± 0.0041  
2  0.1449 ± 0.0013        0.4722 ± 0.0012  0.1449 ± 0.0013  0.1706 ± 0.0024  
3  0.1286 ± 0.0025        0.4558 ± 0.0040  0.1286 ± 0.0025  0.1504 ± 0.0032  
