In [1]:
import os
import numpy as np
import sys
import json
# Move two directory levels up from where the kernel started (presumably the
# project root, so that the `src` import and the relative "results" paths
# used by this notebook resolve -- confirm against the repo layout).
# The starting directory is recorded the first time this cell runs, so
# re-running the cell targets the same directory instead of climbing two
# further levels on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, '../..'))
from src.download_models import get_all_model_dict

In [2]:
# File names of the uncertainty arrays that are read for every model.
uncertainty_files = [
    "verbalised_and_sampling_hybrid_uncertainty.npy",
    "sample_avg_dev_uncertainty.npy",
    "verbalised_uncertainty.npy",
]

# Maps model name -> {uncertainty file name -> array loaded from that file}.
model_uqs = {}
models = get_all_model_dict()
for name in models.keys():
    # Every model keeps its arrays under results/<model>/uncertainty_estimates/.
    estimates_dir = os.path.join("results", name, "uncertainty_estimates")
    model_uqs[name] = {
        file_name: np.load(os.path.join(estimates_dir, file_name))
        for file_name in uncertainty_files
    }

In [5]:
# Value this notebook treats as the marker for an invalid score, and the
# cut-off used for the "Lower score than ..." count.
INVALID_SCORE = -1
LOW_SCORE_THRESHOLD = 0.5

# Print summary statistics for every (model, uncertainty type) pair.
for model_name, uqs_dict in model_uqs.items():
    print(f"==============Model: {model_name}===================")
    for uq_type, uqs in uqs_dict.items():
        invalid_mask = uqs == INVALID_SCORE
        valid_uqs = uqs[~invalid_mask]
        print(f"-----------UQ type: {uq_type}-----------")
        if valid_uqs.size:
            print(f"Min: {np.min(valid_uqs)}")
            print(f"Max: {np.max(valid_uqs)}")
            print(f"Mean: {np.mean(valid_uqs)}")
            print(f"Median: {np.median(valid_uqs)}")
            print(f"Std: {np.std(valid_uqs)}")
            print(f"Variance: {np.var(valid_uqs)}")
        else:
            # np.min / np.max raise ValueError on an empty array, so report
            # the situation instead of aborting the whole summary loop.
            print("No valid scores: summary statistics skipped")
        print(f"Full Length with invalid scores: {len(uqs)}")
        print(f"Invalid Scores where {INVALID_SCORE} is value: {np.sum(invalid_mask)}")
        # Count only valid scores: the invalid marker is below the threshold
        # and would otherwise be reported as a low score.
        print(f"Lower score than {LOW_SCORE_THRESHOLD}: {np.sum(valid_uqs < LOW_SCORE_THRESHOLD)}")

-----------UQ type: verbalised_and_sampling_hybrid_uncertainty.npy-----------
Min: 0.02168831168831169
Max: 0.9650000000000001
Mean: 0.43406818181818196
Median: 0.3880519480519481
Std: 0.2357242251088565
Variance: 0.055565910303170846
Full Length with invalid scores: 100
Invalid Scores where -1 is value: 0
Lower score than 0.5: 60
-----------UQ type: sample_avg_dev_uncertainty.npy-----------
Min: 0.0
Max: 1.0
Mean: 0.42090909090909095
Median: 0.35064935064935066
Std: 0.2758882678060769
Variance: 0.07611433631303761
Full Length with invalid scores: 100
Invalid Scores where -1 is value: 0
Lower score than 0.5: 64
-----------UQ type: verbalised_uncertainty.npy-----------
Min: 0.05
Max: 0.95
Mean: 0.5525
Median: 0.7
Std: 0.3717778234376009
Variance: 0.13821874999999997
Full Length with invalid scores: 100
Invalid Scores where -1 is value: 0
Lower score than 0.5: 38
-----------UQ type: verbalised_and_sampling_hybrid_uncertainty.npy-----------
Min: 0.034557046979865766
Max: 0.907
Mean: 0.461