In [1]:
import os
import numpy as np
import sys
import json
# Switch to the directory two levels above the initial working directory so
# that the `src` package and the relative `results/...` paths used below resolve.
# The target is computed only once per kernel session: a plain relative
# chdir('../..') would climb two more directories every time this cell is re-run.
if "REPO_ROOT" not in globals():
    REPO_ROOT = os.path.abspath(os.path.join(os.getcwd(), '../..'))
os.chdir(REPO_ROOT)
from src.download_models import get_all_model_dict
from src.pipeline_components.number_parser import extract_number_from_text

In [2]:
# Environment sanity check: report where the kernel is running, what the import
# search path looks like, and whether the folders used below are reachable.
print("Current working directory:", os.getcwd())
print("sys.path:", sys.path)
for folder in ("results", "src"):
    print(f"Does {folder} exist in cwd?", os.path.exists(folder))

Current working directory: /home/funkytunk/Desktop/Repositories/llm-uncertainty
sys.path: ['/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python310.zip', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10/lib-dynload', '', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10/site-packages']
Does results exist in cwd? True
Does src exist in cwd? True


In [7]:
# Model registry returned by the project helper; only its keys (model names) are used here.
models = get_all_model_dict()
# Un-comment a line below to leave that model out of the analysis.
# models.pop("openai") 
# models.pop("Meta-Llama-3.1-8B-Instruct-GPTQ-INT4") 
# models.pop("Mistral-7B-Instruct-v0.3-GPTQ-4bit") 

# model name -> parsed content of that model's verbalised_responses.json
verbalised_responses = {}
for model_name in models.keys():
    results_path = os.path.join("results", model_name, "uncertainty_estimates", "verbalised_responses.json")
    if not os.path.exists(results_path):
        # Models without a results file are reported and left out of the dict.
        print(f"Results file not found at {results_path}")
        continue

    # Use a context manager so the file handle is closed right away; the former
    # bare open() call left closing to the garbage collector.
    with open(results_path) as results_file:
        responses = json.load(results_file)
    verbalised_responses[model_name] = responses
    print(len(responses))

100
100
100


In [8]:
# model name -> numpy array with one verbalised uncertainty per response
# (parsed score divided by 100, or -1.0 where no score was obtained).
uqs = {}
for model_name, responses in verbalised_responses.items():
    print(f"Analysis of {model_name}")

    # One parsed score per response, in the iteration order of the responses dict.
    # A return value of -1.0 is treated as "no usable score".
    # NOTE(review): the preview cell reads response_data['response'], whereas the
    # whole entry is handed to the parser here -- confirm the parser accepts that.
    raw_scores = [
        extract_number_from_text(
            response,
            prefix="uncertainty score",
            prefix_only=True,
        )
        for response in responses.values()
    ]

    invalid_score_count = sum(1 for score in raw_scores if score == -1.0)
    # Keep the -1.0 marker in place so positions stay aligned with the responses.
    verbalised_uq = [-1.0 if score == -1.0 else score / 100.0 for score in raw_scores]
    valid_uqs = [uq for uq in verbalised_uq if uq != -1.0]

    print(f"First 10 UQs: {verbalised_uq[0:10]}")
    print(f"Mean UQ: {np.mean(valid_uqs)}")
    print(f"Std UQ: {np.std(valid_uqs)}")
    print(f"Invalid score count: {invalid_score_count}")
    uqs[model_name] = np.array(verbalised_uq)



Analysis of openai
First 10 UQs: [0.05, 0.7, 0.05, 0.9, 0.65, 0.95, 0.8, 0.1, 0.1, 0.9]
Mean UQ: 0.5525
Std UQ: 0.3717778234376009
Invalid score count: 0
Analysis of Meta-Llama-3.1-8B-Instruct-GPTQ-INT4
First 10 UQs: [0.9, 0.02, 0.99, 0.6, 0.62, 0.68, 1.0, -1.0, 0.95, 0.7]
Mean UQ: 0.4286875125464176
Std UQ: 0.35504239179816516
Invalid score count: 25
Analysis of Mistral-7B-Instruct-v0.3-GPTQ-4bit
First 10 UQs: [0.85, 0.85, 0.92, 0.7, 0.25, 0.6, 0.65, 0.75, 0.7, 0.6]
Mean UQ: 0.681989247311828
Std UQ: 0.20972478097580036
Invalid score count: 7


In [5]:
# Print the first PREVIEW_COUNT responses and their uncertainty scores for each model.
# (The old header said "first 5" while the slice printed 10; the count now lives
# in one named constant so the comment and the code cannot drift apart.)
PREVIEW_COUNT = 10
for model_name, model_responses in verbalised_responses.items():
    print(f"\n=== {model_name} ===")
    preview = list(model_responses.items())[:PREVIEW_COUNT]
    # uqs[model_name] is filled in the iteration order of the same responses dict,
    # so position i refers to the i-th sentence.
    for i, (sentence, response_data) in enumerate(preview):
        print(f"\nSentence: {sentence}")
        print(f"Response: {response_data['response']}")
        print(f"UQ Score: {uqs[model_name][i]:.3f}")

   



=== openai ===

Sentence: Blanco established himself earlier in his career working for Dr. Luke's Kasz Money Productions.
Response: 1. The statement presents information about Blanco's career, specifically mentioning his association with Dr. Luke's Kasz Money Productions. It describes a factual situation without any emotional language, personal interpretation, or subjective opinions. The language used is straightforward and reports on a professional affiliation.

2. The proposed answer "objective" makes sense because the sentence merely states a fact regarding Blanco's career and does not reflect any personal viewpoint or evaluation by the author. It presents a specific event that can be verified and does not carry emotional or opinion-based connotations.

3. There are no ambiguities in this statement. The information provided can be independently verified and does not lend itself to multiple interpretations that could indicate subjectivity. The focus remains solely on factual content

In [15]:
# Replace each model's stored verbalised uncertainties with the corrected values.
# Only files that already exist are overwritten; a model without one is skipped
# without any message.
for model_name, uq_scores in uqs.items():
    path = os.path.join("results", model_name, "uncertainty_estimates", "verbalised_uncertainty.npy")
    if not os.path.exists(path):
        continue
    print(f"Saving UQs for {model_name} to {path}")
    np.save(path, uq_scores)

Saving UQs for openai to results/openai/uncertainty_estimates/verbalised_uncertainty.npy


## Updating hybrid now that verbalised was corrected
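This is the same implementation as in the `verbalised_and_sampling.py` script, assuming $\alpha = 0.9$: `hybrid = 0.1 * verbalised + 0.9 * sampling`. Entries where either score is `-1.0` (the invalid marker) stay `-1.0`.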

In [9]:
# Weights of the hybrid score. 0.9 is presumably the "alpha" mentioned in the
# section text (confirm against verbalised_and_sampling.py).
# The verbalised weight is written out instead of computed as 1 - alpha because
# 1 - 0.9 is not exactly 0.1 in floating point and would change the saved values.
SAMPLING_WEIGHT = 0.9
VERBALISED_WEIGHT = 0.1

# model name -> array of hybrid uncertainties; -1.0 where either input is -1.0
hybrid_uq = {}
for model_name, uq_scores in uqs.items():
    save_path = os.path.join("results", model_name, "uncertainty_estimates")
    sampling_path = os.path.join(save_path, "sample_avg_dev_uncertainty.npy")
    # Check the file that is actually loaded, not just its directory, so a model
    # without sampling scores is skipped with a message instead of crashing the loop.
    if not os.path.exists(sampling_path):
        print(f"Skipping {model_name}: no sampling scores found at {sampling_path}")
        continue

    print(f"Calculating hybrid UQs for {model_name}")
    sample_scores = np.load(sampling_path)
    # Both arrays are combined element by element; fail early with a clear message
    # rather than relying on a broadcasting error (or a silent broadcast).
    if sample_scores.shape != uq_scores.shape:
        raise ValueError(
            f"Shape mismatch for {model_name}: verbalised {uq_scores.shape} "
            f"vs sampling {sample_scores.shape}"
        )

    # Start from "invalid" everywhere and fill in only the positions where both
    # the verbalised and the sampling score are valid.
    uncertainty = np.full_like(uq_scores, -1.0)
    valid_indices = (uq_scores != -1.0) & (sample_scores != -1.0)
    uncertainty[valid_indices] = (
        VERBALISED_WEIGHT * uq_scores[valid_indices]
        + SAMPLING_WEIGHT * sample_scores[valid_indices]
    )
    hybrid_uq[model_name] = uncertainty



Calculating hybrid UQs for openai
Calculating hybrid UQs for Meta-Llama-3.1-8B-Instruct-GPTQ-INT4
Calculating hybrid UQs for Mistral-7B-Instruct-v0.3-GPTQ-4bit


In [10]:
# Write the recomputed hybrid scores to disk, one .npy file per model.
# Unlike the verbalised save step, this writes whether or not the file exists.
for model_name, uq_scores in hybrid_uq.items():
    path = os.path.join(
        "results",
        model_name,
        "uncertainty_estimates",
        "verbalised_and_sampling_hybrid_uncertainty.npy",
    )
    print(f"Saving UQs for {model_name} to {path}")
    np.save(path, uq_scores)

Saving UQs for openai to results/openai/uncertainty_estimates/verbalised_and_sampling_hybrid_uncertainty.npy
Saving UQs for Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 to results/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4/uncertainty_estimates/verbalised_and_sampling_hybrid_uncertainty.npy
Saving UQs for Mistral-7B-Instruct-v0.3-GPTQ-4bit to results/Mistral-7B-Instruct-v0.3-GPTQ-4bit/uncertainty_estimates/verbalised_and_sampling_hybrid_uncertainty.npy


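## Concatenating intermediate results from OpenAI
The OpenAI run was interrupted by an error on OpenAI's servers, so its responses were saved in several intermediate files, which are merged here. The uncertainty scores (UQ) for OpenAI also had to be recalculated after the cells above had been used.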

In [20]:
# Partial result files to merge. The order matters: entries are merged in this
# sequence, which fixes the key order of the merged dictionary.
files = [
    "verbalised_responses_intermediate_first_20.json",
    "verbalised_responses_intermediate_21_to_55.json",
    "verbalised_responses_56_to_100.json",
]
# Folder that holds the partial files and receives the merged result.
base_dir = "results/openai/uncertainty_estimates"

def merge_json_files(file_names=None, directory=None, expected_total=100):
    """Merge JSON objects stored in several files into a single dictionary.

    The files are read in the given order and combined with ``dict.update``, so a
    key that occurs in more than one file keeps the position of its first
    occurrence but takes the value from the last file that contains it.

    Args:
        file_names: File names to merge, in order. Defaults to the module-level
            ``files`` list.
        directory: Directory containing the files. Defaults to the module-level
            ``base_dir``.
        expected_total: Number of distinct keys the merged result must have.
            Pass ``None`` to skip the completeness check.

    Returns:
        The merged dictionary, or ``None`` (after printing "Incomplete merge")
        when the number of merged keys differs from ``expected_total``.
    """
    # Fall back to the notebook-level configuration so that the original
    # zero-argument call keeps working unchanged.
    if file_names is None:
        file_names = files
    if directory is None:
        directory = base_dir

    merged_responses = {}

    for file in file_names:
        file_path = os.path.join(directory, file)
        print(f"Reading {file}...")
        with open(file_path, 'r') as f:
            data = json.load(f)
        # Later files win on duplicate keys; duplicates also lower the total
        # key count, which the completeness check below will catch.
        merged_responses.update(data)
        print(f"Added {len(data)} entries from {file}")

    if expected_total is not None and len(merged_responses) != expected_total:
        print("Incomplete merge")
        return None

    return merged_responses

merged_responses = merge_json_files()
# Spot-check three keys of the merged result. With 20 + 35 + 45 entries and no
# duplicate keys, positions 0, 20 and 55 are the first key from each source file.
merged_keys = list(merged_responses.keys())
for position in (0, 20, 55):
    print(merged_keys[position])


Reading verbalised_responses_intermediate_first_20.json...
Added 20 entries from verbalised_responses_intermediate_first_20.json
Reading verbalised_responses_intermediate_21_to_55.json...
Added 35 entries from verbalised_responses_intermediate_21_to_55.json
Reading verbalised_responses_56_to_100.json...
Added 45 entries from verbalised_responses_56_to_100.json
Blanco established himself earlier in his career working for Dr. Luke's Kasz Money Productions.
The journalist Mike Smith was struck yesterday when he noticed
I know you was heartbroken lol."


In [21]:
# Write the merged responses next to the partial files, under the file name
# that the loading cell of this notebook reads for every model.
output_path = os.path.join(base_dir, "verbalised_responses.json")
print(f"\nSaving merged results to {output_path}")
print(f"Total entries: {len(merged_responses)}")

with open(output_path, 'w') as merged_file:
    json.dump(merged_responses, merged_file, indent=2)




Saving merged results to results/openai/uncertainty_estimates/verbalised_responses.json
Total entries: 100
