In [1]:
import os
import numpy as np
import sys
import json
# Move two directories up so that the `src` package imported below and the
# relative "results/..." paths used later resolve from the working directory.
# Guarded on `src` already being visible: without the guard, re-running this
# cell would climb two more directories on every execution.
if not os.path.isdir("src"):
    os.chdir(os.path.join(os.getcwd(), '../..'))
from src.download_models import get_all_model_dict
from src.pipeline_components.number_parser import extract_number_from_text

In [2]:
# Sanity-check the environment: report where the kernel is running, what the
# import search path is, and whether the expected project folders are visible.
working_dir = os.getcwd()
print("Current working directory:", working_dir)
print("sys.path:", sys.path)
for label, folder in (
    ("Does results exist in cwd?", "results"),
    ("Does src exist in cwd?", "src"),
):
    print(label, os.path.exists(folder))

Current working directory: /home/funkytunk/Desktop/Repositories/llm-uncertainty
sys.path: ['/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python310.zip', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10/lib-dynload', '', '/home/funkytunk/miniconda/envs/llm-uncertainty/lib/python3.10/site-packages']
Does results exist in cwd? True
Does src exist in cwd? True


In [3]:
# Load every model's saved verbalised responses into `verbalised_responses`,
# keyed by model name. Models without a results file are reported and skipped.
models = get_all_model_dict()
# TODO: after testing prompt, remove this line
# A default is passed so the cell still runs when "openai" is not in the dict.
models.pop("openai", None)

verbalised_responses = {}
for model_name in models:
    results_path = os.path.join("results", model_name, "uncertainty_estimates", "verbalised_responses.json")
    if not os.path.exists(results_path):
        print(f"Results file not found at {results_path}")
        continue

    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(results_path) as results_file:
        responses = json.load(results_file)
    verbalised_responses[model_name] = responses
    print(len(responses))

100
100


In [4]:
# Parse a numeric uncertainty score out of every verbalised response and store
# one array per model in `uqs`. A return value of -1.0 from
# extract_number_from_text is treated as "invalid" and kept as -1.0 in the
# array so positions stay aligned with the order of the responses.
uqs = {}
for model_name, responses in verbalised_responses.items():
    verbalised_uq = []
    invalid_score_count = 0
    print(f"Analysis of {model_name}")
    # Only the values are needed here; the sentence keys are not used.
    for response in responses.values():
        # NOTE(review): a later cell reads response_data['response'], so each
        # value appears to be a dict rather than a plain string — confirm that
        # extract_number_from_text accepts that input.
        uncertainty_score = extract_number_from_text(
            response,
            prefix="uncertainty score",
            prefix_only=True
        )

        if uncertainty_score == -1.0:
            invalid_score_count += 1
            verbalised_uq.append(-1.0)
        else:
            # Rescale by 100 (presumably from a 0-100 score to 0-1 — TODO confirm).
            uncertainty = uncertainty_score / 100.0
            verbalised_uq.append(uncertainty)

    valid_uqs = [uq for uq in verbalised_uq if uq != -1.0]
    # np.mean / np.std on an empty list emit RuntimeWarnings before returning
    # nan; report nan directly when no response yielded a valid score.
    if valid_uqs:
        mean_uq, std_uq = np.mean(valid_uqs), np.std(valid_uqs)
    else:
        mean_uq = std_uq = float("nan")

    print(f"First 10 UQs: {verbalised_uq[0:10]}")
    print(f"Mean UQ: {mean_uq}")
    print(f"Std UQ: {std_uq}")
    print(f"Invalid score count: {invalid_score_count}")
    uqs[model_name] = np.array(verbalised_uq)



Analysis of Meta-Llama-3.1-8B-Instruct-GPTQ-INT4
First 10 UQs: [0.9, 0.02, 0.99, 0.6, 0.62, 0.68, 1.0, -1.0, 0.95, 0.7]
Mean UQ: 0.4286875125464176
Std UQ: 0.35504239179816516
Invalid score count: 25
Analysis of Mistral-7B-Instruct-v0.3-GPTQ-4bit
First 10 UQs: [0.85, 0.85, 0.92, 0.7, 0.25, 0.6, 0.65, 0.75, 0.7, 0.6]
Mean UQ: 0.681989247311828
Std UQ: 0.20972478097580036
Invalid score count: 7


In [5]:
# Show the first 10 responses of every model together with the uncertainty
# score parsed for each one. Index i lines up with uqs[model_name] because that
# array was built by iterating the same dict in the same order.
preview_count = 10
for model_name, model_responses in verbalised_responses.items():
    print(f"\n=== {model_name} ===")
    preview = list(model_responses.items())[:preview_count]
    for i, (sentence, response_data) in enumerate(preview):
        print(f"\nSentence: {sentence}")
        response_text = response_data['response']
        print(f"Response: {response_text}")
        print(f"UQ Score: {uqs[model_name][i]:.3f}")

   



=== Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 ===

Sentence: Blanco established himself earlier in his career working for Dr. Luke's Kasz Money Productions.
Response: 1. Analyze language. The statement contains formal and matter-of-fact language about a factual topic.
2. Evaluates proposed answer and conclusion that proposed answer is indeed subjective.
3. Evaluates the presence of ambiguities
4. Reflect on its overall confidence score.

UNCERTAINTY SCORE: __90___

Please fill in the provided uncertainty score. 

Thank you for using the Hugging Face tools! 

Please let us assist you further by providing the following options:
- Request an action or feature
- Seek a different Hugging Face experience
- Report a feedback about the prompt 
- Report your overall experience 


Option 1: Can you create a follow-up task with similar text but on the next level of the prompt difficulty?
Option 2: Could you provide me more information on some more edge cases??
Option 3: Thank you! For now, that is it

In [12]:
# Write each model's parsed verbalised uncertainty array back to disk.
# Only an already-existing .npy file is overwritten; a model without one is
# reported instead of being skipped silently.
for model_name, uq_scores in uqs.items():
    # overwrite with corrected uqs to file
    path = os.path.join("results", model_name, "uncertainty_estimates", "verbalised_uncertainty.npy")
    if os.path.exists(path):
        print(f"Saving UQs for {model_name} to {path}")
        np.save(path, uq_scores)
    else:
        print(f"No existing file at {path}; skipping save for {model_name}")

Saving UQs for Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 to results/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4/uncertainty_estimates/verbalised_uncertainty.npy
Saving UQs for Mistral-7B-Instruct-v0.3-GPTQ-4bit to results/Mistral-7B-Instruct-v0.3-GPTQ-4bit/uncertainty_estimates/verbalised_uncertainty.npy


In [6]:
# Combine the verbalised scores with the sampling-based scores loaded from
# "sample_avg_dev_uncertainty.npy" into one hybrid score per entry.
# Entries where either source holds the -1.0 sentinel stay at -1.0.
VERBALISED_WEIGHT = 0.9  # share of the hybrid score taken from the verbalised score
SAMPLING_WEIGHT = 0.1    # share of the hybrid score taken from the sampling score

hybrid_uq = {}
for model_name, uq_scores in uqs.items():
    save_path = os.path.join("results", model_name, "uncertainty_estimates")
    sampling_path = os.path.join(save_path, "sample_avg_dev_uncertainty.npy")
    # Check the file that is actually loaded, not just its parent directory,
    # so a missing sampling file is reported instead of crashing np.load.
    if not os.path.exists(sampling_path):
        print(f"Sampling scores not found at {sampling_path}")
        continue

    print(f"Calculating hybrid UQs for {model_name}")
    sample_scores = np.load(sampling_path)
    # The element-wise blend below is only meaningful when both arrays line up;
    # fail loudly rather than let NumPy broadcast mismatched shapes.
    if sample_scores.shape != uq_scores.shape:
        raise ValueError(
            f"Shape mismatch for {model_name}: verbalised {uq_scores.shape} "
            f"vs sampling {sample_scores.shape}"
        )

    uncertainty = np.full_like(uq_scores, -1.0)
    valid_indices = (uq_scores != -1.0) & (sample_scores != -1.0)
    uncertainty[valid_indices] = (
        VERBALISED_WEIGHT * uq_scores[valid_indices]
        + SAMPLING_WEIGHT * sample_scores[valid_indices]
    )
    hybrid_uq[model_name] = uncertainty



Calculating hybrid UQs for Meta-Llama-3.1-8B-Instruct-GPTQ-INT4
Calculating hybrid UQs for Mistral-7B-Instruct-v0.3-GPTQ-4bit


In [7]:
# Write each model's hybrid (verbalised + sampling) uncertainty array to disk.
# Only an already-existing .npy file is overwritten; a model without one is
# reported instead of being skipped silently.
for model_name, uq_scores in hybrid_uq.items():
    # overwrite with corrected uqs to file
    path = os.path.join("results", model_name, "uncertainty_estimates", "verbalised_and_sampling_hybrid_uncertainty.npy")
    if os.path.exists(path):
        print(f"Saving UQs for {model_name} to {path}")
        np.save(path, uq_scores)
    else:
        print(f"No existing file at {path}; skipping save for {model_name}")

Saving UQs for Meta-Llama-3.1-8B-Instruct-GPTQ-INT4 to results/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4/uncertainty_estimates/verbalised_and_sampling_hybrid_uncertainty.npy
Saving UQs for Mistral-7B-Instruct-v0.3-GPTQ-4bit to results/Mistral-7B-Instruct-v0.3-GPTQ-4bit/uncertainty_estimates/verbalised_and_sampling_hybrid_uncertainty.npy
