### Setup libraries and instalaltion

In [1]:

# 1. Upgrade pip to prevent installer bugs.
!pip install -q --upgrade pip

# 2. Install the required libraries. This gets the latest available version
#    of prometheus-eval and adds the VLLM engine for local inference.
!pip install -q "torch>=2.3"  "tiktoken>=0.7.0" prometheus-eval "vllm>=0.4.0" google-generativeai "huggingface_hub[hf_xet]"

print(" Dependencies installed successfully.")
print(" PLEASE RESTART THE RUNTIME NOW before running any other cells.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m110.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.6/394.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# 1. Verify GPU is available
#!nvidia-smi

# 2. Import all necessary libraries
import os
import json
from google.colab import userdata, files
import google.generativeai as genai
import torch

# Prometheus-eval imports for the modern API
# This will only work AFTER you have run the install cell AND restarted the runtime.
from prometheus_eval import PrometheusEval
from prometheus_eval.vllm import VLLM
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE

print(f"GPU is available: {torch.cuda.is_available()}")

INFO 06-22 09:43:24 [__init__.py:244] Automatically detected platform cuda.
GPU is available: True


In [2]:
PROMETHEUS_MODEL_NAME = "Unbabel/M-Prometheus-3B"

# --- File Paths ---
#INPUT_PATH = "cleaning_results.json"
#INPUT_PATH = "extracted_data.json"
INPUT_PATH = "extracted_data_ita.json"

OUTPUT_PATH_P_GEMINI = "judging_result_P_gemini.json"
OUTPUT_PATH_P_LLAMA = "judging_result_P_llama.json"
OUTPUT_PATH_P_MISTRAL = "judging_result_P_mistral.json"

print(f"\nConfiguration:")
print(f"  - Judge Model: PROMETHEUS ({PROMETHEUS_MODEL_NAME})")
print(f"  - Input File: '{INPUT_PATH}'")
print(f"  - Output Files:")
print(f"    - '{OUTPUT_PATH_P_GEMINI}'")
print(f"    - '{OUTPUT_PATH_P_LLAMA}'")
print(f"    - '{OUTPUT_PATH_P_MISTRAL}'")


Configuration:
  - Judge Model: PROMETHEUS (Unbabel/M-Prometheus-3B)
  - Input File: 'extracted_data_ita.json'
  - Output Files:
    - 'judging_result_P_gemini.json'
    - 'judging_result_P_llama.json'
    - 'judging_result_P_mistral.json'


# English dataset

In [None]:
OCR_RUBRIC_DATA = {
  "criteria": "How accurately and completely does the cleaned text represent the ground truth, minimizing OCR errors and maintaining readability?",
  "score1_description": "Poor: The cleaned text is mostly unrelated to the ground truth, unreadable, nonsensical, or omits vast portions of the original content. Contains severe and numerous errors.",
  "score2_description": "Fair: The cleaned text has multiple significant errors (e.g., many misrecognized words, incorrect formatting, missing phrases) that make it difficult to understand or misleading.",
  "score3_description": "Good: The cleaned text is largely correct and understandable but contains some noticeable OCR errors (e.g., a few misrecognized words, minor formatting issues, small omissions/additions) that don't obscure the overall meaning.",
  "score4_description": "Excellent: The cleaned text is highly accurate with only very minor errors (e.g., one or two typos, a single punctuation mistake, slight spacing issues) that do not affect meaning or readability significantly.",
  "score5_description": "Perfect: The cleaned text is an exact or near-exact match to the ground truth. It is perfectly readable and free of OCR errors."
}
SCORE_RUBRIC_FOR_OCR = SCORE_RUBRIC_TEMPLATE.format(**OCR_RUBRIC_DATA)

CUSTOM_ABSOLUTE_PROMPT = """###Task Description:
You are a text judge, your task is to act as an impartial judge and evaluate the quality of the provided cleaned text againist the ground truth.

###Evaluation Steps:
1. Read the instruction, reference answer, and the model's response carefully.
2. Compare the model's response to the reference answer and evaluate its quality based on the given rubric.
3. Provide a clear and concise feedback for your score, explaining how the model's response aligns with the ground truth and the rubric criteria.
4. IMPORTANT: you must give always a detailed feedback even if the model's response is a perfect match (Score 5). For example, explain that "The response is a perfect, character-for-character match with the ground truth, containing no errors and demonstrating perfect readability. Or is a very poor match, with many errors and unreadable text."
5. IMPORTANT The output format MUST look as follows: "(write a detailed feedback for criteria) [RESULT] (an integer number between 1 and 5)"
6. Please do not generate any other opening, closing, and explanations.

###Instruction:
{instruction}

###Model Response to evaluate (cleaned text):
{response}

###Reference Answer (score 5) (Ground Truth):
{reference_answer}

###Evaluation Rubric:
{rubric}



###Feedback: """


def initialize_prometheus_judge():
    """
    Initializes Prometheus using the modern VLLM API.
    """
    print(f"Initializing model '{PROMETHEUS_MODEL_NAME}' with the VLLM engine...")
    try:

        # We specify `dtype='float16'` for T4/A100 GPUs in Colab.
        # `tensor_parallel_size=1` is required for single-GPU environments.
        prometheus_model = VLLM(
            model=PROMETHEUS_MODEL_NAME,
            tensor_parallel_size=1,
            dtype='float16',
            trust_remote_code=True
        )

        # Pass the single VLLM model object to PrometheusEval.
        judge = PrometheusEval(
            model=prometheus_model,
            absolute_grade_template=CUSTOM_ABSOLUTE_PROMPT
        )
        print(" Prometheus judge (VLLM) initialized successfully!")
        return judge

    except Exception as e:
        print(f"Error initializing Hugging Face model with VLLM: {e}")
        raise

# The main judging function
def judge_with_prometheus(prometheus_judge, cleaned_text, ground_truth, original_ocr_text=None) -> tuple[str, int]:
    if not cleaned_text or not cleaned_text.strip():
      return "Cleaned text was empty.", 1

    # Normalize by stripping whitespace to handle cases with extra newlines (like gemini_cleaned)
    if cleaned_text.strip() == ground_truth.strip():
        return "The response is a perfect match to the ground truth, containing no errors.", 5

    instruction = (f"The following text was extracted via OCR and may contain errors: \"{original_ocr_text}\". "
                   "Please clean this text to improve its accuracy and readability." if original_ocr_text
                   else "The task was to clean a piece of text obtained from OCR.")
    try:
        # Use batch grading for better performance, even with one item
        feedbacks, scores = prometheus_judge.absolute_grade(
            instructions=[instruction],
            responses=[cleaned_text],
            rubric=SCORE_RUBRIC_FOR_OCR,
            reference_answers=[ground_truth]
        )
        feedback = feedbacks[0]
        score_str = str(scores[0])
        score_int = int(float(score_str)) if score_str.replace('.','',1).isdigit() else 1
        return feedback, score_int
    except Exception as e:
        print(f"Error during Prometheus grading: {e}"); return f"[PROMETHEUS_GRADING_ERROR: {e}]", 1


print(" Judging functions and rubric are defined.")

 Judging functions and rubric are defined.


# Italian dataset

In [3]:
# --- ITALIAN VERSION ---

OCR_RUBRIC_DATA_IT = {
  "criteria": "Con quale accuratezza e completezza il testo corretto rappresenta il testo di riferimento, minimizzando gli errori OCR e mantenendo la leggibilità?",
  "score1_description": "Pessimo: Il testo corretto è in gran parte non correlato al testo di riferimento, illeggibile, senza senso o omette ampie porzioni del contenuto originale. Contiene errori gravi e numerosi.",
  "score2_description": "Sufficiente: Il testo corretto presenta molteplici errori significativi (es. molte parole riconosciute erroneamente, formattazione errata, frasi mancanti) che lo rendono difficile da comprendere o fuorviante.",
  "score3_description": "Buono: Il testo corretto è in gran parte corretto e comprensibile, ma contiene alcuni errori OCR evidenti (es. alcune parole riconosciute erroneamente, problemi di formattazione minori, piccole omissioni/aggiunte) che non oscurano il significato generale.",
  "score4_description": "Eccellente: Il testo corretto è estremamente accurato con solo errori molto lievi (es. uno o due errori di battitura, un singolo errore di punteggiatura, lievi problemi di spaziatura) che non influenzano significativamente il significato o la leggibilità.",
  "score5_description": "Perfetto: Il testo corretto è una corrispondenza esatta o quasi esatta del testo di riferimento. È perfettamente leggibile e privo di errori OCR."
}
SCORE_RUBRIC_FOR_OCR = SCORE_RUBRIC_TEMPLATE.format(**OCR_RUBRIC_DATA_IT)

CUSTOM_ABSOLUTE_PROMPT_IT = """###Descrizione del Compito:
Sei un giudice di testi, il tuo compito è agire come un giudice imparziale e valutare la qualità del testo corretto fornito rispetto al testo di riferimento.

###Fasi di Valutazione:
1. Leggi attentamente l'istruzione, il testo di riferimento e la risposta del modello.
2. Confronta la risposta del modello con il testo di riferimento e valutane la qualità in base alla rubrica fornita.
3. Fornisci un feedback chiaro e conciso per il tuo punteggio, spiegando come la risposta del modello si allinea al testo di riferimento e ai criteri della rubrica.
4. IMPORTANTE: devi sempre fornire un feedback dettagliato anche se la risposta del modello è una corrispondenza perfetta (Punteggio 5). Ad esempio, spiega che "La risposta è una corrispondenza perfetta, carattere per carattere, con il testo di riferimento, non contiene errori e dimostra una leggibilità perfetta. Oppure, è una corrispondenza molto scarsa, con molti errori e testo illeggibile."
5. IMPORTANTE: Il formato dell'output DEVE essere il seguente: "(scrivi un feedback dettagliato per i criteri) [RISULTATO] (un numero intero tra 1 e 5)"
6. Per favore, non generare altre introduzioni, conclusioni o spiegazioni.

###Istruzione:
{instruction}

###Risposta del Modello da Valutare (testo corretto):
{response}

###Testo di Riferimento (punteggio 5) (Ground Truth):
{reference_answer}

###Rubrica di Valutazione:
{rubric}

###Feedback:"""

def initialize_prometheus_judge():
    """
    Initializes Prometheus using the modern VLLM API.
    """
    print(f"Initializing model '{PROMETHEUS_MODEL_NAME}' with the VLLM engine...")
    try:

        # We specify `dtype='float16'` for T4/A100 GPUs in Colab.
        # `tensor_parallel_size=1` is required for single-GPU environments.
        prometheus_model = VLLM(
            model=PROMETHEUS_MODEL_NAME,
            tensor_parallel_size=1,
            dtype='float16',
            trust_remote_code=True
        )

        # Pass the single VLLM model object to PrometheusEval.
        judge = PrometheusEval(
            model=prometheus_model,
            absolute_grade_template=CUSTOM_ABSOLUTE_PROMPT_IT
        )
        print(" Prometheus judge (VLLM) initialized successfully!")
        return judge

    except Exception as e:
        print(f"Error initializing Hugging Face model with VLLM: {e}")
        raise

# The main judging function
def judge_with_prometheus(prometheus_judge, cleaned_text, ground_truth, original_ocr_text=None) -> tuple[str, int]:
    if not cleaned_text or not cleaned_text.strip():
      return "Cleaned text was empty.", 1

    # Normalize by stripping whitespace to handle cases with extra newlines (like gemini_cleaned)
    if cleaned_text.strip() == ground_truth.strip():
        return "The response is a perfect match to the ground truth, containing no errors.", 5

    instruction = (f"Il seguente testo è stato estratto tramite OCR e potrebbe contenere errori. \"{original_ocr_text}\". "
                   "Per favore, correggi questo testo per migliorarne l'accuratezza e la leggibilità." if original_ocr_text
                   else "il compito è di pulire un pezzo di testo ottenuto tramite OCR.")
    try:
        # Use batch grading for better performance, even with one item
        feedbacks, scores = prometheus_judge.absolute_grade(
            instructions=[instruction],
            responses=[cleaned_text],
            rubric=SCORE_RUBRIC_FOR_OCR,
            reference_answers=[ground_truth]
        )
        feedback = feedbacks[0]
        score_str = str(scores[0])
        score_int = int(float(score_str)) if score_str.replace('.','',1).isdigit() else 1
        return feedback, score_int
    except Exception as e:
        print(f"Error during Prometheus grading: {e}"); return f"[PROMETHEUS_GRADING_ERROR: {e}]", 1


print(" Judging functions and rubric are defined.")

 Judging functions and rubric are defined.


In [4]:
# Click the button to upload your file.
# Make sure the filename exactly matches the INPUT_PATH variable from Cell 3.

try:
  # Clean up previous uploads if they exist
  if os.path.exists(INPUT_PATH):
    os.remove(INPUT_PATH)

  uploaded = files.upload()
  for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')
    # Check if the uploaded file has the expected name
    if fn != INPUT_PATH:
        print(f" Warning: Uploaded file name is '{fn}', but expected '{INPUT_PATH}'. Renaming to {INPUT_PATH}.")
        os.rename(fn, INPUT_PATH)

  if not os.path.exists(INPUT_PATH):
      raise FileNotFoundError("Upload failed or was cancelled.")

except (FileNotFoundError, Exception) as e:
    print(f"\n Error: Please run this cell again and upload your '{INPUT_PATH}' file. Details: {e}")

Saving extracted_data_ita.json to extracted_data_ita.json
User uploaded file "extracted_data_ita.json" with length 279479 bytes


In [5]:
# --- Load and Prepare Input Data ---
data_list = []
try:
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        # The input is a dictionary of dictionaries, e.g., {"1": {...}, "2": {...}}
        data_dict = json.load(f)

    # Convert the dictionary into a list of items for easier iteration,
    # while preserving the original ID.
    for item_id, item_data in data_dict.items():
        item_data['id'] = item_id
        data_list.append(item_data)

    print(f" Successfully loaded and processed {len(data_list)} items from '{INPUT_PATH}'")
except Exception as e:
    print(f" Error loading or processing JSON file: {e}")
    # data_list will be empty, preventing further errors

# --- Initialize Judge and Process Data ---
if data_list:
    prometheus_judge = None
    results_gemini, results_llama, results_mistral = [], [], []

    try:
        # Initialize the judge model ONCE for efficiency
        prometheus_judge = initialize_prometheus_judge()

        # Loop through each data item ONCE
        for i, item in enumerate(data_list, 1):
            print(f"\n- - - Processing item {i}/{len(data_list)} (ID: {item.get('id', 'N/A')}) - - -")

            # Get common fields for the current item
            ground_truth = item.get('ground_truth', '')
            original_ocr = item.get('original_ocr', None)

            if not ground_truth:
                print("  Skipping item due to missing ground truth.")
                feedback_g, score_g = "Skipped - No ground truth", None
                feedback_l, score_l = "Skipped - No ground truth", None
                feedback_m, score_m = "Skipped - No ground truth", None
            else:
                # --- Judge Gemini's output ---
                cleaned_text_gemini = item.get('gemini_cleaned', '')
                feedback_g, score_g = judge_with_prometheus(prometheus_judge, cleaned_text_gemini, ground_truth, original_ocr)
                print(f"  > Prometheus Score for Gemini: {score_g}")

                # --- Judge Llama's output ---
                cleaned_text_llama = item.get('llama_cleaned', '')
                feedback_l, score_l = judge_with_prometheus(prometheus_judge, cleaned_text_llama, ground_truth, original_ocr)
                print(f"  > Prometheus Score for Llama: {score_l}")

                # --- Judge Mistral's output ---
                cleaned_text_mistral = item.get('mistral_cleaned', '')
                feedback_m, score_m = judge_with_prometheus(prometheus_judge, cleaned_text_mistral, ground_truth, original_ocr)
                print(f"  > Prometheus Score for Mistral: {score_m}")

            # Append results to their respective lists
            results_gemini.append({**item, 'prometheus_feedback': feedback_g, 'prometheus_score': score_g})
            results_llama.append({**item, 'prometheus_feedback': feedback_l, 'prometheus_score': score_l})
            results_mistral.append({**item, 'prometheus_feedback': feedback_m, 'prometheus_score': score_m})

    except Exception as e:
        print(f"\n A critical error occurred during Prometheus processing: {e}")
    finally:
        # Clean up the model to free GPU memory
        if prometheus_judge:
            del prometheus_judge
            import torch
            torch.cuda.empty_cache()
        print("\nJudging process finished. Freeing memory.")

    # --- 3. Save All Results ---
    print("\n--- Saving Results ---")
    if results_gemini:
        with open(OUTPUT_PATH_P_GEMINI, 'w', encoding='utf-8') as f:
            json.dump(results_gemini, f, ensure_ascii=False, indent=4)
        print(f" Gemini results saved to '{OUTPUT_PATH_P_GEMINI}'")

    if results_llama:
        with open(OUTPUT_PATH_P_LLAMA, 'w', encoding='utf-8') as f:
            json.dump(results_llama, f, ensure_ascii=False, indent=4)
        print(f" Llama results saved to '{OUTPUT_PATH_P_LLAMA}'")

    if results_mistral:
        with open(OUTPUT_PATH_P_MISTRAL, 'w', encoding='utf-8') as f:
            json.dump(results_mistral, f, ensure_ascii=False, indent=4)
        print(f" Mistral results saved to '{OUTPUT_PATH_P_MISTRAL}'")

else:
    print("\nNo data to process. Please check for errors in previous cells.")

 Successfully loaded and processed 12 items from 'extracted_data_ita.json'
Initializing model 'Unbabel/M-Prometheus-3B' with the VLLM engine...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

INFO 06-22 09:44:16 [config.py:823] This model supports multiple tasks: {'score', 'generate', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.


tokenizer_config.json:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

INFO 06-22 09:44:16 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='Unbabel/M-Prometheus-3B', speculative_config=None, tokenizer='Unbabel/M-Prometheus-3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=Unbabel/M-Prometheus-3B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_cach

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 06-22 09:44:21 [cuda.py:275] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 06-22 09:44:21 [cuda.py:324] Using XFormers backend.
INFO 06-22 09:44:22 [parallel_state.py:1065] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
INFO 06-22 09:44:22 [model_runner.py:1171] Starting to load model Unbabel/M-Prometheus-3B...
INFO 06-22 09:44:23 [weight_utils.py:292] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

INFO 06-22 09:47:27 [weight_utils.py:308] Time spent downloading weights for Unbabel/M-Prometheus-3B: 184.071609 seconds


model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 06-22 09:47:58 [default_loader.py:272] Loading weights took 30.85 seconds
INFO 06-22 09:47:59 [model_runner.py:1203] Model loading took 5.7916 GiB and 215.643026 seconds
INFO 06-22 09:48:08 [worker.py:294] Memory profiling takes 9.31 seconds
INFO 06-22 09:48:08 [worker.py:294] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 06-22 09:48:08 [worker.py:294] model weights take 5.79GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 2.52GiB; the rest of the memory reserved for KV Cache is 4.90GiB.
INFO 06-22 09:48:09 [executor_base.py:113] # cuda blocks: 8928, # CPU blocks: 7281
INFO 06-22 09:48:09 [executor_base.py:118] Maximum concurrency for 32768 tokens per request: 4.36x
INFO 06-22 09:48:12 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 06-22 09:48:50 [model_runner.py:1671] Graph capturing finished in 38 secs, took 0.21 GiB
INFO 06-22 09:48:50 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 50.97 seconds
 Prometheus judge (VLLM) initialized successfully!

- - - Processing item 1/12 (ID: 1) - - -


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 5/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 6/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 7/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 8/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 9/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 10/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3830.41it/s]

  > Prometheus Score for Gemini: 1





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 5/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 6/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 7/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 8/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 9/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 10/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 1113.14it/s]

  > Prometheus Score for Llama: 1





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 5/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 6/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 7/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


Judging process finished. Freeing memory.


KeyboardInterrupt: 

In [None]:
# Download all three result files
try:
    print("\nPreparing files for download...")
    all_files_exist = True

    if os.path.exists(OUTPUT_PATH_P_GEMINI):
        print(f"Downloading '{OUTPUT_PATH_P_GEMINI}'...")
        files.download(OUTPUT_PATH_P_GEMINI)
    else:
        print(f" File '{OUTPUT_PATH_P_GEMINI}' not found for download.")
        all_files_exist = False

    if os.path.exists(OUTPUT_PATH_P_LLAMA):
        print(f"Downloading '{OUTPUT_PATH_P_LLAMA}'...")
        files.download(OUTPUT_PATH_P_LLAMA)
    else:
        print(f" File '{OUTPUT_PATH_P_LLAMA}' not found for download.")
        all_files_exist = False

    if os.path.exists(OUTPUT_PATH_P_MISTRAL):
        print(f"Downloading '{OUTPUT_PATH_P_MISTRAL}'...")
        files.download(OUTPUT_PATH_P_MISTRAL)
    else:
        print(f" File '{OUTPUT_PATH_P_MISTRAL}' not found for download.")
        all_files_exist = False

    if not all_files_exist:
         print("\nSome output files were not created. Check the output of the previous cell for errors.")

except NameError:
    print(f" It seems the output file variables are not defined. Check for errors in Step 2.")
except Exception as e:
    print(f" An error occurred during download: {e}")