In [1]:
# Cell 1: Correct Installation (with vllm)

# 1. Upgrade pip to prevent installer bugs.
!pip install -q --upgrade pip

# 2. Install the required libraries. This gets the latest available version
#    of prometheus-eval and adds the VLLM engine for local inference.
!pip install -q "torch>=2.3" "transformers>=4.40.0" accelerate bitsandbytes "tiktoken>=0.7.0" prometheus-eval "vllm>=0.4.0" google-generativeai "huggingface_hub[hf_xet]"

print("✅ Dependencies installed successfully.")
print("‼️ PLEASE RESTART THE RUNTIME NOW before running any other cells. ‼️")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m68.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m163.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m394.6/394.6 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m135.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m393.1/393.1 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m169.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Cell 2: Verification (NEW CELL)
import torch
import accelerate
print("--- Verification ---")
print(f"PyTorch version: {torch.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"GPU is available: {torch.cuda.is_available()}")

print("--------------------")
print("✅ Verification successful. You can now proceed with the rest of the notebook.")

--- Verification ---
PyTorch version: 2.7.0+cu126
Accelerate version: 1.7.0
GPU is available: True
--------------------
✅ Verification successful. You can now proceed with the rest of the notebook.


In [2]:
# 1. Verify GPU is available
!nvidia-smi

# 2. Import all necessary libraries
import os
import json
from google.colab import userdata, files
import google.generativeai as genai

# Transformers and BitsAndBytes for model loading
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

# Prometheus-eval imports for the modern API
# This will only work AFTER you have run the install cell AND restarted the runtime.
from prometheus_eval import PrometheusEval
from prometheus_eval.vllm import VLLM
from prometheus_eval.prompts import ABSOLUTE_PROMPT, SCORE_RUBRIC_TEMPLATE


print("\n✅ GPU detected and libraries imported successfully.")
print("If this cell ran without errors, your environment is correctly set up. You can now proceed.")

Thu Jun 12 14:57:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   37C    P8              9W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [9]:
# --- Secret Management ---
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    genai.configure(api_key=GOOGLE_API_KEY)
    print("🔑 Google API Key loaded successfully.")
except Exception as e:
    print(f"An error occurred while configuring the API key: {e}")

# --- Model Configuration ---
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"
# Set the model name to the ORIGINAL, full-precision model.
# We will quantize it to 8-bit on the fly during loading.
PROMETHEUS_MODEL_NAME = "Unbabel/M-Prometheus-3B"

# --- File Paths ---
INPUT_PATH = "cleaning_results.json"
OUTPUT_PATH = "judging_results.json"

# --- Choose Your Judge ---
JUDGE_CHOICE = "prometheus"

print(f"\nConfiguration:")
print(f"  - Judge Model: {JUDGE_CHOICE.upper()} ({PROMETHEUS_MODEL_NAME})")
print(f"  - Quantization: 8-bit with bitsandbytes")
print(f"  - Input File: '{INPUT_PATH}'")
print(f"  - Output File: '{OUTPUT_PATH}'")

An error occurred while configuring the API key: Secret GOOGLE_API_KEY does not exist.

Configuration:
  - Judge Model: PROMETHEUS (Unbabel/M-Prometheus-3B)
  - Quantization: 8-bit with bitsandbytes
  - Input File: 'cleaning_results.json'
  - Output File: 'judging_results.json'


In [10]:

OCR_RUBRIC_DATA = {
  "criteria": "How accurately and completely does the cleaned text represent the ground truth, minimizing OCR errors and maintaining readability?",
  "score1_description": "Poor: The cleaned text is mostly unrelated to the ground truth, unreadable, nonsensical, or omits vast portions of the original content. Contains severe and numerous errors.",
  "score2_description": "Fair: The cleaned text has multiple significant errors (e.g., many misrecognized words, incorrect formatting, missing phrases) that make it difficult to understand or misleading.",
  "score3_description": "Good: The cleaned text is largely correct and understandable but contains some noticeable OCR errors (e.g., a few misrecognized words, minor formatting issues, small omissions/additions) that don't obscure the overall meaning.",
  "score4_description": "Excellent: The cleaned text is highly accurate with only very minor errors (e.g., one or two typos, a single punctuation mistake, slight spacing issues) that do not affect meaning or readability significantly.",
  "score5_description": "Perfect: The cleaned text is an exact or near-exact match to the ground truth. It is perfectly readable and free of OCR errors."
}
SCORE_RUBRIC_FOR_OCR = SCORE_RUBRIC_TEMPLATE.format(**OCR_RUBRIC_DATA)

# ----------------- LLM Judge Functions -----------------

def judge_with_gemini(gemini_model, gemini_cleaned: str, ground_truth: str) -> str:
    """Judges the quality of text using a pre-initialized Gemini model."""
    if not gemini_cleaned or not gemini_cleaned.strip(): return "0"
    prompt = f"""Evaluate the quality of the "cleaned text" against the "ground truth" reference on a scale of 0-5. 5 is a perfect match, 0 is empty/unrelated.
[GROUND TRUTH]: {ground_truth}
[CLEANED TEXT]: {gemini_cleaned}
Return ONLY the integer score (0-5)."""
    try:
        response = gemini_model.generate_content(prompt)
        score = response.text.strip()
        if score.isdigit() and 0 <= int(score) <= 5: return score
        else: return "0"
    except Exception as e:
        print(f"Error during Gemini judging: {e}"); return "[GEMINI_API_ERROR]"

def initialize_prometheus_judge():
    """
    Initializes Prometheus using the modern VLLM API.
    """
    print(f"Initializing model '{PROMETHEUS_MODEL_NAME}' with the VLLM engine...")
    try:
        # The VLLM wrapper handles model loading and quantization.
        # `quantization='awq'` or `'gptq'` are options, but for 8-bit,
        # vLLM handles it best by managing memory automatically.
        # We specify `dtype='bfloat16'` for T4/A100 GPUs in Colab.
        # `tensor_parallel_size=1` is required for single-GPU environments.
        prometheus_model = VLLM(
            model=PROMETHEUS_MODEL_NAME,
            tensor_parallel_size=1,
            dtype='float16',
            trust_remote_code=True
        )

        # Pass the single VLLM model object to PrometheusEval.
        judge = PrometheusEval(
            model=prometheus_model,
            absolute_grade_template=ABSOLUTE_PROMPT
        )
        print("✅ Prometheus judge (VLLM) initialized successfully!")
        return judge

    except Exception as e:
        print(f"Error initializing Hugging Face model with VLLM: {e}")
        raise

# The main judging function remains the same
def judge_with_prometheus(prometheus_judge, cleaned_text, ground_truth, original_ocr_text=None) -> tuple[str, int]:
    if not cleaned_text or not cleaned_text.strip(): return "Cleaned text was empty.", 1
    instruction = (f"The following text was extracted via OCR and may contain errors: \"{original_ocr_text}\". "
                   "Please clean this text to improve its accuracy and readability." if original_ocr_text
                   else "The task was to clean a piece of text obtained from OCR.")
    try:
        # Use batch grading for better performance, even with one item
        feedbacks, scores = prometheus_judge.absolute_grade(
            instructions=[instruction],
            responses=[cleaned_text],
            rubric=SCORE_RUBRIC_FOR_OCR,
            reference_answers=[ground_truth]
        )
        feedback = feedbacks[0]
        score_str = str(scores[0])
        score_int = int(float(score_str)) if score_str.replace('.','',1).isdigit() else 1
        return feedback, score_int
    except Exception as e:
        print(f"Error during Prometheus grading: {e}"); return f"[PROMETHEUS_GRADING_ERROR: {e}]", 1


print("✅ Judging functions and rubric are defined.")

✅ Judging functions and rubric are defined.


In [5]:
# Click the button to upload your file.
# Make sure the filename exactly matches the INPUT_PATH variable from Cell 3.

try:
  # Clean up previous uploads if they exist
  if os.path.exists(INPUT_PATH):
    os.remove(INPUT_PATH)

  uploaded = files.upload()
  for fn in uploaded.keys():
    print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')
    # Check if the uploaded file has the expected name
    if fn != INPUT_PATH:
        print(f"⚠️ Warning: Uploaded file name is '{fn}', but expected '{INPUT_PATH}'. Renaming to {INPUT_PATH}.")
        os.rename(fn, INPUT_PATH)

  if not os.path.exists(INPUT_PATH):
      raise FileNotFoundError("Upload failed or was cancelled.")

except (FileNotFoundError, Exception) as e:
    print(f"\n🛑 Error: Please run this cell again and upload your '{INPUT_PATH}' file. Details: {e}")

Saving cleaning_results.json to cleaning_results.json
User uploaded file "cleaning_results.json" with length 9374 bytes


In [11]:
# --- 1. Load Input Data ---
try:
    with open(INPUT_PATH, 'r', encoding='utf-8') as f:
        data_dict = json.load(f)
    print(f"✅ Successfully loaded {len(data_dict)} items from '{INPUT_PATH}'")
except Exception as e:
    print(f"🛑 Error loading JSON file: {e}")
    data_dict = [] # Prevent further errors

# --- 2. Initialize Judge and Process Data ---
results = []
if data_dict:
    if JUDGE_CHOICE == "gemini":
        print("\nInitializing Gemini judge...")
        gemini_model = genai.GenerativeModel(GEMINI_MODEL_NAME)
        for i, item in enumerate(data_dict, 1):
            print(f"Processing item {i}/{len(data_dict)} with Gemini...")
            score = judge_with_gemini(gemini_model, item.get('gemini_cleaned', ''), item.get('ground_truth', ''))
            results.append({**item, 'gemini_judge_score': score})

    elif JUDGE_CHOICE == "prometheus":
        try:
            prometheus_judge = initialize_prometheus_judge()
            for i, item in enumerate(data_dict, 1):
                print(f"\nProcessing item {i}/{len(data_dict)} (ID: {item.get('id', 'N/A')}) with Prometheus...")
                cleaned_text = item.get('gemini_cleaned', '')
                ground_truth = item.get('ground_truth', '')
                original_ocr = item.get('original_ocr', None)

                if not ground_truth:
                    feedback, score = "Skipped - No ground truth", None
                else:
                    feedback, score = judge_with_prometheus(prometheus_judge, cleaned_text, ground_truth, original_ocr)

                print(f"  > Prometheus Score: {score}")
                results.append({**item, 'prometheus_feedback': feedback, 'prometheus_score': score})
        except Exception as e:
            print(f"🛑 A critical error occurred during Prometheus processing: {e}")

# --- 3. Save Results to a file in Colab ---
if results:
    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"\n\n🎉 Judging complete! Results saved to '{OUTPUT_PATH}' in the Colab environment.")
else:
    print("\nNo results were generated. Please check for errors in previous cells.")

✅ Successfully loaded 6 items from 'cleaning_results.json'
Initializing model 'Unbabel/M-Prometheus-3B' with the VLLM engine...


config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

INFO 06-12 15:11:04 [config.py:823] This model supports multiple tasks: {'generate', 'embed', 'reward', 'score', 'classify'}. Defaulting to 'generate'.


tokenizer_config.json:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

INFO 06-12 15:11:04 [llm_engine.py:230] Initializing a V0 LLM engine (v0.9.1) with config: model='Unbabel/M-Prometheus-3B', speculative_config=None, tokenizer='Unbabel/M-Prometheus-3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=None, served_model_name=Unbabel/M-Prometheus-3B, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_c

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

INFO 06-12 15:11:09 [model_runner.py:1171] Starting to load model Unbabel/M-Prometheus-3B...
INFO 06-12 15:11:10 [weight_utils.py:292] Using model weights format ['*.safetensors']


model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

INFO 06-12 15:17:17 [weight_utils.py:308] Time spent downloading weights for Unbabel/M-Prometheus-3B: 367.122579 seconds


model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 06-12 15:17:44 [default_loader.py:272] Loading weights took 26.32 seconds
INFO 06-12 15:17:44 [model_runner.py:1203] Model loading took 5.7838 GiB and 394.143102 seconds
INFO 06-12 15:17:53 [worker.py:294] Memory profiling takes 8.64 seconds
INFO 06-12 15:17:53 [worker.py:294] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.90) = 13.27GiB
INFO 06-12 15:17:53 [worker.py:294] model weights take 5.78GiB; non_torch_memory takes 0.03GiB; PyTorch activation peak memory takes 3.06GiB; the rest of the memory reserved for KV Cache is 4.40GiB.
INFO 06-12 15:17:54 [executor_base.py:113] # cuda blocks: 8003, # CPU blocks: 7281
INFO 06-12 15:17:54 [executor_base.py:118] Maximum concurrency for 32768 tokens per request: 3.91x
INFO 06-12 15:17:57 [model_runner.py:1513] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 06-12 15:18:34 [model_runner.py:1671] Graph capturing finished in 36 secs, took 0.21 GiB
INFO 06-12 15:18:34 [llm_engine.py:428] init engine (profile, create kv cache, warmup model) took 49.19 seconds
✅ Prometheus judge (VLLM) initialized successfully!

Processing item 1/6 (ID: N/A) with Prometheus...


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 1995.39it/s]

  > Prometheus Score: 5

Processing item 2/6 (ID: N/A) with Prometheus...





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 5/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 6/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 7/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 2880.70it/s]

  > Prometheus Score: 1

Processing item 3/6 (ID: N/A) with Prometheus...





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 2706.00it/s]

  > Prometheus Score: 5

Processing item 4/6 (ID: N/A) with Prometheus...





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 5/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 6/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 7/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 8/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 2365.65it/s]

  > Prometheus Score: 1

Processing item 5/6 (ID: N/A) with Prometheus...





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 2/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 3/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 4/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3137.10it/s]

  > Prometheus Score: 5

Processing item 6/6 (ID: N/A) with Prometheus...





Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Retrying failed batches: Attempt 1/10


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed 1/1 instances.


Finalizing: 100%|██████████| 1/1 [00:00<00:00, 3196.88it/s]

  > Prometheus Score: 3


🎉 Judging complete! Results saved to 'judging_results.json' in the Colab environment.





In [12]:
try:
    print(f"Preparing '{OUTPUT_PATH}' for download...")
    files.download(OUTPUT_PATH)
except NameError:
    print(f"It seems the output file was not created. Check for errors in the previous step.")
except FileNotFoundError:
    print(f"Could not find the file '{OUTPUT_PATH}' to download. Did the previous step run correctly?")

Preparing 'judging_results.json' for download...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>