In [3]:
# Ensure this is a CODE cell in Colab

# @title 1. Setup: Install Libraries
# Install required libraries
!pip install google-generativeai python-dotenv jiwer evaluate rouge_score groq --quiet
print("Libraries installed successfully.")

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.6/129.6 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
Libraries installed successfully.


In [4]:
from google.colab import drive
drive.mount('/content/drive')
DRIVE_PATH = "/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import json
import os
import re
import time
from google import genai
from dotenv import load_dotenv
import jiwer
import evaluate
import difflib
from groq import Groq

try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    if not GOOGLE_API_KEY:
        raise ValueError("API Key not found in Colab Secrets. Please add it or use Option 2.")
    print("GOOGLE_API_KEY loaded from Colab Secrets.")
except Exception as e:
    print(f"Could not load API key from Colab Secrets: {e}")
    print("Please use Option 2 below if you haven't set up the secret, OR ensure the secret name is 'GOOGLE_API_KEY'.")
    GOOGLE_API_KEY = None # Ensure it's None if not found

try:
    rouge_metric_evaluator = evaluate.load("rouge")
    print("ROUGE metric evaluator loaded.")
except Exception as e:
    print(f"Error loading ROUGE metric: {e}")
    rouge_metric_evaluator = None

Could not load API key from Colab Secrets: name 'userdata' is not defined
Please use Option 2 below if you haven't set up the secret, OR ensure the secret name is 'GOOGLE_API_KEY'.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE metric evaluator loaded.


In [11]:
GEMINI_MODEL_NAME = "gemini-1.5-flash-latest"
INPUT_PATH = "/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/dataset/eng/the_vampyre_subset.json"

OUTPUT_PATH_CLEANING_ONLY = "/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/results/cleaning_results.json" # From cleaner_LLM.py
OUTPUT_PATH_JUDGING_ONLY = "/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/results/judging_results.json" # From judge_LLM.py
OUTPUT_PATH_PIPELINE = "/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/results/full_pipeline_results.json" # From main.py

# Set to a small number for testing, or None to process all items in the dataset
NUM_ITEM_TO_PROCESS = 6 # Adjusted for the small dummy dataset
# NUM_ITEM_TO_PROCESS = None # To process all items

print("Configuration set.")
print(f"GEMINI_MODEL_NAME: {GEMINI_MODEL_NAME}")
print(f"INPUT_PATH: {INPUT_PATH}")
print(f"OUTPUT_PATH_PIPELINE: {OUTPUT_PATH_PIPELINE}")
print(f"NUM_ITEM_TO_PROCESS: {NUM_ITEM_TO_PROCESS if NUM_ITEM_TO_PROCESS is not None else 'All'}")

Configuration set.
GEMINI_MODEL_NAME: gemini-1.5-flash-latest
INPUT_PATH: /content/drive/MyDrive/Prompto_Ergo_Sum_HW2/dataset/eng/the_vampyre_subset.json
OUTPUT_PATH_PIPELINE: /content/drive/MyDrive/Prompto_Ergo_Sum_HW2/results/full_pipeline_results.json
NUM_ITEM_TO_PROCESS: 6


In [6]:
def clean_with_gemini(client: genai.Client, ocr_text: str) -> str:
    """Cleans OCR text using Google Gemini (Client API style)."""
    if not ocr_text.strip():
        return ""

    #try:
    #    # Initialize the client. Passing api_key explicitly is robust.
    #    client = genai.Client(api_key=GOOGLE_API_KEY)
    #except Exception as e:
    #    print(f"Error initializing Gemini Client (genai.Client): {e}")
    #    print("This could be due to an invalid API key, network issues, or problems with the 'google-generativeai' library.")
    #    return f"[GEMINI_CLIENT_INIT_ERROR: {e}]"

    contents = f"""Clean the following OCR text. Correct spelling errors, fix punctuation, remouve also the \n present in the text. Preserve the original
    meaning and style. Do not add new information or summarize. Return only the cleaned text.

OCR Text:
---
{ocr_text}
---
Cleaned Text:
"""
    response = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=contents
    )
    #print(response.text)
    return response.text

def clean_with_groq(client: Groq, ocr_text: str, model_id: str) -> str:
    """Cleans OCR text using a model from the Groq API."""
    if not ocr_text.strip():
        return ""

    # This chat-based prompt works well for modern instruct models like Llama3 and Mixtral
    messages = [
        {
            "role": "system",
            "content": "Clean the following OCR text. Correct spelling errors, fix punctuation, remouve also the \\n present in the text. Preserve the original meaning and style. Do not add new information or summarize. Return only the cleaned text.",
        },
        {
            "role": "user",
            "content": f"OCR Text:\n---\n{ocr_text}\n---\nCleaned Text:",
        }
    ]

    try:
        chat_completion = client.chat.completions.create(
            messages=messages,
            model=model_id,
            temperature=0.1,  # Lower temperature for more deterministic output
            max_tokens=2048,
        )
        cleaned_text = chat_completion.choices[0].message.content.strip()
        return cleaned_text
    except Exception as e:
        return f"[GROQ_API_ERROR: {e}]"

def judge_with_gemini(client: genai.Client, gemini_cleaned: str, ground_truth: str) -> str:
    """Judges the quality of Gemini-generated text using a pre-initialized Gemini client."""

    # Handle empty input gracefully
    if not gemini_cleaned or not gemini_cleaned.strip():
        return "0" # Assign a score of 0 if the cleaned text is empty

    #try:
    #    # Initialize the client. Passing api_key explicitly is robust.
    #    client = genai.Client(api_key=GOOGLE_API_KEY)
    #except Exception as e:
    #    print(f"Error initializing Gemini Client (genai.Client): {e}")
    #    print("This could be due to an invalid API key, network issues, or problems with the 'google-generativeai' library.")
    #    return f"[GEMINI_CLIENT_INIT_ERROR: {e}]"

    contents = f"""Evaluate the quality of the "cleaned text" against the "ground truth" reference.
Provide a score from 0 to 5 based on the following scale:
5: Perfect. The cleaned text fully and accurately matches the ground truth.
4: Excellent. Very minor errors (e.g., one or two typos, a single punctuation mistake) that do not affect meaning.
3: Good. Some errors persist (e.g., a few OCR mistakes) but the overall meaning is clear and correct.
2: Fair. Multiple issues make the text difficult to understand or it contains misleading information.
1: Poor. Unacceptable quality; the output is mostly unrelated, unreadable, or nonsensical.
0: Empty/No Output. The cleaned text was empty.

---
[GROUND TRUTH]:
{ground_truth}
---
[CLEANED TEXT]:
{gemini_cleaned}
---

Return ONLY the integer score (0-5) and nothing else.
"""

    response = client.models.generate_content(
        model=GEMINI_MODEL_NAME,
        contents=contents
    )
    #print(response.text)
    return response.text


In [13]:
def parse_score(response_text: str) -> int:
    """Extracts the first integer from the judge's response for robustness."""
    numbers = re.findall(r'\d+', response_text)
    return int(numbers[0]) if numbers else -1 # -1 indicates a parsing error

def calculate_metrics(reference: str, hypothesis: str) -> dict:
    """Calculates WER and CER, handling edge cases."""
    if not reference.strip() and not hypothesis.strip():
        return {"wer": 0.0, "cer": 0.0}
    if not reference.strip() or not hypothesis.strip():
        return {"wer": 1.0, "cer": 1.0}

    return {
        "wer": jiwer.wer(reference, hypothesis),
        "cer": jiwer.cer(reference, hypothesis)
    }

MODELS_TO_RUN = [
    {
        "name": "Gemini-1.5-Flash",
        "type": "gemini",
        "function": clean_with_gemini,
        "model_id_or_client": None  # Gemini uses a global client
    },
    {
        "name": "Groq Llama-3-8B-Instruct",
        "type": "groq",
        "function": clean_with_groq,
        "model_id_or_client": "llama-3.3-70b-versatile"
    }
]

def get_detailed_diffs(text1: str, text2: str) -> list[dict]:
    """
    Compares two strings word by word and returns a list of dictionaries
    highlighting the specific differing words.
    """
    # Split texts into lists of words. We use a regex to better handle
    # punctuation, treating it as a separate "word".
    import re
    words1 = re.findall(r'\w+|[^\w\s]', text1)
    words2 = re.findall(r'\w+|[^\w\s]', text2)

    s = difflib.SequenceMatcher(None, words1, words2)
    diffs = []

    for tag, i1, i2, j1, j2 in s.get_opcodes():
        if tag == 'equal':
            continue

        diff_item = {
            "type": tag,
            # Join the words to recreate the original text slice
            "ground_truth_slice": " ".join(words1[i1:i2]),
            "model_cleaned_slice": " ".join(words2[j1:j2])
        }
        diffs.append(diff_item)

    return diffs

print("Metrics and parsing functions defined.")

Metrics and parsing functions defined.


In [1]:
def main():
    """
    Main function to run the complete clean, evaluate, and judge pipeline.
    """
    # --- Initialize APIs (ONCE) ---
    try:
        if not GOOGLE_API_KEY:
            raise ValueError("GOOGLE_API_KEY is not set. Please check the setup cell (Cell 3) and your Colab Secrets.")
        gemini_client = genai.Client(api_key=GOOGLE_API_KEY)

        if not GROQ_API_KEY:
            raise ValueError("GROQ_API_KEY is not set. Please check the setup cell (Cell 3) and your Colab Secrets.")
        groq_client = Groq(api_key=GROQ_API_KEY)

        rouge_metric = evaluate.load("rouge")
        print("Successfully initialized Gemini Client and ROUGE metric evaluator.")
    except Exception as e:
        print(f"Fatal Error during initialization: {e}")
        return

    # --- Load Data ---
    try:
        with open(INPUT_PATH, 'r', encoding='utf-8') as f:
            data_dict = json.load(f)
        print(f"Successfully loaded {len(data_dict)} items from '{INPUT_PATH}'")
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading input file: {e}")
        return

    list_of_items = list(data_dict.values())
    subset_to_process = list_of_items[:NUM_ITEM_TO_PROCESS] if NUM_ITEM_TO_PROCESS is not None else list_of_items

    all_results = []
    print(f"\nStarting pipeline for {len(subset_to_process)} items across {len(MODELS_TO_RUN)} models...")

    # --- Process Each Item in the Dataset ---
    for i, item in enumerate(subset_to_process):
        print(f"\n{'='*20} Processing item {i+1}/{len(subset_to_process)} {'='*20}")

        ocr_text = item.get('ocr', '')
        ground_truth = item.get('clean', '')

        # Structure to save all results for this item
        item_result = {
            "item_id": i + 1,
            "original_ocr": ocr_text,
            "ground_truth": ground_truth,
            "model_outputs": []
        }

        # --- Loop through each configured model ---
        for model_config in MODELS_TO_RUN:
            model_name = model_config["name"]
            cleaning_function = model_config["function"]
            print(f"\n--- Running on model: {model_name} ---")

            # Step 1: Clean the text
            print("1. Cleaning text...")
            cleaned_text = "[ERROR: Unknown model type in config]"
            if model_config["type"] == "gemini":
                cleaned_text = cleaning_function(gemini_client, ocr_text)
            elif model_config["type"] == "huggingface":
                model_id = model_config["model_id_or_client"]
                cleaned_text = cleaning_function(model_id, ocr_text)
            elif model_config["type"] == "groq":
                model_id = model_config["model_id_or_client"]
                cleaned_text = cleaning_function(groq_client, ocr_text, model_id)

            if "[ERROR:" in cleaned_text or "[GEMINI_" in cleaned_text or "[HUGGINGFACE_" in cleaned_text or "[GROQ_" in cleaned_text:
                print(f"  -> Skipping further processing for {model_name} due to cleaning error: {cleaned_text}")
                model_run_result = {"model_name": model_name, "cleaned_text": cleaned_text, "metrics": None, "judgement": None, "differences": []}
                item_result["model_outputs"].append(model_run_result)
                time.sleep(1) # Still sleep to avoid hammering a failing API
                continue

            # Step 2: Evaluate with quantitative metrics
            print("2. Calculating WER, CER, and ROUGE metrics...")
            edit_metrics = calculate_metrics(ground_truth, cleaned_text)
            rouge_scores = rouge_metric.compute(predictions=[cleaned_text], references=[ground_truth])

            # Step 3: Judge the quality with Gemini (using Gemini as the standard judge for all)
            print("3. Judging quality with Gemini...")
            raw_judgement = judge_with_gemini(gemini_client, cleaned_text, ground_truth)
            parsed_judgement_score = parse_score(raw_judgement)

            # Step 4: Get detailed differences
            detailed_differences = get_detailed_diffs(ground_truth, cleaned_text)

            # Step 5: Aggregate all information for this model run
            model_run_result = {
                "model_name": model_name,
                "cleaned_text": cleaned_text,
                "metrics": {
                    "wer": edit_metrics['wer'],
                    "cer": edit_metrics['cer'],
                    "rouge": rouge_scores
                },
                "judgement": {
                    "score": parsed_judgement_score,
                    "raw_score_text": raw_judgement.strip()
                },
                "differences": detailed_differences
            }
            item_result["model_outputs"].append(model_run_result)

            print(f"  -> Metrics: WER={edit_metrics['wer']:.4f}, CER={edit_metrics['cer']:.4f}")
            print(f"  -> Judgement: Score={parsed_judgement_score}")

            # Avoid hitting API rate limits
            time.sleep(1)

        all_results.append(item_result)

    # --- Save Final Combined Results ---
    print("\n--- Pipeline Complete ---")
    output_dir = os.path.dirname(OUTPUT_PATH_PIPELINE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    try:
        with open(OUTPUT_PATH_PIPELINE, "w", encoding="utf-8") as outfile:
            json.dump(all_results, outfile, indent=2, ensure_ascii=False)
        print(f"\nAll processed data and results saved to: {OUTPUT_PATH_PIPELINE}")
    except IOError as e:
        print(f"\nError saving final results file: {e}")

    return all_results

print("Main pipeline function defined.")

IndentationError: expected an indented block after 'if' statement on line 11 (<ipython-input-1-2361511132>, line 12)

In [15]:
if __name__ == "__main__":
    final_results = main()

    if final_results:
        print("\nScript finished successfully.")
    else:
        print("\nScript finished with an error.")

Successfully initialized Gemini Client and ROUGE metric evaluator.
Successfully loaded 24 items from '/content/drive/MyDrive/Prompto_Ergo_Sum_HW2/dataset/eng/the_vampyre_subset.json'

Starting pipeline for 6 items...

--- Processing item 1/6 ---
1. Cleaning text with Gemini...
2. Calculating WER, CER, and ROUGE metrics...
3. Judging quality with Gemini...
  -> Metrics: WER=0.0000, CER=0.0000
  -> Judgement: Score=5 (Raw: '5
')

--- Processing item 2/6 ---
1. Cleaning text with Gemini...
2. Calculating WER, CER, and ROUGE metrics...
3. Judging quality with Gemini...
  -> Metrics: WER=0.0000, CER=0.0000
  -> Judgement: Score=5 (Raw: '5
')

--- Processing item 3/6 ---
1. Cleaning text with Gemini...
2. Calculating WER, CER, and ROUGE metrics...
3. Judging quality with Gemini...
  -> Metrics: WER=0.0000, CER=0.0000
  -> Judgement: Score=5 (Raw: '5
')

--- Processing item 4/6 ---
1. Cleaning text with Gemini...
2. Calculating WER, CER, and ROUGE metrics...
3. Judging quality with Gemini...
