In [None]:
# TODO gemini details

In [None]:
# Install the required libraries
%pip install -q -U google-generativeai pandas

# Toggle read by the model-listing cell further down: when True, that cell
# prints the Gemini models whose supported methods include 'generateContent'.
search_for_models = False

In [None]:

# Scoring rubric for the LLM judge: a 1-5 scale, from "Completely Unacceptable"
# (1) to "Perfect Translation" (5). The text is interpolated into the judge
# prompts, so any edit here changes the instructions the judge model receives.
JUDGE_RUBRIC = """
You are evaluating the quality of a modern Italian translation based on an original archaic Italian sentence.
Please score the "Candidate Modern Translation" using the following 1-5 scale:

1.  **Completely Unacceptable Translation:**
    *   The translation has no pertinence with the original meaning.
    *   The generated sentence is either gibberish, makes no sense, or is completely unrelated to the archaic text.
    *   Contains severe errors that render it incomprehensible or entirely misleading.

2.  **Severe Semantic Errors/Omissions:**
    *   The translation contains significant semantic errors, critical omissions of meaning from the archaic text, or substantial incorrect additions.
    *   While some words might be recognizable, the core meaning is lost or heavily distorted.
    *   The modernization is poor, leaving many archaic forms or incorrectly modernizing them.
    *   Likely many grammatical errors in modern Italian.

3.  **Partially Wrong Translation / Lackluster:**
    *   The translation captures some of the original meaning but is lackluster or contains noticeable errors.
    *   Errors are mostly minor (e.g., awkward phrasing, typos, minor grammatical mistakes in modern Italian, some less critical semantic misunderstandings or misinterpretations of archaic terms).
    *   Some archaic features might be awkwardly modernized or missed.
    *   The overall quality is mediocre; it's understandable but clearly flawed.

4.  **Good Translation:**
    *   The translation is mostly accurate and successfully conveys the core meaning of the archaic sentence.
    *   It is substantially faithful to the original text.
    *   The modern Italian is fluent and comprehensible.
    *   Archaic features are generally well modernized.
    *   There might be minor stylistic imperfections (e.g., style doesn't perfectly match natural modern Italian, slight awkwardness) or very minor errors that do not significantly impact understanding or meaning.

5.  **Perfect Translation:**
    *   The translation is completely accurate, fully conveying the meaning and nuances of the original archaic sentence.
    *   It is perfectly fluent, natural-sounding, and grammatically correct modern Italian.
    *   All archaic linguistic features (vocabulary, syntax, orthography) are correctly and appropriately modernized.
    *   The style is appropriate for modern Italian.
    *   No errors.
"""

# Simple confirmation that the cell ran.
print("Rubric Defined.")


def create_judge_prompt(archaic_text, modern_translation, rubric):
    """
    Build the single-shot evaluation prompt for the LLM judge.

    The returned text states the evaluator role, embeds `rubric` between the
    RUBRIC START/END markers, quotes the archaic sentence and the candidate
    translation, and ends with a "Score:" cue asking for a bare 1-5 integer.
    """
    # NOTE: every non-blank line after the first carries 16 leading spaces
    # inside the literal, so that indentation is part of the prompt text.
    template = """You are an expert evaluator specializing in the translation of archaic Italian to modern Italian.
                Your task is to assess the quality of the "Candidate Modern Translation" provided below, based on the "Original Archaic Sentence".

                Please use the following detailed 1-5 scale and rubric for your evaluation:
                --- RUBRIC START ---
                {rubric}
                --- RUBRIC END ---

                Original Archaic Sentence:
                "{archaic_text}"

                Candidate Modern Translation:
                "{modern_translation}"

                Carefully consider the rubric. Based on your assessment, provide a single integer score from 1 to 5 that best reflects the quality of the "Candidate Modern Translation".
                Output ONLY the integer score. Do not add any explanation, prefix, or other text.

                Score:"""
    # Values are substituted as-is; braces inside them are not re-interpreted.
    return template.format(
        rubric=rubric,
        archaic_text=archaic_text,
        modern_translation=modern_translation,
    )

print("Judge Prompting Function Defined.")

def format_individual_judgment_request(archaic_text, modern_translation):
    """
    Render one archaic/modern sentence pair as a follow-up chat message.

    Only the pair itself and a trailing "Score:" cue are included; the rubric
    and the scoring instructions are expected to be in the chat history already.
    """
    message_lines = [
        "",  # the message starts with a newline
        "Now, please evaluate the following pair:",
        "",
        "Original Archaic Sentence:",
        f'"{archaic_text}"',
        "",
        "Candidate Modern Translation:",
        f'"{modern_translation}"',
        "",
        "Score:",
    ]
    return "\n".join(message_lines)

print("Individual judgment request formatter defined.")

Configure the API, the models, and the prompt names

In [None]:

# Optionally list the Gemini models that can serve 'generateContent' requests.
# Runs only when `search_for_models` is True.
# NOTE(review): genai.configure(api_key=...) is called in a later cell;
# presumably list_models() needs credentials, so confirm the run order before
# enabling this.
if search_for_models:
    import google.generativeai as genai

    print("Available Gemini models that supports 'generateContent':")
    content_capable_models = (
        model_info
        for model_info in genai.list_models()
        if 'generateContent' in model_info.supported_generation_methods
    )
    for model_info in content_capable_models:
        print(f"- Name: {model_info.name}, Display Name: {model_info.display_name}, Description: {model_info.description}")

In [None]:
# This cell sets up Gemini and prepares to loop through the translation CSVs

import google.generativeai as genai
import pandas as pd
import os
import time
import re
import configparser 
import json


# --- CONFIGURATION ---
# Purpose of this section: load the Gemini API key, configure the SDK, read the
# list of translation models/prompts to judge, and build the judge model object.
CONFIG_API_FILE_PATH = "gemini_config.ini"  # INI file read for the key: section [GEMINI], option API_KEY
API_KEY = None
CONFIG_FILE_PATH = "config.json"  # JSON file read for the 'models' and 'prompts' lists

# Read the API key from the Gemini INI file. Problems are only reported here;
# the `if not API_KEY` check below is what actually halts execution.
config = configparser.ConfigParser()
if os.path.exists(CONFIG_API_FILE_PATH):
    try:
        config.read(CONFIG_API_FILE_PATH)
        # fallback=None turns a missing section/option into None instead of an exception.
        API_KEY = config.get('GEMINI', 'API_KEY', fallback=None)
    except Exception as e:
        print(f"ERROR: Could not read API key from {CONFIG_API_FILE_PATH}. Error: {e}")
else:
    print(f"ERROR: Configuration file {CONFIG_API_FILE_PATH} not found.")

if not API_KEY:
    print(f"ERROR: Gemini API Key not found or not set correctly in {CONFIG_API_FILE_PATH}.")
    raise ValueError("Gemini API Key not configured. Halting execution.")

try:
    genai.configure(api_key=API_KEY)
    print("Gemini API Key configured successfully.")
except Exception as e:
    print(f"ERROR: Failed to configure Gemini API with the provided key. Error: {e}")
    # Chain the original exception so the traceback keeps the real cause.
    raise ValueError("Gemini API Key configuration failed. Halting execution.") from e

JUDGE_MODEL_NAME = "models/gemini-1.5-flash-latest"  # Gemini model used as the judge
TRANSLATION_FILES_DIR = "translations/"  # Directory searched for the translation CSV files
ARCHAIC_SENTENCE_COLUMN_IN_TRANSLATION_FILES = "Sentence"  # Column name of archaic sentences in translation CSVs
TRANSLATION_COLUMN_IN_TRANSLATION_FILES = "translation"  # Column name of the LLM's translation in translation CSVs

# Models and prompt types used with Ollama (they determine which CSV files are looked up).
# Both lists default to empty so the notebook keeps running, with an explicit
# warning, when the JSON config is missing or malformed.
ollama_models_used = []
ollama_prompt_names_used = []
config_loaded = False
try:
    with open(CONFIG_FILE_PATH, 'r', encoding='utf-8') as f:
        config = json.load(f)
except FileNotFoundError:
    print(f"Error: Config file '{CONFIG_FILE_PATH}' not found.")
except json.JSONDecodeError:
    print(f"Error: File '{CONFIG_FILE_PATH}' not a valid JSON.")
else:
    if isinstance(config, dict):
        ollama_models_used = config.get('models', [])
        ollama_prompt_names_used = config.get('prompts', [])
        config_loaded = True
    else:
        # A JSON array/scalar at top level has no .get(); report it instead of crashing.
        print(f"Error: File '{CONFIG_FILE_PATH}' must contain a JSON object with 'models' and 'prompts' keys.")

# Report success only when the file was actually read and understood.
if config_loaded:
    print("Configuration loaded successfully.")
    print(f"Models used: {ollama_models_used}")
    print(f"Prompts used: {ollama_prompt_names_used}")
if not ollama_models_used or not ollama_prompt_names_used:
    print("WARNING: No models and/or prompts configured; there will be no translation CSVs to judge.")


# NOTE(review): described as holding judgments from ALL files; nothing in the
# visible notebook appends to it — confirm whether it is still needed.
all_judgments_data = []

print("Gemini Setup Complete.")
print(f"Judge Model: {JUDGE_MODEL_NAME}")
print(f"Will look for translation CSVs in: {TRANSLATION_FILES_DIR}")

# --- SAFETY AND GENERATION CONFIG ---
# Default generation settings are used. The commented option is kept as a hint:
# a low temperature would make the judge's output more deterministic.
generation_config = genai.types.GenerationConfig(
    # temperature=0.1 # For more deterministic output from the judge
)
safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]

# Initialize the generative model for the judge
judge_llm_model = genai.GenerativeModel(JUDGE_MODEL_NAME)  # Named to avoid conflict with an ollama 'model' variable
print(f"Using Judge LLM: {judge_llm_model.model_name}")

In [None]:
# This cell contains the main judging loop: for every (Ollama model, prompt)
# pair it loads the matching translation CSV, asks the Gemini judge to score each
# row in a fresh chat session, and writes one scored CSV per input file.
import sys

# Opening user message of every chat session: fixes the evaluator role, embeds
# the rubric, and demands a bare integer score for each pair sent afterwards.
initial_prompt_for_chat = f"""You are an expert evaluator specializing in the translation of archaic Italian to modern Italian.
Your consistent task throughout this session is to assess the quality of "Candidate Modern Translation"s provided to you, based on their corresponding "Original Archaic Sentence"s.

Please use the following detailed 1-5 scale and rubric for ALL your evaluations in this session:
--- RUBRIC START ---
{JUDGE_RUBRIC}
--- RUBRIC END ---

For each pair I provide, carefully consider the rubric. Based on your assessment, provide a single integer score from 1 to 5 that best reflects the quality of the "Candidate Modern Translation".
Output ONLY the integer score for each. Do not add any explanation, prefix, or other text.
I will provide you with sentence pairs one by one. Let's begin.
"""

# Directory for the judged output CSVs (created if missing).
JUDGED_OUTPUT_DIR = "scores_gemini/"
os.makedirs(JUDGED_OUTPUT_DIR, exist_ok=True)

# Judge model name made filename-friendly (slashes, colons and spaces replaced).
safe_judge_model_name = JUDGE_MODEL_NAME.replace("/", "-").replace(":", "-").replace(" ", "_")

# Pause between consecutive judge requests, in seconds (simple rate limiting).
_SECONDS_BETWEEN_REQUESTS = 4.3

# Score patterns, compiled once instead of on every row.
_STRICT_SCORE_RE = re.compile(r'^\s*([1-5])\s*$')  # the whole reply is a single digit 1-5
_LOOSE_SCORE_RE = re.compile(r'\b([1-5])\b')       # first standalone digit 1-5 anywhere in the reply

# Columns every translation CSV must provide.
_REQUIRED_COLUMNS = (
    ARCHAIC_SENTENCE_COLUMN_IN_TRANSLATION_FILES,
    TRANSLATION_COLUMN_IN_TRANSLATION_FILES,
)


def _parse_judge_score(raw_text):
    """Return (score, strict): the 1-5 score found in raw_text (None if absent)
    and whether the strict whole-reply pattern matched."""
    strict_match = _STRICT_SCORE_RE.search(raw_text)
    if strict_match:
        return int(strict_match.group(1)), True
    loose_match = _LOOSE_SCORE_RE.search(raw_text)
    if loose_match:
        return int(loose_match.group(1)), False
    return None, False


def _report_progress(done, total):
    """Overwrite the current console line with a 'done/total' progress message."""
    message = f"Elaborato: {done}/{total} elementi..."
    # Pad to the widest message for this file so a shorter message fully
    # overwrites a longer one; '\r' moves the cursor back to the line start.
    width = len(f"Elaborato: {total}/{total} elementi...")
    sys.stdout.write(f"\r{message.ljust(width)}")
    # Flush so the progress line updates in real time.
    sys.stdout.flush()


# Loop through each combination of Ollama model and prompt type to load the corresponding CSV
for ollama_model_name in ollama_models_used:
    # Sanitized model name, used for the OUTPUT filename (e.g. "qwen:7B" -> "qwen-7B").
    safe_ollama_model_name = ollama_model_name.replace(":", "-").replace("/", "-").replace(" ", "_")

    for ollama_prompt_name in ollama_prompt_names_used:
        # Judgments for the CURRENT file/combination only.
        current_file_judgments = []

        # NOTE(review): the input filename uses the raw (unsanitized) model name
        # while the output uses the sanitized one — confirm this matches how the
        # translation CSVs were written.
        input_csv_filename = f"translation_{ollama_model_name}_{ollama_prompt_name}.csv"
        full_input_csv_path = os.path.join(TRANSLATION_FILES_DIR, input_csv_filename)

        print(f"\n--- Processing file: {full_input_csv_path} ---")

        try:
            df_translations_to_judge = pd.read_csv(full_input_csv_path)
            print(f"Successfully loaded {len(df_translations_to_judge)} rows from {full_input_csv_path}")
        except FileNotFoundError:
            print(f"ERROR: File {full_input_csv_path} not found. Skipping.")
            continue
        except Exception as e:
            print(f"An error occurred while loading {full_input_csv_path}: {e}. Skipping.")
            continue

        if df_translations_to_judge.empty:
            print(f"File {full_input_csv_path} is empty. Skipping.")
            continue

        # A CSV without the expected columns would raise KeyError inside the row
        # loop and abort the whole run; skip just this file instead.
        missing_columns = [
            column for column in _REQUIRED_COLUMNS
            if column not in df_translations_to_judge.columns
        ]
        if missing_columns:
            print(f"ERROR: File {full_input_csv_path} is missing required column(s) {missing_columns}. Skipping.")
            continue

        # --- (RE)START CHAT SESSION FOR EACH FILE ---
        if not judge_llm_model:  # Defensive: the judge model must exist before judging
            print("ERROR: Judge LLM model not initialized. Cannot proceed with judging. Check API Key.")
            break  # Leaves the prompt loop; the check after it stops the model loop too

        try:
            # The history is pre-seeded with the instructions and a canned model
            # acknowledgement, so each later message only carries one sentence pair.
            chat = judge_llm_model.start_chat(history=[
                {'role': 'user', 'parts': [initial_prompt_for_chat]},
                {'role': 'model', 'parts': ["Understood. I am ready to evaluate the sentence pairs based on the provided rubric and will output only a single integer score from 1 to 5 for each. Please provide the first pair."]}
            ])
            print("Gemini chat session (re)started with initial instructions and rubric.")
        except Exception as e:
            print(f"ERROR: Could not start Gemini chat session for {full_input_csv_path}: {e}. Skipping file.")
            continue  # Skip this file if chat can't start

        total_rows = len(df_translations_to_judge)

        # Iterate through rows of the current translation file. row_number is a
        # 1-based position used for messages and progress, independent of the
        # DataFrame index type.
        for row_number, (_, row) in enumerate(df_translations_to_judge.iterrows(), start=1):
            archaic_sent = str(row[ARCHAIC_SENTENCE_COLUMN_IN_TRANSLATION_FILES])
            raw_translation = row[TRANSLATION_COLUMN_IN_TRANSLATION_FILES]
            modern_trans = str(raw_translation) if pd.notna(raw_translation) else ""

            # Record for the current row; only the first three fields reach the output CSV.
            judgment_details = {
                "archaic_sentence": archaic_sent,    # Written out as 'Sentence'
                "modern_translation": modern_trans,  # Written out as 'Translation'
                "judge_llm_score_parsed": None,      # Written out as 'Score'
                # --- Kept for debugging; not part of the final CSV ---
                "source_csv_debug": input_csv_filename,
                "ollama_model_debug": ollama_model_name,
                "ollama_prompt_type_debug": ollama_prompt_name,
                "judge_llm_raw_output_debug": "",
                "error_message_debug": None
            }

            # Rows whose translation step already failed are recorded without a score.
            if modern_trans.startswith("[ERROR]:"):
                print(f"  Skipping row {row_number} due to previous translation error: {modern_trans}")
                judgment_details["judge_llm_raw_output_debug"] = "SKIPPED_OLLAMA_ERROR"
                judgment_details["error_message_debug"] = "Skipped due to original translation error"
                current_file_judgments.append(judgment_details)
                _report_progress(row_number, total_rows)
                continue

            # Rows with no translation are recorded without a score as well.
            if not modern_trans:
                print(f"  - Row {row_number}: No modern translation provided. Skipping.")
                judgment_details["judge_llm_raw_output_debug"] = "NO_TRANSLATION_PROVIDED"
                judgment_details["error_message_debug"] = "No modern translation in source CSV"
                current_file_judgments.append(judgment_details)
                _report_progress(row_number, total_rows)
                continue

            current_pair_prompt = format_individual_judgment_request(archaic_sent, modern_trans)

            try:
                response = chat.send_message(
                    current_pair_prompt,
                    generation_config=generation_config,
                    safety_settings=safety_settings
                )
                raw_score_text = response.text.strip()
                judgment_details["judge_llm_raw_output_debug"] = raw_score_text

                parsed_score, strict = _parse_judge_score(raw_score_text)
                if parsed_score is None:
                    judgment_details["error_message_debug"] = f"Could not parse a 1-5 score from: '{raw_score_text}'"
                    print(f"    ERROR: {judgment_details['error_message_debug']}. Raw response: '{raw_score_text}'")
                elif not strict:
                    print(f"    Warning: Parsed score '{parsed_score}' from less strict match. LLM Response: '{raw_score_text}'")

                judgment_details["judge_llm_score_parsed"] = parsed_score

            except Exception as e:
                # Best effort: a failed request leaves the row unscored and the run continues.
                print(f"    ERROR judging translation for row {row_number}: {e}")
                judgment_details["judge_llm_raw_output_debug"] = "API_ERROR_DURING_CHAT"
                judgment_details["error_message_debug"] = str(e)

            current_file_judgments.append(judgment_details)
            _report_progress(row_number, total_rows)

            # Rate limiting between requests; no need to wait after the last row.
            if row_number < total_rows:
                time.sleep(_SECONDS_BETWEEN_REQUESTS)

        # End the '\r'-rewritten progress line so later prints start on a fresh line.
        sys.stdout.write("\n")
        sys.stdout.flush()

        # --- After processing all rows for the current input CSV ---
        if current_file_judgments:
            df_current_judgments = pd.DataFrame(current_file_judgments)

            # Select and rename columns for the final output format
            df_output_specific = df_current_judgments[[
                "archaic_sentence",
                "modern_translation",
                "judge_llm_score_parsed"
            ]].copy()  # .copy() avoids SettingWithCopyWarning on the rename below

            df_output_specific.rename(columns={
                "archaic_sentence": "Sentence",
                "modern_translation": "Translation",
                "judge_llm_score_parsed": "Score"
            }, inplace=True)

            # Output filename: judge_<judge model>_<translation model>_<prompt>.csv
            output_filename_specific = f"judge_{safe_judge_model_name}_{safe_ollama_model_name}_{ollama_prompt_name}.csv"
            full_output_path_specific = os.path.join(JUDGED_OUTPUT_DIR, output_filename_specific)

            try:
                df_output_specific.to_csv(full_output_path_specific, index=False)
                print(f"Successfully saved judged results to '{full_output_path_specific}'")
            except Exception as e:
                print(f"Error saving specific judged results to '{full_output_path_specific}': {e}")
        else:
            print(f"No judgments were made for {full_input_csv_path} (e.g., all rows skipped).")

    if not judge_llm_model:  # Without a judge model there is no point continuing the outer loop
        print("Halting further processing due to Judge LLM initialization failure.")
        break


print("\nOverall Judging process complete. Individual CSVs saved.")