# With Ollama

In [None]:
# Install third-party packages into the kernel's environment.
# `transformers` is imported by the M-Prometheus loading cell and `ollama` by the translation cell;
# `accelerate` and `torch` are presumably backends needed by `transformers` — TODO confirm.
# NOTE(review): later cells also import pandas, tqdm and google.generativeai, which are not installed here,
# and no versions are pinned.
%pip install transformers accelerate torch
%pip install ollama

## Define parameters

In [4]:
import pandas as pd
from tqdm import tqdm

# Source dataset, loaded once into a DataFrame shared by the cells below.
dataset_path = "dataset/dataset_cleaned.csv"
df = pd.read_csv(dataset_path)

output_path = "dataset/output.csv"
col_name = "Sentence"  # column of `df` that holds the sentences to translate

# Model names passed to the Ollama client, one full pass over the dataset each.
models = ["gemma", "mistral"]

# Prompt templates keyed by strategy name. Every template contains a
# `{sentence}` placeholder that is filled with str.format at translation time.
prompt_templates = {
    "base": (
        "Translate the following sentence from archaic italian to modern italian:"
        "\n\n{sentence}\n\nTranslation:"
    ),
    "detailed": (
        "The following text is written in archaic Italian from the 13th century, originating from the Tuscany region. "
        "Rewrite it in modern Italian while preserving the original meaning, clarity, and syntactic coherence. "
        "First analyze the structure, then identify key words, then write the final version:"
        "\n\n {sentence}\n\nTranslation:"
    ),
    "role-based": (
        "You are an expert linguist specializing in the evolution of Italian language. "
        "Translate this 13th century Tuscan text to contemporary Italian while preserving the original meaning, clarity, and syntactic coherence: "
        "\n\n{sentence}\n\nTranslation:"
    ),
    "few_shot": (
        "Here are some examples of sentences in archaic Italian from the 13th century translated into modern Italian:\n\n"
        "Archaic Italian: «quella guerra ben fatta l' opera perché etc.». Modern Italian: «quella guerra fu condotta bene, e l'opera fu compiuta come previsto.».\n"
        "Archaic Italian: «crudele, e di tutte le colpe pigli vendetta». Modern Italian: «crudele, e si vendica di tutte le colpe.»\n"
        "Archaic Italian: «Non d' altra forza d' animo fue ornato Ponzio Aufidiano». Modern Italian: «Ponzio Aufidiano non era dotato di altro vigore d’animo.»\n\n"
        "Now translate the following sentence from archaic italian to modern italian while preserving the original meaning:"
        "\n\n{sentence}\n\nTranslation:"
    ),
    "teacher_student": (
        "A student asked: 'What does this old Italian sentence mean in modern language?'\n"
        "You, a university professor of historical linguistics, respond with a clear and faithful modern Italian translation"
        "\n\n{sentence}\n\nTranslation:"
    ),
}


## Translate the sentences

In [5]:

from ollama import Client

# Client bound to the Ollama server on localhost.
client = Client(host='http://localhost:11434')

# Translate the whole dataset once per (model, prompt template) pair and
# write one CSV per pair.
for model_name in models:
    for prompt_name, prompt_template in prompt_templates.items():
        print(f"\n Translation with model: {model_name} | prompt: {prompt_name}")

        translations = []  # one entry per row of df[col_name], in row order
        for sentence in tqdm(df[col_name]):
            try:
                # Fill the template and send it as a single user message.
                chat_messages = [
                    {"role": "user", "content": prompt_template.format(sentence=sentence)}
                ]
                reply = client.chat(model=model_name, messages=chat_messages)
                translation = reply['message']['content'].strip()
            except Exception as e:
                # Keep the row aligned with the dataset: the failure text is
                # stored in place of the translation instead of aborting the run.
                translation = f"[ERROR]: {e}"
            translations.append(translation)

        # The shared DataFrame's "translation" column is overwritten on every pass.
        df["translation"] = translations

        # "qwen:7B" is shortened to "qwen" in the file name; any other model
        # name is used verbatim.
        file_model_tag = "qwen" if model_name == "qwen:7B" else model_name
        output_file = f"translation_{file_model_tag}_{prompt_name}.csv"

        df.to_csv(output_file, index=False)
        print(f"Translation saved in '{output_file}'")


 Translation with model: gemma | prompt: base


100%|██████████| 97/97 [05:03<00:00,  3.13s/it]


Translation saved in 'translation_gemma_base.csv'

 Translation with model: gemma | prompt: detailed


100%|██████████| 97/97 [23:56<00:00, 14.81s/it]


Translation saved in 'translation_gemma_detailed.csv'

 Translation with model: gemma | prompt: role-based


100%|██████████| 97/97 [06:27<00:00,  4.00s/it]


Translation saved in 'translation_gemma_role-based.csv'

 Translation with model: gemma | prompt: few_shot


100%|██████████| 97/97 [05:59<00:00,  3.71s/it]


Translation saved in 'translation_gemma_few_shot.csv'

 Translation with model: gemma | prompt: teacher_student


100%|██████████| 97/97 [09:28<00:00,  5.87s/it]


Translation saved in 'translation_gemma_teacher_student.csv'

 Translation with model: mistral | prompt: base


100%|██████████| 97/97 [17:32<00:00, 10.85s/it]


Translation saved in 'translation_mistral_base.csv'

 Translation with model: mistral | prompt: detailed


100%|██████████| 97/97 [12:58<00:00,  8.03s/it]


Translation saved in 'translation_mistral_detailed.csv'

 Translation with model: mistral | prompt: role-based


100%|██████████| 97/97 [08:00<00:00,  4.96s/it]


Translation saved in 'translation_mistral_role-based.csv'

 Translation with model: mistral | prompt: few_shot


100%|██████████| 97/97 [04:23<00:00,  2.72s/it]


Translation saved in 'translation_mistral_few_shot.csv'

 Translation with model: mistral | prompt: teacher_student


100%|██████████| 97/97 [06:02<00:00,  3.74s/it]

Translation saved in 'translation_mistral_teacher_student.csv'





In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Hugging Face model identifier.
model_name = "Unbabel/M-Prometheus-3B"
# Optional local directory used to cache the downloaded files.
cache_dir = "./models/m_prometheus_3b"

# Both loaders share the same cache location; the files are downloaded when
# they are not already cached locally.
pretrained_kwargs = {"cache_dir": cache_dir}
tokenizer = AutoTokenizer.from_pretrained(model_name, **pretrained_kwargs)
model = AutoModelForCausalLM.from_pretrained(model_name, **pretrained_kwargs)


tokenizer_config.json:   0%|          | 0.00/7.31k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00002-of-00002.safetensors:   0%|          | 0.00/1.21G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

## LLM Judge

In [None]:

# Scoring rubric (1-5 scale) given to the judge model. The text is passed as the
# `rubric` argument of create_judge_prompt and embedded in the prompt, so any
# edit to this string changes what the judge is asked.
JUDGE_RUBRIC = """
You are evaluating the quality of a modern Italian translation based on an original archaic Italian sentence.
Please score the "Candidate Modern Translation" using the following 1-5 scale:

1.  **Completely Unacceptable Translation:**
    *   The translation has no pertinence with the original meaning.
    *   The generated sentence is either gibberish, makes no sense, or is completely unrelated to the archaic text.
    *   Contains severe errors that render it incomprehensible or entirely misleading.

2.  **Severe Semantic Errors/Omissions:**
    *   The translation contains significant semantic errors, critical omissions of meaning from the archaic text, or substantial incorrect additions.
    *   While some words might be recognizable, the core meaning is lost or heavily distorted.
    *   The modernization is poor, leaving many archaic forms or incorrectly modernizing them.
    *   Likely many grammatical errors in modern Italian.

3.  **Partially Wrong Translation / Lackluster:**
    *   The translation captures some of the original meaning but is lackluster or contains noticeable errors.
    *   Errors are mostly minor (e.g., awkward phrasing, typos, minor grammatical mistakes in modern Italian, some less critical semantic misunderstandings or misinterpretations of archaic terms).
    *   Some archaic features might be awkwardly modernized or missed.
    *   The overall quality is mediocre; it's understandable but clearly flawed.

4.  **Good Translation:**
    *   The translation is mostly accurate and successfully conveys the core meaning of the archaic sentence.
    *   It is substantially faithful to the original text.
    *   The modern Italian is fluent and comprehensible.
    *   Archaic features are generally well modernized.
    *   There might be minor stylistic imperfections (e.g., style doesn't perfectly match natural modern Italian, slight awkwardness) or very minor errors that do not significantly impact understanding or meaning.

5.  **Perfect Translation:**
    *   The translation is completely accurate, fully conveying the meaning and nuances of the original archaic sentence.
    *   It is perfectly fluent, natural-sounding, and grammatically correct modern Italian.
    *   All archaic linguistic features (vocabulary, syntax, orthography) are correctly and appropriately modernized.
    *   The style is appropriate for modern Italian.
    *   No errors.
"""

# Confirmation that the cell ran.
print("Rubric Defined.")


In [None]:
def create_judge_prompt(archaic_text, modern_translation, rubric):
    """
    Creates the prompt for the LLM-as-a-Judge.

    The prompt is assembled from unindented lines so that the indentation of
    this source file does not leak into the text sent to the judge.

    Args:
        archaic_text: The original archaic sentence; inserted between double quotes.
        modern_translation: The candidate translation to score; inserted between double quotes.
        rubric: The scoring rubric. Leading and trailing whitespace is stripped so the
            rubric sits directly between its START/END markers; its internal
            formatting is left untouched.

    Returns:
        The full prompt as a single string, ending with "Score:".
    """
    prompt_lines = [
        "You are an expert evaluator specializing in the translation of archaic Italian to modern Italian.",
        'Your task is to assess the quality of the "Candidate Modern Translation" provided below, based on the "Original Archaic Sentence".',
        "",
        "Please use the following detailed 1-5 scale and rubric for your evaluation:",
        "--- RUBRIC START ---",
        str(rubric).strip(),
        "--- RUBRIC END ---",
        "",
        "Original Archaic Sentence:",
        f'"{archaic_text}"',
        "",
        "Candidate Modern Translation:",
        f'"{modern_translation}"',
        "",
        'Carefully consider the rubric. Based on your assessment, provide a single integer score from 1 to 5 that best reflects the quality of the "Candidate Modern Translation".',
        "Output ONLY the integer score. Do not add any explanation, prefix, or other text.",
        "",
        "Score:",
    ]
    return "\n".join(prompt_lines)

print("Judge Prompting Function Defined.")

In [None]:
import google.generativeai as genai
import pandas as pd
import os
import time
import re # For parsing the score


# Read the Gemini API key from the environment instead of hardcoding it in the
# notebook. Falls back to "" (the previous literal value) when the variable is unset.
API_KEY = os.environ.get("GOOGLE_API_KEY", "")

genai.configure(api_key=API_KEY)

JUDGE_MODEL_NAME = "gemini-pro"

# Input and output files plus column names used by the judging cells below.
# These are defined at top level: they used to be assigned inside the loops over
# `llms` / `prompts`, which are empty, so they were never actually defined.
CSV_FILE_PATH = "translations.csv"
OUTPUT_CSV_PATH = "judged_translations.csv"

TRANSLATION_SYSTEM_COLUMNS = ["translation_system_1", "translation_system_2", "translation_system_3"]
ARCHAIC_SENTENCE_COLUMN = "archaic_sentence"
# NOTE(review): the translation cell writes files named translation_<model>_<prompt>.csv
# with the columns "Sentence" and "translation"; the file and column names above do not
# match that output — confirm how the judge input CSV is produced.

# Translation models and prompt names whose outputs are to be judged (currently empty).
llms = []
prompts = []

print("Setup Complete.")
print("\nJudge Model:", JUDGE_MODEL_NAME)

for llm in llms:
    for prompt in prompts:
        # The model / prompt combination being judged.
        print("\nJudging: ", llm, prompt)



        





In [None]:
def _judgment_record(archaic_sentence, translation_system, modern_translation,
                     raw_output, parsed_score, error_message):
    """Build one result row; every outcome (scored, skipped, failed) shares these keys."""
    return {
        "archaic_sentence": archaic_sentence,
        "translation_system": translation_system,
        "modern_translation": modern_translation,
        "judge_llm_raw_output": raw_output,
        "judge_llm_score_parsed": parsed_score,
        "error_message": error_message,
    }


def _parse_judge_score(raw_text):
    """Extract a 1-5 score from the judge's reply.

    Returns:
        (score, used_fallback). `score` is an int, or None when no standalone
        digit 1-5 is present. `used_fallback` is True when the reply was not a
        bare digit and the score was picked out of surrounding text.
    """
    strict_match = re.search(r'^\s*([1-5])\s*$', raw_text)  # the reply is just the digit
    if strict_match:
        return int(strict_match.group(1)), False
    fallback_match = re.search(r'\b([1-5])\b', raw_text)  # first standalone digit 1-5 in other text
    if fallback_match:
        return int(fallback_match.group(1)), True
    return None, False


def _judge_translation(archaic_sent, system_col_name, modern_trans):
    """Score one candidate translation and return its result row.

    Uses the module-level `judge_model`, `generation_config` and
    `safety_settings` defined further down in this cell. API failures are
    reported and recorded in the row instead of being raised.
    """
    if not modern_trans:  # Skip if no translation from this system
        print(f"  - System '{system_col_name}': No translation provided. Skipping.")
        return _judgment_record(
            archaic_sent, system_col_name, modern_trans,
            "NO_TRANSLATION_PROVIDED", None, "No translation provided by system",
        )

    prompt_text = create_judge_prompt(archaic_sent, modern_trans, JUDGE_RUBRIC)

    try:
        print(f"  - System '{system_col_name}': Sending to judge...")
        response = judge_model.generate_content(
            prompt_text,
            generation_config=generation_config,
            safety_settings=safety_settings
        )
        raw_score_text = response.text.strip()
        print(f"    Raw response from Judge: '{raw_score_text}'")

        parsed_score, used_fallback = _parse_judge_score(raw_score_text)
        error_msg = None
        if parsed_score is None:
            error_msg = f"Could not parse a 1-5 score from: '{raw_score_text}'"
            print(f"    ERROR: {error_msg}")
        elif used_fallback:
            # The LLM didn't follow the "digit only" instruction perfectly.
            print(f"    Warning: Parsed score '{parsed_score}' from less strict match. LLM Response: '{raw_score_text}'")

        return _judgment_record(
            archaic_sent, system_col_name, modern_trans,
            raw_score_text, parsed_score, error_msg,
        )

    except genai.types.generation_types.BlockedPromptException as bpe:
        print(f"    ERROR: Prompt for '{system_col_name}' was blocked. Reason: {bpe}")
        return _judgment_record(
            archaic_sent, system_col_name, modern_trans,
            "BLOCKED_PROMPT", None, f"Prompt blocked by API: {bpe}",
        )
    except Exception as e:
        print(f"    ERROR judging translation from '{system_col_name}': {e}")
        return _judgment_record(
            archaic_sent, system_col_name, modern_trans,
            "API_ERROR", None, str(e),
        )


try:
    df_translations = pd.read_csv(CSV_FILE_PATH)
    print(f"Successfully loaded {len(df_translations)} rows from {CSV_FILE_PATH}")
    # Display first few rows to verify
    print("\nFirst 5 rows of your data:")
    print(df_translations.head())
except FileNotFoundError:
    print(f"ERROR: The file {CSV_FILE_PATH} was not found. Please check the path.")
    df_translations = pd.DataFrame() # Create empty df to avoid later errors if notebook run continues
except Exception as e:
    print(f"An error occurred while loading the CSV: {e}")
    df_translations = pd.DataFrame()

# List to store all judgment results
all_judgments_data = []


# Initialize the generative model
judge_model = genai.GenerativeModel(JUDGE_MODEL_NAME)
print(f"Using Judge Model: {judge_model.model_name}")

# Safety setting for generation - can be adjusted.
# Higher values for 'block_...' mean more conservative blocking.
# See https://ai.google.dev/docs/safety_setting_gemini
# You might want to adjust these if your content is being blocked.
# For this task, default should be mostly fine.
generation_config = genai.types.GenerationConfig(
    # temperature=0.1 # For more deterministic output from the judge
)
safety_settings = [
    {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
    {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_MEDIUM_AND_ABOVE"},
]


if not df_translations.empty:
    # Check the expected columns up front so a mismatched CSV is reported once,
    # instead of raising KeyError part-way through the run.
    expected_columns = [ARCHAIC_SENTENCE_COLUMN, *TRANSLATION_SYSTEM_COLUMNS]
    missing_columns = [col for col in expected_columns if col not in df_translations.columns]

    if missing_columns:
        print(f"ERROR: {CSV_FILE_PATH} is missing expected column(s): {missing_columns}. Skipping judging process.")
    else:
        print(f"\nStarting judging process for {len(df_translations)} archaic sentences...")
        for index, row in df_translations.iterrows():
            archaic_sent = str(row[ARCHAIC_SENTENCE_COLUMN]) # Ensure it's a string
            print(f"\nJudging translations for archaic sentence {index + 1}/{len(df_translations)}: \"{archaic_sent[:70]}...\"")

            for system_col_name in TRANSLATION_SYSTEM_COLUMNS:
                # Handle potential NaN/empty translations
                modern_trans = str(row[system_col_name]) if pd.notna(row[system_col_name]) else ""

                all_judgments_data.append(
                    _judge_translation(archaic_sent, system_col_name, modern_trans)
                )

                # Be mindful of API rate limits (requests per minute)
                # Gemini free tier allows 60 RPM for gemini-pro.
                # If you have many items or use a model with stricter limits, uncomment and adjust sleep time.
                # time.sleep(1.1) # Sleep for a bit over 1 second to stay under 60 RPM

        print("\nJudging process complete.")
else:
    print("No data loaded from CSV. Skipping judging process.")

# Convert results to a DataFrame
df_judged_results = pd.DataFrame(all_judgments_data)

In [None]:
# Summarise the judging run and persist it; nothing to do when no rows were produced.
if df_judged_results.empty:
    print("No judgments were made. Output DataFrame is empty.")
else:
    print("\n--- Judged Results (First 10 rows) ---")
    print(df_judged_results.head(10))

    parsed_scores = df_judged_results['judge_llm_score_parsed']

    # Score distribution, with missing scores counted as their own bucket.
    print("\n--- Value Counts of Parsed Scores ---")
    print(parsed_scores.value_counts(dropna=False).sort_index())

    # Rows without a score, excluding those skipped because the system gave no translation.
    print("\n--- Cases with Parsing Errors or API Issues ---")
    has_no_score = parsed_scores.isna()
    was_sent_to_judge = df_judged_results['error_message'] != "No translation provided by system"
    print(df_judged_results[has_no_score & was_sent_to_judge])

    # Write the full result table to a new CSV; a failure is reported, not raised.
    try:
        df_judged_results.to_csv(OUTPUT_CSV_PATH, index=False)
        print(f"\nSuccessfully saved judged results to {OUTPUT_CSV_PATH}")
    except Exception as e:
        print(f"\nError saving results to CSV: {e}")