In [None]:
!python --version
!pip install evaluate
!pip install rdflib
!pip install rouge_score
import torch
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    EncoderDecoderModel,
    TrainingArguments,
    Trainer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    DataCollatorForTokenClassification
)
import evaluate
import numpy as np
from sklearn.model_selection import train_test_split

# NOTE(review): torch is already imported at the top of this cell, so this second import is redundant (harmless).
import torch # Import torch to move models and data to GPU
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not read by any other cell visible here — presumably the Trainer
# handles device placement on its own; confirm before relying on or removing this variable.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Define device here as well for clarity within this cell

In [None]:
# Fetch the two data repositories into fixed local folders:
#   idiomem/ - holds hebrew_idioms_with_english_translation.txt, read by parse_hebrew_translation
#   LIdioms/ - holds the per-language .ttl files, read by parse_lidioms
# NOTE(review): re-running this cell once the folders exist will presumably make `git clone`
# fail because the destination already exists — confirm, and guard if re-runs matter.
!git clone https://github.com/omerday/nlp-idiom-he.git idiomem
!git clone https://github.com/dice-group/LIdioms.git LIdioms

In [None]:
import rdflib

# (language name, directory prefix) pairs. parse_lidioms reads
# LIdioms/{prefix}/{language}.ttl for each pair and uses the language name as the result key.
LIDIOM_LANGUAGES = [('english', 'en'), ('german', 'de'), ('italian', 'it'), ('portuguese', 'pt'), ('russian', 'rus')]

def parse_lidioms():
  """Build idiom/definition prompt sentences from the LIdioms Turtle files.

  For every ``(language, prefix)`` pair in ``LIDIOM_LANGUAGES`` the file
  ``LIdioms/{prefix}/{language}.ttl`` is parsed and queried for
  (label, definition) pairs. Each pair is rendered as
  ``'The English definition of <label> is: <definition>'``.

  Returns:
    dict: language name -> list of rendered sentences. The dict always has one
    key per configured language; a language whose file cannot be parsed is
    reported on stdout and mapped to an empty list, so callers can keep
    treating the result as a dict and the other languages are not lost.
  """
  # SPARQL query to find the label and definition pairs by traversing the graph.
  # It does not depend on the language, so it is built once outside the loop.
  query = """
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX ontolex: <http://www.w3.org/ns/lemon/ontolex#>
        PREFIX skos: <http://www.w3.org/2004/02/skos/core#>

        SELECT ?label ?definition
        WHERE {
          ?entry a ontolex:LexicalEntry ;
                 rdfs:label ?label ;
                 ontolex:sense ?sense .
          ?sense ontolex:isLexicalizedSenseOf ?concept .
          ?concept skos:definition ?definition .
        }
    """

  translation_dataset = {}
  for language, prefix in LIDIOM_LANGUAGES:
    translation_dataset[language] = []
    g = rdflib.Graph()
    try:
      g.parse(f"LIdioms/{prefix}/{language}.ttl", format="turtle")
    except Exception as e:
      # Skip only this language. Returning early here would hand callers a
      # list instead of the dict they index by language name.
      print(f"Error parsing file: {e}")
      continue

    for row in g.query(query):
      # Rows with a missing/empty label or definition cannot form a prompt.
      if not row.label or not row.definition:
        continue
      # The f-string converts the returned literal objects to plain strings.
      translation_dataset[language].append(f'The English definition of {row.label} is: {row.definition}')

    print(f"{language.upper()} Corpus: {len(translation_dataset[language])}")

  return translation_dataset

# Parse all LIdioms corpora once when this cell runs (also prints the per-language sizes).
# NOTE(review): `translation_dataset` is not read by any other cell visible here, and
# parse_lidioms() is called again later to build `idiom_data` — presumably one of the two
# calls is redundant; confirm before removing either.
translation_dataset = parse_lidioms()

In [None]:
def parse_hebrew_translation(path="idiomem/hebrew_idioms_with_english_translation.txt"):
  """Read the Hebrew idiom sentences, one per line, from a text file.

  Args:
    path: file to read; defaults to the file inside the cloned ``idiomem`` folder.

  Returns:
    list[str]: the lines of the file without their trailing newline. Blank
    (empty or whitespace-only) lines are skipped, since they cannot contain
    the " is: " separator and would only become empty examples downstream.
  """
  hebrew_sentences = []
  # The file contains Hebrew text, so decode explicitly as UTF-8 instead of
  # relying on the platform's default encoding.
  with open(path, "r", encoding="utf-8") as file:
    for line in file:
      sentence = line.rstrip("\n")
      if sentence.strip():
        hebrew_sentences.append(sentence)
  return hebrew_sentences


In [None]:
# Display name -> Hugging Face checkpoint id for every model in the sweep.
# run_experiment loads checkpoints whose id contains "bert" or "roberta" through
# EncoderDecoderModel (same checkpoint as encoder and decoder) and every other
# checkpoint through AutoModelForSeq2SeqLM.
MODEL_CHECKPOINTS = {
    "M2M100": "facebook/m2m100_418M",
    "mBART": "facebook/mbart-large-50",
    "mT5": "google/mt5-base",
    "mBERT": "bert-base-multilingual-cased",
    "XLM-RoBERTa": "xlm-roberta-base",
}

In [None]:
def preprocess_function(examples, tokenizer, max_input_length=128, max_target_length=128):
    """Tokenizes the input and target texts for a seq2seq task.

    Every string in ``examples["text"]`` is cut at the FIRST occurrence of
    " is: ": the part up to and including the separator becomes the model
    input, everything after it becomes the target. Cutting only once keeps
    definitions that themselves contain " is: " intact.

    Strings without the separator are counted as dropped but stay in the batch
    as an empty input/target pair, so the number of rows is unchanged (this
    function is used with a batched ``map``).

    Args:
        examples: batch mapping with a "text" list of strings.
        tokenizer: callable tokenizer accepting ``text_target=``.
        max_input_length: truncation length for the inputs.
        max_target_length: truncation length for the targets.

    Returns:
        The tokenizer output for the inputs with an added "labels" entry
        holding the target ``input_ids``.
    """
    separator = " is: "
    dropped = 0
    inputs = []
    targets = []
    for ex in examples["text"]:
        prompt, found, target = ex.partition(separator)
        if found:
            inputs.append(prompt + separator)
            targets.append(target)
        else:
            dropped += 1
            inputs.append("")
            targets.append("")

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    print(f"Dropped sentences: {dropped}")
    return model_inputs

# Load the ROUGE metric for evaluation.
# Kept at module level so compute_metrics reuses this one loaded metric object on every call.
rouge = evaluate.load("rouge")

In [None]:
def _token_overlap_scores(pred, label):
    """Return (precision, recall, f1) over the sets of whitespace-separated tokens of two strings."""
    pred_tokens = set(pred.split())
    label_tokens = set(label.split())
    overlap = len(pred_tokens & label_tokens)

    precision = overlap / len(pred_tokens) if pred_tokens else 0
    recall = overlap / len(label_tokens) if label_tokens else 0
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
    return precision, recall, f1


def compute_metrics(eval_pred, tokenizer):
    """Computes ROUGE, token-level F1/Precision/Recall, and strict accuracy.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is itself a tuple, its first element is used.
        tokenizer: tokenizer providing ``pad_token_id`` and ``batch_decode``.

    Returns:
        dict of metric name -> value rounded to 4 decimals. The entries coming
        from ``rouge.compute`` are passed through unchanged; ``f1``,
        ``precision``, ``recall`` and ``accuracy`` are averages over all
        samples multiplied by 100. Token scores compare SETS of whitespace
        tokens (repeated tokens count once); ``accuracy`` is exact string
        match after stripping surrounding whitespace.
    """
    predictions, labels = eval_pred
    # Some models hand back a tuple whose first element holds the token ids;
    # np.where below cannot work on the tuple itself.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a valid token id for decoding.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE score
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Accumulate token-level F1, Precision, Recall and exact matches
    total_f1, total_precision, total_recall = 0, 0, 0
    correct_predictions = 0
    for pred, label in zip(decoded_preds, decoded_labels):
        precision, recall, f1 = _token_overlap_scores(pred, label)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

        # Strict accuracy (exact match)
        if pred.strip() == label.strip():
            correct_predictions += 1

    # Guard the averages against an empty evaluation batch.
    num_samples = max(len(decoded_labels), 1)
    result['f1'] = (total_f1 / num_samples) * 100
    result['precision'] = (total_precision / num_samples) * 100
    result['recall'] = (total_recall / num_samples) * 100
    result['accuracy'] = (correct_predictions / num_samples) * 100

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
def show_translation_examples(trainer, tokenizer, test_dataset, num_examples=3):
    """Predicts and prints a few translation examples.

    Only the first ``num_examples`` rows of ``test_dataset`` are sent through
    ``trainer.predict`` (when the dataset offers ``select``), so beam-search
    generation is not run over the whole test set just to print a handful of
    rows.

    Args:
        trainer: object whose ``predict(dataset, **generation_kwargs)`` returns
            an object with ``predictions`` and ``label_ids``.
        tokenizer: tokenizer providing ``pad_token_id`` and ``batch_decode``.
        test_dataset: tokenized dataset with an ``input_ids`` column.
        num_examples: maximum number of examples to print.
    """
    print("\n--- Translation Examples ---")

    generation_kwargs = {
        "min_length": 5,
        "max_length": 128,
        "num_beams": 4, # Beam search can find better sequences
    }

    shown = max(0, min(num_examples, len(test_dataset)))
    if shown == 0:
        return

    # Restrict generation to the rows that will actually be printed.
    if hasattr(test_dataset, "select"):
        subset = test_dataset.select(range(shown))
    else:
        subset = test_dataset

    # Get predictions
    predictions = trainer.predict(subset, **generation_kwargs)
    preds = predictions.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    labels = predictions.label_ids

    # Decode (-100 marks ignored positions and cannot be decoded as a token id)
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Remove a literal "<extra_id_0>" marker if one is left in the decoded text.
    decoded_preds = [pred.replace("<extra_id_0>", "").strip() for pred in decoded_preds]
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Get original inputs
    inputs = tokenizer.batch_decode(subset['input_ids'], skip_special_tokens=True)

    for i in range(min(shown, len(inputs))):
        print(f"INPUT:    {inputs[i]}")
        print(f"EXPECTED: {decoded_labels[i]}")
        print(f"PREDICTED:  {decoded_preds[i]}\n")
        print("-" * 25)

In [None]:
def run_experiment(model_name, model_checkpoint, training_data, test_data, src_lang, tgt_lang):
    """Fine-tunes and evaluates a model, now with example printing.

    Args:
        model_name: display name; also used in the output directory name.
        model_checkpoint: checkpoint id passed to ``from_pretrained``.
        training_data: list of prompt sentences used for training.
        test_data: list of prompt sentences used for evaluation.
        src_lang: source language code set on the tokenizer for mBART
            checkpoints (first two characters are used for M2M100).
        tgt_lang: target language code, handled like ``src_lang``.

    Returns:
        The metrics dict returned by ``trainer.evaluate()``.
    """
    import gc  # local import: only needed for the cleanup at the end

    print(f"\n--- Running experiment for {model_name} ---")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    # One lower-cased id for every architecture check, so all of them are
    # case-insensitive in the same way.
    checkpoint_id = model_checkpoint.lower()

    # Language codes must be set before tokenization.
    # NOTE(review): no forced_bos_token_id is configured for generation, so the
    # target language presumably has to come from the fine-tuned model itself — confirm.
    if "mbart" in checkpoint_id:
        tokenizer.src_lang = src_lang
        tokenizer.tgt_lang = tgt_lang
    elif "m2m100" in checkpoint_id:
        tokenizer.src_lang = src_lang[:2]
        tokenizer.tgt_lang = tgt_lang[:2]

    raw_datasets = DatasetDict({
        'train': Dataset.from_dict({'text': training_data}),
        'test': Dataset.from_dict({'text': test_data})
    })
    tokenized_datasets = raw_datasets.map(
        lambda x: preprocess_function(x, tokenizer), batched=True
    )

    if "bert" in checkpoint_id or "roberta" in checkpoint_id:
        # These checkpoints are not loaded as ready-made seq2seq models: the same
        # checkpoint is used for both the encoder and the decoder.
        print(f"Adapting {model_name} for Seq2Seq task using EncoderDecoderModel.")
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            model_checkpoint, model_checkpoint
        )
        # Decoding starts at CLS and stops at SEP; set on both configs so
        # training and generation agree.
        model.config.decoder_start_token_id = tokenizer.cls_token_id
        model.config.eos_token_id = tokenizer.sep_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
        model.generation_config.eos_token_id = tokenizer.sep_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id
    else:
        model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

    args = Seq2SeqTrainingArguments(
        output_dir=f"./results/{model_name}_finetuned",
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=2,
        num_train_epochs=10,
        predict_with_generate=True,
        fp16=False,
        logging_steps=10,
        generation_max_length=128
    )
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
    trainer = Seq2SeqTrainer(
        model, args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=lambda p: compute_metrics(p, tokenizer),
    )

    print("Starting training...")
    trainer.train()
    print("Evaluating...")
    eval_results = trainer.evaluate()

    show_translation_examples(trainer, tokenizer, tokenized_datasets["test"], 10)

    # The caller runs many fine-tunings in one process; drop this run's model
    # and trainer and release cached GPU memory before the next one starts.
    del trainer, data_collator, model
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    return eval_results

In [None]:
# Assemble every corpus: the LIdioms languages plus the Hebrew sentences under the "hebrew" key.
# NOTE(review): parse_lidioms() was already called in an earlier cell; this second call
# re-parses all files — presumably the earlier result could be reused; confirm.
idiom_data = parse_lidioms()
idiom_data["hebrew"] = parse_hebrew_translation()
hebrew_dataset = idiom_data["hebrew"]
# 50/50 train/test split of the Hebrew sentences with a fixed seed.
hebrew_train_test_split = Dataset.from_dict({'text': hebrew_dataset}).train_test_split(test_size=0.5, seed=42)
hebrew_train = hebrew_train_test_split['train']['text']
hebrew_test = hebrew_train_test_split['test']['text']

print(f"Hebrew training dataset is of size {len(hebrew_train)} and includes, for example:\n{hebrew_train[:5]}")
print(f"Hebrew testing dataset is of size {len(hebrew_test)} and includes, for example:\n{hebrew_test[:5]}")

# Flatten the sentences of every non-Hebrew language into one training list.
all_languages_train = [idiom for lang, idioms in idiom_data.items() if lang != "hebrew" for idiom in idioms]
print(f"ALL_LANGUAGE training dataset size: {len(all_languages_train)}")
print(f"ENGLISH training dataset size: {len(idiom_data['english'])}")
print(f"HEBREW training dataset size: {len(hebrew_train)}")

# Scenario name -> training sentences, test sentences and the src/tgt language codes
# handed to run_experiment (used as-is for mBART, first two characters for M2M100).
# The two zero-shot scenarios train without any Hebrew and are tested on the FULL Hebrew
# set, while "Hebrew-Only" is tested on hebrew_test only, so the scenarios are scored on
# different test sets.
experiments = {
    "Hebrew-Only": {"train": hebrew_train, "test": hebrew_test, "src": "he_IL", "tgt": "en_XX"},
    "English-Only (Zero-Shot)": {"train": idiom_data["english"], "test": hebrew_dataset, "src": "en_XX", "tgt": "en_XX"},
    "All-Languages (Zero-Shot)": {"train": all_languages_train, "test": hebrew_dataset, "src": "en_XX", "tgt": "en_XX"}
}

# Run every (model, scenario) combination and collect the headline metrics as
# final_results[model_name][exp_name] = {'ROUGE-1': ..., 'F1-Score': ..., 'Accuracy': ...}.
final_results = {}
for model_name, model_checkpoint in MODEL_CHECKPOINTS.items():
    final_results[model_name] = {}
    for exp_name, data_config in experiments.items():
        print(f"\n{'='*25}\nMODEL: {model_name} | EXPERIMENT: {exp_name}\n{'='*25}")
        results = run_experiment(
            model_name,
            model_checkpoint,
            data_config["train"],
            data_config["test"],
            data_config["src"],
            data_config["tgt"]
        )
        # Store key metrics for summary table.
        # The metrics are looked up under an 'eval_' prefix; a missing key is recorded
        # as 0, so an absent metric is indistinguishable from a real score of 0.
        final_results[model_name][exp_name] = {
            'ROUGE-1': results.get('eval_rouge1', 0),
            'F1-Score': results.get('eval_f1', 0),
            'Accuracy': results.get('eval_accuracy', 0)
        }

print("\n\n--- Experiment Summary ---")
# Regroup final_results[model][scenario] into reformated_results[scenario][model]
# so that one table per scenario can be printed with the models as rows.
reformated_results = {}
for model, exps in final_results.items():
    for exp_name, metrics in exps.items():
        reformated_results.setdefault(exp_name, {})[model] = metrics

# Print a separate table for each experiment scenario
for exp_name, model_metrics in reformated_results.items():
    print(f"\n--- SCENARIO: {exp_name} ---\n")
    df = pd.DataFrame(model_metrics).T # Transpose to have models as rows
    try:
        print(df.to_markdown())
    except ImportError:
        # to_markdown needs the optional 'tabulate' package, which this notebook
        # never installs; fall back to plain text rather than failing after all
        # the training has already finished.
        print(df.to_string())