In [41]:
!pip install sacrebleu
!pip install rouge-score
!pip install comet-ml
!pip install unbabel-comet
!pip install datasets
!pip install evaluate
!pip install nltk
!pip install sacrebleu



In [44]:
from google.colab import drive
import random
import sacrebleu
from rouge_score import rouge_scorer
from comet import download_model, load_from_checkpoint
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
import nltk
import evaluate
from evaluate import load

In [3]:
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/en-it.tmx"
sentence_pairs = []

Mounted at /content/drive


In [4]:
# Load the saved model and tokenizer
MODEL_PATH = "/content/drive/MyDrive/llama-translation2/checkpoint-1500"  # Replace with your saved model path
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Constants
START_SYMBOL_SOURCE = "<START_SYMBOL_source>"
END_SYMBOL_SOURCE = "<END_SYMBOL_SOURCE>"
START_SYMBOL_TARGET = "<START_SYMBOL_TARGET>"
END_SYMBOL_TARGET = "<END_SYMBOL_TARGET>"

In [5]:
for token_name, token_string in tokenizer.special_tokens_map.items():
  token_id = tokenizer.convert_tokens_to_ids(token_string)
  print(f'{token_name}: {token_string} (ID: {token_id})')

bos_token: <|begin_of_text|> (ID: 128000)
eos_token: <|eot_id|> (ID: 128009)
pad_token: [PAD] (ID: 128256)
additional_special_tokens: ['<START_SYMBOL_source>', '<END_SYMBOL_SOURCE>', '<START_SYMBOL_TARGET>', '<END_SYMBOL_TARGET>'] (ID: [128257, 128258, 128259, 128260])


In [9]:
# Function to translate a list of sentences
def translate_sentences(sentences, model, tokenizer, device="cuda" if torch.cuda.is_available() else "cpu"):
    model.to(device)
    model.eval()
    translations = []
    with torch.no_grad():
        for sentence in tqdm(sentences, desc="Translating Sentences"):
            input_text = f"{START_SYMBOL_SOURCE} {sentence} {END_SYMBOL_SOURCE} {START_SYMBOL_TARGET}"
            inputs = tokenizer(input_text, return_tensors="pt").to(device)
            outputs = model.generate(
                **inputs,
                max_length=512,
                num_beams=5,
                early_stopping=True
            )
            translation = tokenizer.decode(outputs[0], skip_special_tokens=False)
            translation = translation.split("<END_SYMBOL_SOURCE>")[1].strip()
            translation = translation.split(">")[1].split("<")[0].strip()
            #translation = translation.replace("<START_SYMBOL_TARGET>", "").replace("<END_SYMBOL_TARGET>", "
            translations.append(translation)
    return translations

In [10]:
sample_sentences = [
    "in an asexual species if you get two different mutations in different creatures a green one and a red one then one has to be better than the other",
    "He’s sort of a Homer Simpson with fins",
    "So, if algorithms are going to curate the world for us if they re going to decide what we get to see and what we don t get to see then we need to make sure that they re not just keyed to relevance",
    "They’re not even autonomous",
    "there s a marker line called the trim line above our little red illustration there"
]

In [20]:
dataframe = pd.read_csv("preprocessed_data.csv")

In [11]:



sample_translations = translate_sentences(sample_sentences, model, tokenizer)
print("Sample Translations:")
for sentence, translation in zip(sample_sentences, sample_translations):
    print(f"Source: {sentence}")
    print(f"Translation: {translation}")
    print()

Translating Sentences:   0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:  20%|██        | 1/5 [00:12<00:50, 12.64s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:  40%|████      | 2/5 [00:24<00:36, 12.17s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:  60%|██████    | 3/5 [00:36<00:23, 11.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:  80%|████████  | 4/5 [00:47<00:11, 11.79s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences: 100%|██████████| 5/5 [00:59<00:00, 11.82s/it]

Sample Translations:
Source: in an asexual species if you get two different mutations in different creatures a green one and a red one then one has to be better than the other
Translation: In una specie aspettativa se una specie una specie diversa è diversa una grande diversa e poi si mette in un un altra

Source: He’s sort of a Homer Simpson with fins
Translation: Lui ha fatta con una specie di nome

Source: So, if algorithms are going to curate the world for us if they re going to decide what we get to see and what we don t get to see then we need to make sure that they re not just keyed to relevance
Translation: Quindi se stiamo in modo il mondo ci stiamo cercando di spiegare ciò che stiamo cercando di capire cosa stiamo cercando di capire cosa stiamo facendo per raccogliere ciò che vogliamo fare per capire

Source: They’re not even autonomous
Translation: Hanno raggiungono la peggiora

Source: there s a marker line called the trim line above our little red illustration there
Transl




In [29]:
sample_dataframe = dataframe.sample(100)

In [30]:
sample_translations = translate_sentences(sample_dataframe['Source_clean'], model, tokenizer)
sample_translations

Translating Sentences:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   1%|          | 1/100 [00:11<19:07, 11.59s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   2%|▏         | 2/100 [00:23<18:52, 11.55s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   3%|▎         | 3/100 [00:34<18:37, 11.52s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   4%|▍         | 4/100 [00:46<18:23, 11.50s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   5%|▌         | 5/100 [00:57<17:57, 11.34s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Translating Sentences:   6%|▌         | 6/100 [01:08<17:57, 11.46s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Transla

['Questo è il primo posto dove si trovassero l acqua e si muove',
 'In realtà il migliore di una scuola elementare da un scuola superiore a scuola in scuola',
 'Ma usando il linguaggio di questo linguaggio possiamo capire che le imprese possiamo capire queste cose sono gli stessi',
 'Il primo giorno dopo il primo giorno dopo e se ne andavano in gara fosse un picco di minuti',
 'Ma sapevo che c erano esempi di gente che sono in grado di raccogliere le loro stesse anche che riconoscevano attraverso la loro popolazione del loro tasso di riconoscere quando i loro rifiuti i loro prodotti',
 'Andiamo al di là',
 'Io credo in un certo senso il concetto di poter diventare uno strumento prima di poter essere diventato',
 'Non puoi realizzare le funzioni di un onda che ci mettono insieme in realtà lo sviluppo dell umanità',
 'Quando voglio guardare vedo persone che voglio fare con persone che vogliono ascoltare un mondo migliore osservando persone che utilizzano le persone che fanno un mondo mig

In [33]:
sample_dataframe['translated'] = sample_translations
sample_dataframe.head()

Unnamed: 0,Source_clean,Target_clean,translated
5690,This was the first place where we got some wat...,Fu il primo posto dove trovammo acqua e viveri,Questo è il primo posto dove si trovassero l a...
35891,As a matter of fact the best advice I got was ...,In realtà il miglior consiglio che mi è stato ...,In realtà il migliore di una scuola elementare...
32792,But using the language of Galois we can unders...,Ma usando il linguaggio di Galois possiamo cap...,Ma usando il linguaggio di questo linguaggio p...
45298,The first day they would take a hike and it wa...,Il primo giorno hanno fatto un escursione ed e...,Il primo giorno dopo il primo giorno dopo e se...
61041,But I knew that there were examples of creatur...,Ma sapevo che c erano esempi di creature mammi...,Ma sapevo che c erano esempi di gente che sono...


In [58]:
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
comet_path = download_model("wmt20-comet-da")  # Download COMET model
comet_model = load_from_checkpoint(comet_path)


wmt20-comet-da.tar.gz: 1.79GB [00:35, 50.3MB/s]                            
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt20-comet-da/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [62]:
# Metric calculation functions
def calculate_bleu(references, predictions):
    references_list = references.tolist()
    predictions_list = predictions.tolist()
    bleu = bleu_metric.compute(predictions = references_list, references = predictions_list)
    return bleu

def calculate_rouge(references, predictions):
    references_list = references.tolist()
    predictions_list = predictions.tolist()
    rouge = rouge_metric.compute(predictions=references_list, references=predictions_list)
    return rouge

def calculate_comet(references, predictions, sources):
    data = [{"src": sources, "mt": pred, "ref": ref} for src, pred, ref in zip(sources, predictions, references)]
    comet_scores = comet_model.predict(data, batch_size=8, gpus=1 if torch.cuda.is_available() else 0)
    return comet_scores

# Main evaluation function
def evaluate_metrics(test_dataset, model, tokenizer):
    sources = test_dataset["Source_clean"]
    references = test_dataset["Target_clean"]
    predictions = test_dataset["translated"]
    # Calculate metrics
    bleu_score = calculate_bleu(references, predictions)
    rouge_scores = calculate_rouge(references, predictions)
    comet_scores = calculate_comet(references, predictions, sources)

    bleu_score = bleu_score["bleu"]  # Solo il BLEU score principale
    rouge_score = rouge_scores["rougeL"]  # ROUGE-L score
    comet_score = (sum(comet_scores["scores"]) / len(comet_scores["scores"]))

                  #Organizza i risultati in un dizionario e crea una tabella ordinata
    scores_data = {
        "Metric": ["BLEU", "ROUGE-L", "COMET"],
        "Score": [bleu_score, rouge_score, comet_score]
    }

    scores_df = pd.DataFrame(scores_data)
    return scores_df

metrics = evaluate_metrics(sample_dataframe, model, tokenizer)
print(metrics)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting DataLoader 0: 100%|██████████| 13/13 [00:04<00:00,  2.63it/s]


    Metric     Score
0     BLEU  0.211719
1  ROUGE-L  0.434268
2    COMET -0.934076
