<a href="https://colab.research.google.com/github/Pakyy/NLP-Project---MT-en-it/blob/main/seq2seq_pre_trained_t5_mt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install tqdm
!pip install keras
!pip install tensorflow
!pip install translate-toolkit
!pip install evaluate
!pip install nltk rouge-score
!pip install unbabel-comet
!pip install sacrebleu
!pip install comet-ml
!pip install datasets --upgrade

In [2]:
%%capture
from google.colab import drive
from translate.storage import tmx
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter
from wordcloud import WordCloud
import tensorflow as tf
import keras
import sklearn
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from tqdm import tqdm
import evaluate
from evaluate import load
import xml.etree.ElementTree as ET
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, TrainingArguments, Trainer
import datasets
from datasets import Dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer


In [3]:
# Mount Google Drive to access the file
from google.colab import drive
drive.mount('/content/drive')

file_path = "/content/drive/MyDrive/en-it.tmx"
sentence_pairs = []

# Open the TMX file in read-binary mode
with open(file_path, 'rb') as f:
    tmx_file = tmx.tmxfile(f)

# Iterate through the translation units and store sentence pairs
for unit in tmx_file.units:
    source_text = unit.source
    target_text = unit.target
    if source_text and target_text:
        sentence_pairs.append((source_text, target_text))

# Create a DataFrame from the sentence pairs
df = pd.DataFrame(sentence_pairs, columns=['Source', 'Target'])

Mounted at /content/drive


In [4]:
# Create a DataFrame from the sentence pairs
df = pd.DataFrame(sentence_pairs, columns=['Source', 'Target'])

# Function to clean text by removing special characters and handling accents
def clean_text(text):
    text = re.sub(r"http\S+|www.\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-ZàèéìòùÀÈÉÌÒÙçÇ]", " ", text)  # Keep letters and specific Italian characters
    text = re.sub(r"\s+", " ", text)  # Replace multiple spaces with single space
    return text.strip().lower()  # Convert to lowercase for consistency

# Apply cleaning function to Source and Target
df['Source_clean'] = df['Source'].apply(clean_text)
df['Target_clean'] = df['Target'].apply(clean_text)

# Remove rows with empty sentences after cleaning
df = df[(df['Source_clean'].str.strip() != '') & (df['Target_clean'].str.strip() != '')]

# Remove sentences that are too short (less than 3 words) or too long (more than 96 words)
df = df[
    df['Source_clean'].apply(lambda x: 3 <= len(x.split()) <= 96) &
    df['Target_clean'].apply(lambda x: 3 <= len(x.split()) <= 96)
]

# Print the length of the dataset after cleaning
print(f"Dataset length after cleaning: {len(df)}")

# Check for any null values
print("Null values in each column:")
print(df.isnull().sum())

# Remove duplicate sentence pairs
df.drop_duplicates(subset=['Source_clean', 'Target_clean'], inplace=True)

# Display the first 10 cleaned sentence pairs
print("Sample cleaned data:")
print(df[['Source_clean', 'Target_clean']].head(10))

Dataset length after cleaning: 153568
Null values in each column:
Source          0
Target          0
Source_clean    0
Target_clean    0
dtype: int64
Sample cleaned data:
                                         Source_clean  \
1   there s a tight and surprising link between th...   
2             fish health mission blue oceans science   
4         stephen palumbi following the mercury trail   
5        it can be a very complicated thing the ocean   
6   and it can be a very complicated thing what hu...   
7   and bringing those two together might seem a v...   
8   and those simple themes aren t really themes a...   
9   and i m going to start with this one if momma ...   
10          we know that right we ve experienced that   
11  and if we just take that and we build from the...   

                                         Target_clean  
1   esiste uno stretto e sorprendente legame tra l...  
2             fish health mission blue oceans science  
4           stephen palumbi sull

In [5]:
# Step 2: Split into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Testing set size: {len(test_df)}")

Training set size: 122826
Testing set size: 30707


In [6]:
# Convert DataFrames to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df[['Source_clean', 'Target_clean']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df[['Source_clean', 'Target_clean']].reset_index(drop=True))

# Rename columns to fit the expected format
train_dataset = train_dataset.rename_column("Source_clean", "source")
train_dataset = train_dataset.rename_column("Target_clean", "target")
test_dataset = test_dataset.rename_column("Source_clean", "source")
test_dataset = test_dataset.rename_column("Target_clean", "target")

# Set the format for compatibility
train_dataset.set_format(type='torch')
test_dataset.set_format(type='torch')

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['source', 'target'],
    num_rows: 122826
})
Dataset({
    features: ['source', 'target'],
    num_rows: 30707
})


In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the pre-trained T5-small tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the translation task prefix
source_prefix = "translate English to Italian: "

# Function to preprocess the data
def preprocess_function(examples):
    inputs = [source_prefix + ex for ex in examples['source']]
    targets = [ex for ex in examples['target']]
    model_inputs = tokenizer(inputs, max_length=96, padding='max_length', truncation=True)

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=96, padding='max_length', truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/122826 [00:00<?, ? examples/s]



Map:   0%|          | 0/30707 [00:00<?, ? examples/s]

Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 122826
})
Dataset({
    features: ['source', 'target', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 30707
})


In [8]:
# Define training arguments with smaller batch size and gradient accumulation
training_args = TrainingArguments(
    output_dir="./t5_small_en_it",
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    per_device_train_batch_size=8,  # Reduced batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,  # Accumulate gradients to simulate larger batch size
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,  # Reduced number of epochs
    fp16=True,  # Use mixed precision
    logging_steps=100,
)

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,0.6295,0.523075
2,0.5213,0.471269
3,0.4961,0.456292


TrainOutput(global_step=23031, training_loss=0.6171066577681644, metrics={'train_runtime': 4704.3231, 'train_samples_per_second': 78.328, 'train_steps_per_second': 4.896, 'total_flos': 9350714310524928.0, 'train_loss': 0.6171066577681644, 'epoch': 3.0})

In [9]:
# Prendi alcuni esempi di test per fare delle traduzioni
n_examples = 5  # numero di esempi da visualizzare
test_samples = tokenized_test.select(range(n_examples))

# Genera traduzioni di esempio
for i, sample in enumerate(test_samples):
    input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(device)  # aggiungi batch dimension
    attention_mask = torch.tensor(sample['attention_mask']).unsqueeze(0).to(device)

    # Genera la traduzione
    translated_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=96)
    translation = tokenizer.decode(translated_ids[0], skip_special_tokens=True)

    # Mostra l'input, la traduzione generata e la traduzione di riferimento
    print(f"\nEsempio {i+1}:")
    print("Input (EN):", tokenizer.decode(input_ids[0], skip_special_tokens=True).replace(source_prefix, ""))
    print("Traduzione Generata (IT):", translation)
    print("Traduzione di Riferimento (IT):", sample['target'])

  input_ids = torch.tensor(sample['input_ids']).unsqueeze(0).to(device)  # aggiungi batch dimension
  attention_mask = torch.tensor(sample['attention_mask']).unsqueeze(0).to(device)



Esempio 1:
Input (EN): in an asexual species if you get two different mutations in different creatures a green one and a red one then one has to be better than the other
Traduzione Generata (IT): in una specie assue se otteniamo due mutazioni diverse in creature diverse una verde e una rossa allora una deve essere meglio dell altra
Traduzione di Riferimento (IT): in una specie asessuata due mutazioni in creature diverse una rossa ed una verde devono essere una migliore dell altra

Esempio 2:
Input (EN): he s sort of a homer simpson with fins
Traduzione Generata (IT): e una specie di homer simpson con polvere
Traduzione di Riferimento (IT): e una sorta di homer simpson con le pinne

Esempio 3:
Input (EN): so if algorithms are going to curate the world for us if they re going to decide what we get to see and what we don t get to see then we need to make sure that they re not just keyed to relevance
Traduzione Generata (IT): quindi se gli algoritmi saranno curati il mondo per noi se deci

In [12]:
# Carica BLEU e ROUGE
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

# Importa COMET
comet_metric = None
try:
    from comet import download_model, load_from_checkpoint
    comet_path = download_model("Unbabel/wmt20-comet-da")  # Scarica il modello
    comet_metric = load_from_checkpoint(comet_path)  # Carica il modello COMET
except ImportError:
    print("Installa `unbabel-comet` per utilizzare la metrica COMET.")

# Funzione di valutazione con barra di progresso
def evaluate_translations_in_batches(trainer, test_dataset, batch_size=8):
    all_predictions = []
    all_references = []

    for i in tqdm(range(0, len(test_dataset), batch_size), desc="Valutazione in corso"):
        batch = test_dataset.select(range(i, min(i + batch_size, len(test_dataset))))

        input_ids = torch.stack([torch.tensor(ex['input_ids']) for ex in batch]).to(trainer.model.device)
        attention_mask = torch.stack([torch.tensor(ex['attention_mask']) for ex in batch]).to(trainer.model.device)

        translated_ids = trainer.model.generate(input_ids, attention_mask=attention_mask, max_length=96)
        translations = [trainer.tokenizer.decode(ids, skip_special_tokens=True) for ids in translated_ids]

        references = [[ex['target']] for ex in batch]

        all_predictions.extend(translations)
        all_references.extend(references)

    # Calcola le metriche
    bleu_score = bleu_metric.compute(predictions=all_predictions, references=all_references)
    rouge_score = rouge_metric.compute(predictions=all_predictions, references=[ref[0] for ref in all_references])

    # Calcola COMET se disponibile
    comet_score = None
    if comet_metric:
        # Prepara i dati nel formato corretto per COMET
        comet_inputs = [{"src": "", "mt": pred, "ref": ref[0]} for pred, ref in zip(all_predictions, all_references)]
        comet_score = comet_metric.predict(comet_inputs)

    return {
        "BLEU": bleu_score,
        "ROUGE": rouge_score,
        "COMET": comet_score if comet_score else "COMET non disponibile. Installa `unbabel-comet`."
    }

# Esegui la valutazione
results = evaluate_translations_in_batches(trainer, tokenized_test, batch_size=32)
print("Risultati delle metriche:")
print("BLEU:", results["BLEU"])
print("ROUGE:", results["ROUGE"])
print("COMET:", results["COMET"])

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/huggingface/hub/models--Unbabel--wmt20-comet-da/snapshots/4c372befe4d603e6d0363f434248ecad66945607/checkpoints/model.ckpt`
  input_ids = torch.stack([torch.tensor(ex['input_ids']) for ex in batch]).to(trainer.model.device)
  attention_mask = torch.stack([torch.tensor(ex['attention_mask']) for ex in batch]).to(trainer.model.device)
Valutazione in corso: 100%|██████████| 960/960 [28:15<00:00,  1.77s/it]
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Predicting Dat

Risultati delle metriche:
BLEU: {'bleu': 0.24445744626873456, 'precisions': [0.5525782613523746, 0.31708660641509095, 0.19561636588796846, 0.12400971851605735], 'brevity_penalty': 0.957403741016096, 'length_ratio': 0.9582857314955175, 'translation_length': 477279, 'reference_length': 498055}
ROUGE: {'rouge1': 0.5436858849523856, 'rouge2': 0.3165620202933652, 'rougeL': 0.5245589826143651, 'rougeLsum': 0.5245490580903149}
COMET: Prediction([('scores', [0.4218979477882385, -0.8348182439804077, -0.27214768528938293, 1.2973231077194214, -0.40955641865730286, -0.23441454768180847, -0.9453123211860657, -0.3966473639011383, 1.0783299207687378, 0.7264689803123474, 1.1863592863082886, 0.4316979944705963, 0.2704804837703705, -0.33148160576820374, 0.30931714177131653, -1.1440449953079224, 0.5153287053108215, 1.0989596843719482, 0.5858182907104492, 1.2739393711090088, -0.6166977286338806, 0.5208408236503601, 0.5354133248329163, -0.45310842990875244, -0.35072875022888184, 0.37816914916038513, -0.045

In [14]:
# Estrai i punteggi puri dai risultati
bleu_score = results["BLEU"]["bleu"]  # Solo il BLEU score principale
rouge_score = results["ROUGE"]["rougeL"]  # ROUGE-L score
comet_score = (sum(results["COMET"]["scores"]) / len(results["COMET"]["scores"]))

               #Organizza i risultati in un dizionario e crea una tabella ordinata
scores_data = {
    "Metric": ["BLEU", "ROUGE-L", "COMET"],
    "Score": [bleu_score, rouge_score, comet_score]
}

scores_df = pd.DataFrame(scores_data)

# Visualizza la tabella con i risultati
print("\n--- Risultati delle metriche principali ---")
print(scores_df.to_markdown(index=False))  # Stampa la tabella in formato markdown per una visualizzazione chiara


--- Risultati delle metriche principali ---
| Metric   |     Score |
|:---------|----------:|
| BLEU     | 0.244457  |
| ROUGE-L  | 0.524559  |
| COMET    | 0.0182936 |
