# 1. Setup Google Drive and Environment


In [27]:
%pip install transformers datasets evaluate sacrebleu
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset, load_from_disk
from transformers import DataCollatorWithPadding
import evaluate 

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)


from sklearn.model_selection import train_test_split
from datasets import Dataset



from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F



python(85712) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


# 2. Model and Hyper-Parameters


In [None]:
# Model: Distil-Bert
language_model_name = "google/mt5-small"

# HYPER-PARAMETERS

device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)


# 3. Accessing the dataset

In [18]:
df = pd.read_csv("gold_labeled_dataset_1.csv", sep=";")
df = df.rename(columns={"Sentence": "source", "Traductions": "target"})
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(0.2,shuffle=True)

dataset


DatasetDict({
    train: Dataset({
        features: ['Author', 'Date', 'Region', 'source', 'target'],
        num_rows: 77
    })
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'source', 'target'],
        num_rows: 20
    })
})

# 4. Function to compute the Metrics


# 5. Initialization of the model


In [19]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [20]:
source_lang = "source"
target_lang = "target"
prefix = "Translate in an accurate way from Not Modern Itailan to Modern Italian: "

def preprocess_function(examples):
    inputs = [prefix + example for example in examples[source_lang]]
    #print(inputs)
    targets = [example for example in examples[target_lang]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
    return model_inputs

dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [21]:
source_lang = "source"
target_lang = "target"

metric = evaluate.load("sacrebleu")



def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


In [26]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [23]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# 8. Trainer

In [None]:
batch_size = 32
learning_rate = 1e-3
weight_decay = 0.01
epochs = 4

In [24]:
training_args = Seq2SeqTrainingArguments(
    output_dir="non-modern-it-model",
    eval_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=weight_decay,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=False, #change to bf16=True for XPU
    push_to_hub=False,
    report_to="none",

)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Let's Train ...
trainer.train()



Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,2.722965,3.1303,20.0


KeyboardInterrupt: 

# 9. Evaluating the performance of our model

In [None]:
# Evaluate the model ...
trainer.evaluate()

{'eval_loss': 0.970496416091919,
 'eval_bleu': 13.5911,
 'eval_gen_len': 19.35,
 'eval_runtime': 35.4845,
 'eval_samples_per_second': 1.127,
 'eval_steps_per_second': 0.085,
 'epoch': 10.0}

In [None]:
#input_sentence = "Gli uomini spessamente a stare fermi nella bugia incontra la verità"
input_sentence = "E come l’un pensier de l’altro scoppia, così nacque di quello un altro poi, che la prima paura mi fé doppia"
#input_sentence = "Di qua, di là, su per lo sasso tetro vidi demon cornuti con gran ferze"
#input_sentence = "Et però ch'egli nol potea fare per senno, né per consiglio"
#input_sentence = "L’angoscia che tu hai forse ti tira fuor de la mente"
#input_sentence = "sì che non par ch’i’ ti vedessi mai"

text = prefix + input_sentence
text

'Translate Not Modern Itailan to Modern Italian: E come l’un pensier de l’altro scoppia, così nacque di quello un altro poi, che la prima paura mi fé doppia'

In [None]:
from transformers import AutoTokenizer

inputs = tokenizer(text, return_tensors="pt").input_ids

In [None]:
from transformers import AutoModelForSeq2SeqLM

outputs = model.to("cpu").generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)

In [None]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'E come un pensier della scoppia, cos nacque di quello un altro poi, che la prima paura mi fé'