In [1]:
!pip install sentencepiece

import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
import numpy as np
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [2]:
df = pd.read_csv("gold_labeled_dataset_1.csv", sep=";")
df = df.rename(columns={"Sentence": "source", "Traductions": "target"})

dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.1, shuffle=True)

dataset

DatasetDict({
    train: Dataset({
        features: ['Author', 'Date', 'Region', 'source', 'target'],
        num_rows: 87
    })
    test: Dataset({
        features: ['Author', 'Date', 'Region', 'source', 'target'],
        num_rows: 10
    })
})

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


model_name = "gsarti/it5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32103, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32103, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [4]:
import evaluate 
source_lang = "source"
target_lang = "target"

metric = evaluate.load("sacrebleu")




def preprocess(example):
    prompt = "Traduci la seguent frase da italiano antico a italiano moderno rispettando il significato e la semantica: "
    mod_source = [prompt + example for example in example[source_lang]]

    inputs = tokenizer(mod_source, truncation=True, padding="max_length", max_length=256)
    targets = tokenizer(example["target"], truncation=True, padding="max_length", max_length=260)
    inputs["labels"] = targets["input_ids"]
    return inputs

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

tokenized_dataset = dataset.map(preprocess, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=False, #torch.cuda.is_available(), 
    logging_dir='./logs',
    logging_steps=10
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [6]:
trainer.train()


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


KeyboardInterrupt: 

INFERENCE

In [None]:
prompt = "Traduci la seguent frase da italiano antico a italiano moderno rispettando il significato e la semantica: "

#input_sentence = "Gli uomini spessamente a stare fermi nella bugia incontra la verità"
#input_sentence = "E come l’un pensier de l’altro scoppia, così nacque di quello un altro poi, che la prima paura mi fé doppia"
#input_sentence = "Di qua, di là, su per lo sasso tetro vidi demon cornuti con gran ferze"
input_sentence = prompt + "Et però ch'egli nol potea fare per senno, né per consiglio"
#input_sentence = "L’angoscia che tu hai forse ti tira fuor de la mente"
#input_sentence = "sì che non par ch’i’ ti vedessi mai"

In [None]:

inputs = tokenizer(input_sentence, return_tensors="pt").to("cpu")
model = model.to("cpu")

output = model.generate(**inputs)
print(tokenizer.decode(output[0], skip_special_tokens=True))


馘, e che egli potea fare per conto proprio, e non
