In [2]:
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import MarianTokenizer, MarianMTModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
df = pd.read_csv("archaic_to_english_dataset.csv")
df = df.rename(columns={"archaic": "source", "modern": "target"})

dataset = DatasetDict({
     "train": Dataset.from_pandas(df.sample(frac=0.9, random_state=42).reset_index(drop=True)),
     "test": Dataset.from_pandas(df.sample(frac=0.1, random_state=42).reset_index(drop=True))
})

In [3]:
model_name = "Helsinki-NLP/opus-mt-it-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)
model.to(device)



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(80379, 512, padding_idx=80378)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(80379, 512, padding_idx=80378)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [4]:
def preprocess(example):
    inputs = tokenizer(example["source"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(example["target"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_dataset = dataset.map(preprocess, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/87 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [5]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./marian_finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    fp16=torch.cuda.is_available(), 
    logging_dir='./logs',
    logging_steps=10
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

  trainer = Seq2SeqTrainer(


In [6]:
trainer.train()

model.save_pretrained("./marian_finetuned")
tokenizer.save_pretrained("./marian_finetuned")

Step,Training Loss
10,4.7526
20,1.7907
30,1.5168




('./marian_finetuned/tokenizer_config.json',
 './marian_finetuned/special_tokens_map.json',
 './marian_finetuned/vocab.json',
 './marian_finetuned/source.spm',
 './marian_finetuned/target.spm',
 './marian_finetuned/added_tokens.json')

INFERENCE

In [4]:
model_it_en = MarianMTModel.from_pretrained("./marian_finetuned").to(device)
tokenizer_it_en = MarianTokenizer.from_pretrained("./marian_finetuned")

model_en_it = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-it").to(device)
tokenizer_en_it = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-it")



In [5]:
input_sentence = "Gli uomini spessamente a stare fermi nella bugia incontra la verità"
#input_sentence = "E come l’un pensier de l’altro scoppia, così nacque di quello un altro poi, che la prima paura mi fé doppia"
#input_sentence = "Di qua, di là, su per lo sasso tetro vidi demon cornuti con gran ferze"
#input_sentence = "Et però ch'egli nol potea fare per senno, né per consiglio"
#input_sentence = "L’angoscia che tu hai forse ti tira fuor de la mente"
#input_sentence = "sì che non par ch’i’ ti vedessi mai"

In [6]:
# archaic it -> en
inputs = tokenizer_it_en([input_sentence], return_tensors="pt", padding=True, truncation=True).to(device)
translated_en_ids = model_it_en.generate(**inputs, max_length=128, num_beams=4, early_stopping=True)
translated_en = tokenizer_it_en.decode(translated_en_ids[0], skip_special_tokens=True)
print("Modern English Translation:", translated_en)

# en -> modern it
inputs_en_it = tokenizer_en_it([translated_en], return_tensors="pt", padding=True, truncation=True).to(device)
translated_it_ids = model_en_it.generate(**inputs_en_it, max_length=128, num_beams=4, early_stopping=True)
translated_modern_it = tokenizer_en_it.decode(translated_it_ids[0], skip_special_tokens=True)

print("Modern Italian Translation:", translated_modern_it)


Modern English Translation: The men thickly to stand still in the lie meets the truth
Modern Italian Translation: Gli uomini per stare fermi nella menzogna incontrano la verità
