# 1. Setup Google Drive and Environment


In [13]:
import torch
import numpy as np
import pandas as pd
from typing import Dict
import torch
from datasets import load_dataset, load_from_disk
from transformers import DataCollatorWithPadding
import evaluate

from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    set_seed,
)


from sklearn.model_selection import train_test_split
from datasets import Dataset



from transformers import AutoModel
import torch.nn as nn
import torch.nn.functional as F



# 2. Model and Hyper-Parameters


In [14]:
# Model: Distil-Bert
language_model_name = "google/mt5-small"

# HYPER-PARAMETERS

batch_size = 32
learning_rate = 1e-4
weight_decay = 0.001
epochs = 4
device = "cuda" if torch.cuda.is_available() else "cpu"
set_seed(42)


# 3. Accessing the dataset

In [15]:

dataset = Dataset.from_csv("NonModernItDataset.csv")
dataset = dataset.train_test_split(0.2,shuffle=True)

dataset


Generating train split: 200 examples [00:00, 5905.47 examples/s]


DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 160
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 40
    })
})

In [None]:
dataset["train"][:10]


{'source': ["Poi che 'l maestro mio mi ebbe veduto tanto attento, disse: Dentro da quei fuochi son li spiriti; ciascun s'infiamma di quel ch'in lui è acceso.",
  'E’ mi par che ‘l tempo si rincorra.',
  'E’ mi par che ‘l tempo si rincorra.',
  'L’alma s’affligge e non trova conforto.',
  "Poi che 'l maestro mio mi ebbe veduto tanto attento, disse: Dentro da quei fuochi son li spiriti; ciascun s'infiamma di quel ch'in lui è acceso.",
  'Li occhi porta la mia donna sovra lor gloria.',
  'Non era l’impresa da pigliarsi a gabbo.',
  'Deh peregrini che pensosi andate.',
  'Veggio l’onore che non perde mai.',
  'Veggio l’onore che non perde mai.'],
 'target': ['Quando il mio maestro mi vide così attento, disse: Dentro quei fuochi ci sono gli spiriti; ognuno è infiammato da ciò che lo brucia interiormente.',
  'E mi sembra che il tempo corra veloce.',
  'E mi sembra che il tempo corra veloce.',
  'L’anima si affligge e non trova consolazione.',
  'Quando il mio maestro mi vide così attento, d

In [18]:
dataset["test"][:5]

{'source': ["Amor, ch'al cor gentil ratto s'apprende, prese costui de la bella persona che mi fu tolta; e 'l modo ancor m'offende.",
  'Or sì che tu se’ quel Virgilio e quella fonte.',
  'A ciò che l’animo mio s’acqueti.',
  'Or sì che tu se’ quel Virgilio e quella fonte.',
  'Deh peregrini che pensosi andate.'],
 'target': ['Amore, che si accende rapidamente in un cuore gentile, prese costui per la bellezza della donna che mi fu tolta; e ancora il modo in cui accadde mi ferisce.',
  'Allora tu sei proprio Virgilio e quella fonte di saggezza.',
  'Affinché il mio animo trovi pace.',
  'Allora tu sei proprio Virgilio e quella fonte di saggezza.',
  'Oh pellegrini che camminate pensierosi.']}

# 4. Function to compute the Metrics


# 5. Initialization of the model


In [19]:
from transformers import AutoTokenizer

checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [29]:
source_lang = "source"
target_lang = "target"
prefix = "Translate Not Modern Itailan to Modern Italian: "
metric = evaluate.load("sacrebleu")



def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result


Downloading builder script: 100%|██████████| 8.15k/8.15k [00:00<00:00, 5.29MB/s]


In [30]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [31]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# 8. Trainer

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="non-modern-it-model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    predict_with_generate=True,
    fp16=True, #change to bf16=True for XPU
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
# Let's Train ...
trainer.train()

# 9. Evaluating the performance of our model

In [None]:
# Evaluate the model ...
trainer.evaluate()

In [None]:
# PLOTTING THE CONFUSION MATRIX

import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

predictions = trainer.predict(tokenized_datasets_sst2["validation"])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = predictions.label_ids

cm = confusion_matrix(y_true, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap='Blues')

In [None]:
# ACCURACY, PRECISION, RECALL AND F1-SCORE

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# 10. Extra, compute the label

In [None]:
dataset_unlabeled = load_dataset("csv", data_files="./test_unlabeled.csv")

# TO CONVERT THE TEST DATASET IN THE FORMAT NEEDED

# dataset_test = dataset_unlabeled.map(
#     fetch_wiki_text,
#     remove_columns=["item", "description", "type", "category", "subcategory"],
# )

# dataset_test.save_to_disk("./dataset_test_unlabeled")

In [None]:
dataset_test = load_from_disk("./dataset_test_unlabeled")
dataset_test = dataset_test["train"]

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

dataset_test = dataset_test.map(tokenize_function, batched=True)

predictions = trainer.predict(dataset_test)
y_pred = np.argmax(predictions.predictions, axis=1)

inv_label_map = {v: k for k, v in label_map.items()}

df_unlabeled = dataset_unlabeled['train'].to_pandas()
df_unlabeled['label'] = [inv_label_map[label] for label in y_pred]

df_unlabeled.to_csv("./test_unlabeled_with_predictions_transformers_colab.csv", index=False)