Read the data of Europian parlement proceedings, and transform to the right format for the model. 

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from torchinfo import summary

df = pd.read_csv('data.csv')
df = df.head(1000)

df = df.dropna(subset=['en', 'ro']).reset_index(drop=True)
df = df[~df['en'].str.contains('<.*?>')]
df = df[~df['ro'].str.contains('<.*?>')]
df['en'] = df['en'].str.slice(0, 200)
df['ro'] = df['ro'].str.slice(0, 200)

book = pd.DataFrame({'translation': []})
li = []
for i, row in df.iterrows():
    newrow = {'ro': row['ro'], 'en': row['en']}
    li.append(newrow)



df = pd.DataFrame(li, columns=['translation'])
df = pd.DataFrame({'translation': li})
df['index'] = df.index
df = df[['index', 'translation']]






Make a dataset from the data and make a train-test split of 0.7 - 0.30

In [2]:

dataset = Dataset.from_pandas(df)

split = dataset.train_test_split(test_size=0.3, seed=107)
split


DatasetDict({
    train: Dataset({
        features: ['index', 'translation'],
        num_rows: 700
    })
    test: Dataset({
        features: ['index', 'translation'],
        num_rows: 300
    })
})

Initialize the pre trained model and the tokenizer for the model

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

checkpoint = "google-t5/t5-small"
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")
max_source_length = 1024
max_target_length = 1024

source_lang = "en"
target_lang = "ro"
prefix = "translate English to Romanian: "


def preprocess_function(examples):
    inputs = [prefix + example[source_lang] for example in examples["translation"]]
    targets = [example[target_lang] for example in examples["translation"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=1024, truncation=True, padding=True)
    return model_inputs

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Tokenize the train test split

In [4]:
tokenized_books = split.map(preprocess_function, batched=True)

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Make functions for the metrics used for the training and for the post processing steps of the data

In [5]:
import numpy as np
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Define the training arguments and the trainer

In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
training_args = Seq2SeqTrainingArguments(
    output_dir="Translation_model",
    learning_rate=0.0005,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=15,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_books["train"],
    eval_dataset=tokenized_books["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)



dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Specifiy the model, by training it on our dataset

In [8]:

trainer.train()


  0%|          | 0/88 [00:00<?, ?it/s]

{'train_runtime': 203.3353, 'train_samples_per_second': 6.885, 'train_steps_per_second': 0.433, 'train_loss': 1.2665220607410779, 'epoch': 2.0}


TrainOutput(global_step=88, training_loss=1.2665220607410779, metrics={'train_runtime': 203.3353, 'train_samples_per_second': 6.885, 'train_steps_per_second': 0.433, 'train_loss': 1.2665220607410779, 'epoch': 2.0})

Save the model

In [9]:
trainer.save_model(output_dir="Translation_model")

Test the model on testing set

In [10]:
from evaluate import evaluator
from datasets import load_dataset


df = pd.read_csv('data.csv')
df = df[10100:10300]


for i, row in enumerate(df.iterrows()):
    df['en'].iloc[i] = "translate from English to Romanian: " + df['en'].iloc[i]

    
datasetTest = Dataset.from_pandas(df)
task_evaluator = evaluator(task="translation")
results = task_evaluator.compute(
    model_or_pipeline="Translation_model",
    data=datasetTest,
    input_column="en",
    label_column="ro",
    tokenizer=tokenizer
)

print(results)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['en'].iloc[i] = "translate from English to Romanian: " + df['en'].iloc[i]


{'bleu': 0.38243340023308625, 'precisions': [0.660013160780873, 0.45389908256880734, 0.3347752944003845, 0.24911660777385158], 'brevity_penalty': 0.9619197001524957, 'length_ratio': 0.9626266891891891, 'translation_length': 4559, 'reference_length': 4736, 'total_time_in_seconds': 214.39734749984927, 'samples_per_second': 0.9328473618366039, 'latency_in_seconds': 1.0719867374992464}


In [11]:
text = "translate English to Romanian: Can someone here explain to me what these visits to Moscow by the Slovakian and the Bulgarian Prime Ministers are all about"

from transformers import pipeline

translator = pipeline("translation", model="Translation_model")
translator(text)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


[{'translation_text': 'Poate cineva să-mi explice care sunt vizitele prim-ministrului slovac şi bulgar la Moscova?'}]