# This is a tutorial for machine translation with T5

In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch
import pandas as pd
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import evaluate

## We will fine-tune the pretrained t5-small model for English-to-French translation.

In [2]:
# Load the pretrained tokenizer and the model weights from the same checkpoint.
MODEL_NAME = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [3]:
# Load the BLEU metric through the `evaluate` library; it is the evaluation
# metric used for the translations in this notebook.
bleu = evaluate.load("bleu")

## First we need to read the file and convert it into a DataFrame

In [4]:
# Parse the tab-separated corpus into [english, french] pairs.
lst = []
with open('eng-fra.txt', encoding='UTF-8') as my_file:
    # Iterate over the file object directly so the whole file is never held
    # in memory as a list of lines.
    for line in my_file:
        # Split on the first tab only: everything after it is the French side.
        strs = line.strip().split("\t", 1)
        # Blank lines or lines without a tab yield fewer than two fields;
        # skip them instead of failing with an IndexError.
        if len(strs) != 2:
            continue
        lst.append(strs)
    

In [5]:
# Build a two-column DataFrame ('eng', 'fra') from the parsed sentence pairs in `lst`.
df = pd.DataFrame(lst,columns =['eng','fra'])

In [6]:
# Display the DataFrame to inspect the loaded sentence pairs.
df

Unnamed: 0,eng,fra
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !
...,...,...
135837,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
135838,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
135839,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
135840,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [7]:
#Create a customized dataset class
class CustomDataset():
    """Map-style dataset of tokenized English/French sentence pairs.

    Indexing returns a tuple ``(input_ids, attention_mask, labels)`` of
    ``torch.long`` tensors: ``input_ids`` and ``attention_mask`` come from the
    English sentence, ``labels`` are the token ids of the French sentence.
    Padding positions inside ``labels`` are returned as produced by the
    tokenizer; masking them for the loss is left to the caller.
    """

    def __init__(self, dataframe, tokenizer, max_length=512):
        """Store the sentence columns and tokenization settings.

        Args:
            dataframe: object indexable by ``'eng'`` and ``'fra'``, each
                yielding an iterable of sentences (e.g. a pandas DataFrame).
            tokenizer: callable tokenizer returning a mapping with
                ``'input_ids'`` and ``'attention_mask'`` entries.
            max_length: length every sequence is padded or truncated to.
                The default of 512 keeps the previous hard-coded behaviour;
                because padding is always to this length, a smaller value
                yields proportionally smaller tensors per item.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.eng = list(dataframe['eng'])
        self.fra = list(dataframe['fra'])

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.eng)

    def _encode(self, text):
        """Tokenize one sentence, padding/truncating it to ``self.max_length``."""
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        return self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, labels)`` for the pair at ``index``."""
        # We need the token ids of both the input (English) and the output (French).
        source = self._encode(self.eng[index])
        target = self._encode(self.fra[index])

        return (
            torch.tensor(source['input_ids'], dtype=torch.long),
            torch.tensor(source['attention_mask'], dtype=torch.long),
            torch.tensor(target['input_ids'], dtype=torch.long),
        )

In [17]:
# Train/test split and DataLoader construction.
train, test = train_test_split(df, test_size=0.5, random_state=12345)

# Training data: shuffled mini-batches of two sentence pairs.
train_set = CustomDataset(train, tokenizer)
trainloader = DataLoader(dataset=train_set, batch_size=2, shuffle=True)

# Evaluation data: only the first 2000 rows of the test split are used, and
# they are served unshuffled so predictions stay in row order.
test_set = CustomDataset(test.iloc[:2000], tokenizer)
testloader = DataLoader(dataset=test_set, batch_size=2, shuffle=False)

In [18]:
# Reference French sentences for the first 2000 rows of the test split.
true_list = test['fra'].iloc[:2000].tolist()

In [10]:
def training(train_loader, model, optimizer, pad_token_id=0):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        train_loader: sized iterable yielding ``(input_ids, attention_mask,
            labels)`` tensor batches.
        model: module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose result exposes a ``.loss`` tensor.
        optimizer: optimizer updating ``model``'s parameters.
        pad_token_id: label id treated as padding and replaced by -100.
            Defaults to 0, the value previously hard-coded here.

    Returns:
        Sum of the per-batch losses divided by ``len(train_loader)``.

    Raises:
        ZeroDivisionError: if ``train_loader`` is empty.
    """
    model.train()

    # Send batches to wherever the model's weights live rather than assuming
    # "CUDA is available" implies "the model is on CUDA".
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    for input_ids, attention_mask, labels in train_loader:
        # Replace padding token ids of the labels by -100 so they are ignored
        # by the loss. masked_fill returns a new tensor, so the batch object
        # handed over by the loader is left unmodified.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass: the model computes the loss from the labels.
        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss

        # Accumulate the scalar loss for the epoch average.
        epoch_loss += loss.item()

        # Gradient step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return epoch_loss / len(train_loader)

In [11]:
def evaluating(eval_loader, model, optimizer, references=None, **generate_kwargs):
    """Generate translations for every batch and score them with BLEU.

    Args:
        eval_loader: iterable yielding ``(input_ids, attention_mask, labels)``
            tensor batches; ``labels`` is not used here.
        model: model exposing ``generate(input_ids=..., attention_mask=...)``.
        optimizer: unused; kept so existing call sites keep working.
        references: reference translations, in the same order in which
            ``eval_loader`` yields its samples. Defaults to the module-level
            ``true_list``.
        **generate_kwargs: extra keyword arguments forwarded to
            ``model.generate``.
            NOTE(review): with no overrides, generation uses the library's
            default length limit -- confirm that long references are not
            being truncated.

    Returns:
        The result of ``bleu.compute`` for the decoded predictions.
    """
    model.eval()

    # Send batches to wherever the model's weights live rather than assuming
    # "CUDA is available" implies "the model is on CUDA".
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    pred_list = []
    with torch.no_grad():
        for input_ids, attention_mask, _labels in eval_loader:
            # Get the output sequences for this batch.
            output_sequences = model.generate(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
                **generate_kwargs,
            )
            # Decode token ids back to text, dropping special tokens.
            pred_list.extend(
                tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
            )

    if references is None:
        references = true_list

    # Calculate the BLEU score of the predictions against the references.
    return bleu.compute(predictions=pred_list, references=references)

In [12]:
# Pick the GPU when PyTorch reports CUDA as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# AdamW over all model parameters; per the original author's note, 1e-4 is the
# learning rate recommended by Hugging Face.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

In [None]:
# Train the model, reporting the loss and BLEU score after every epoch.
NUM_EPOCHS = 5
for epoch in range(1, NUM_EPOCHS + 1):
    print(f'epochs:{epoch}')

    # One optimisation pass over the training loader.
    tr_loss = training(trainloader, model, optimizer)
    print(f'training_loss:{round(tr_loss, 5)}')

    # Score the current weights on the test loader.
    bleu_score = evaluating(testloader, model, optimizer)
    print(f"bleu_score:{round(bleu_score['bleu'], 5)}")

## Because of the large amount of training data, the training was done on a cluster.
We can run the evaluating function again to check the BLEU score after training.

In [22]:
# Re-score the current model on the test loader. `optimizer` is passed only
# because the function signature requires it; evaluating() never uses it.
bleu_score=evaluating(testloader, model,optimizer)

In [23]:
# Show the metrics returned by the evaluation.
bleu_score

{'bleu': 0.4875162591403384,
 'precisions': [0.7436706689536878,
  0.5589662027833002,
  0.4481323877068558,
  0.3667598416026089],
 'brevity_penalty': 0.9535654925674059,
 'length_ratio': 0.9546109510086456,
 'translation_length': 14575,
 'reference_length': 15268}

## A simple example

In [14]:
# Check the model outputs after training
sentences = ["HuggingFace is a company.", "Welcome to NYC."]

# Tokenize as one padded batch and move the tensors to the device the model
# lives on; the model was moved with model.to(device) earlier, so CPU inputs
# would cause a device mismatch when a GPU is in use.
inputs = tokenizer(sentences, return_tensors="pt", padding=True).to(model.device)

# Make sure the model is in inference mode (training() switches it to train mode).
model.eval()
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)
print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))


['HuggingFace est une société.', 'Bienvenue à NYC.']


In [21]:
# Sanity-check the metric on the two example sentences: the first prediction
# differs from its reference by one word, the second matches it exactly.
bleu.compute(predictions=['HuggingFace est une société.', 'Bienvenue à NYC.'], references=["HuggingFace est une entreprise.", "Bienvenue à NYC."])

{'bleu': 0.5969491792019646,
 'precisions': [0.8888888888888888,
  0.7142857142857143,
  0.6,
  0.3333333333333333],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0,
 'translation_length': 9,
 'reference_length': 9}