# Comparação de performance entre diferentes caminhos de tradução
O Google tradutor usa o inglês como intermediário padrão, mas será que é a escolha mais adequada? Qual o melhor caminho entre quaisquer dois nós?

## Bibliotecas

In [10]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
nltk.download('punkt')

import torch
from torch import optim
from torch.utils.data import DataLoader

from random import shuffle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
!pip3 install sentencepiece
!pip3 install transformers
!pip3 install translate-toolkit



In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from translate.storage.tmx import tmxfile

## Córpus
Vamos utilizar um córpus de legendas de TED talks

In [13]:
!wget https://object.pouta.csc.fi/OPUS-TED2020/v1/tmx/en-pt_br.tmx.gz
!gunzip en-pt_br.tmx.gz

--2022-03-16 20:19:53--  https://object.pouta.csc.fi/OPUS-TED2020/v1/tmx/en-pt_br.tmx.gz
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32862474 (31M) [application/gzip]
Saving to: ‘en-pt_br.tmx.gz’


2022-03-16 20:19:57 (12.7 MB/s) - ‘en-pt_br.tmx.gz’ saved [32862474/32862474]

gzip: en-pt_br.tmx already exists; do you wish to overwrite (y or n)? n
	not overwritten


Lendo o córpus e separando em conjuntos de treino e teste

In [14]:
# ler córpus
with open("en-pt_br.tmx", 'rb') as fin:
  f = tmxfile(fin, 'en', 'pt')

prefixo = '>>pt_br<<'

# formatar as traduções corretamente 
data = [{ 'src': prefixo + ' ' + w.source, 'trg': w.target } for w in f.unit_iter()]

# embaralhar os pares
shuffle(data)

# separar em conjuntos de treino e teste
size = int(len(data) * 0.2)
treino = data[size:][:10000]
teste = data[:size][:1000]

In [15]:
treino[10]

{'src': ">>pt_br<< But, for example, net interest payments didn't, the difference between what banks were earning in interest if they gave you a loan and what they were paying out for a deposit. ",
 'trg': 'Mas, por exemplo, pagamentos de juros líquidos não iam. A diferença entre o que os bancos estavam ganhando em juros, se eles lhe dessem um empréstimo e o que pagavam por um depósito, '}

## Treinamento
Definindo parâmetros do modelo e treinamento

In [16]:
learning_rate = 1e-5 
epochs = 2
batch_size = 16
batch_status = 32
early_stop = 5
write_path = 'model.pt'

Separando dados em batches ( lotes )

In [17]:
train_data = DataLoader(treino, batch_size = batch_size, shuffle = True)
dev_data = DataLoader(teste, batch_size = batch_size, shuffle = True)

Método de avaliação

In [18]:
def evaluate(tokenizer, model, devdata, batch_size, batch_status, device):

    model.eval()
    
    y_real = []
    y_pred = []
    
    for batch_idx, inp in enumerate(dev_data):
        y_real.extend(inp['trg'])
        
        # tokenize
        model_inputs = tokenizer(
            inp['src'], 
            truncation = True, 
            padding = True, 
            max_length = 128, 
            return_tensors="pt"
        ).to(device)
        
        # Translate
        generated_ids = model.generate(**model_inputs, num_beams = 1)
        
        # Post-process translation
        output = tokenizer.batch_decode(generated_ids, skip_special_tokens = True)
        y_pred.extend(output)
    
        # Display
        if (batch_idx+1) % batch_status == 0:
            print('Evaluation: [{}/{} ({:.0f}%)]'.format(batch_idx + 1, \
                len(devdata), 100. * batch_idx / len(devdata)))

    # evaluating based on bleu
    hyps, refs = [], []
    
    for i, snt_pred in enumerate(y_pred):
        hyps.append(nltk.word_tokenize(snt_pred))
        refs.append([nltk.word_tokenize(y_real[i])])
    
    bleu = corpus_bleu(refs, hyps)

    return bleu

Método do treinamento

In [19]:
def train(tokenizer, model, train_data, dev_data, optimizer, num_epochs, 
          batch_size, batch_status, device, 
          early_stop = 5, write_path = 'model.pt'):
    
    max_bleu = evaluate(tokenizer, model, dev_data, batch_size, batch_status, device)
    print('BLEU inicial:', max_bleu)
    
    model.train()
    repeat = 0
    
    for epoch in range(num_epochs):
        losses = []
        batch_src, batch_trg = [], []

        for batch_idx, inp in enumerate(train_data):
            # Init
            optimizer.zero_grad()

            # tokenize
            model_inputs = tokenizer(
                inp['src'], 
                truncation = True,
                padding = True, 
                max_length = 128, 
                return_tensors="pt"
            ).to(device)
            
            with tokenizer.as_target_tokenizer():
                labels = tokenizer(
                    inp['trg'], 
                    truncation = True, 
                    padding = True, 
                    max_length = 128, 
                    return_tensors="pt"
                ).input_ids.to(device)
            
            # translate
            output = model(**model_inputs, labels=labels) # forward pass

            # Calculate loss
            loss = output.loss
            losses.append(float(loss))

            # Backpropagation
            loss.backward()
            optimizer.step()

            batch_src, batch_trg = [], []

            # Display
            if (batch_idx+1) % batch_status == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tTotal Loss: {:.6f}'.format(
                epoch, batch_idx+1, len(train_data), 100. * batch_idx / len(train_data), 
                float(loss), round(sum(losses) / len(losses), 5)))

        bleu = evaluate(tokenizer, model, dev_data, batch_size, batch_status, device)
        print('BLEU:', bleu)
        
        if bleu > max_bleu:
            max_bleu = bleu
            repeat = 0

            print('Saving best model...')
            torch.save(model, write_path)
        else:
            repeat += 1

        if repeat == early_stop:
            break

Inicializando o modelo

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ROMANCE").to(device)
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ROMANCE")

Downloading:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/298M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/761k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Treinando

In [21]:
optimizer = optim.AdamW(model.parameters(), lr = learning_rate)

train(
    tokenizer, model, train_data, dev_data, optimizer, epochs, 
    batch_size, batch_status, device, early_stop, write_path
)

BLEU inicial: 0.392787426653673
BLEU: 0.4079722924288597
Saving best model...
BLEU: 0.4030857576580365


## Resultados

In [23]:
# sentenças a serem traduzidas
batch_input_str = (
    (">>pt_br<< Please, don't fail me now."), 
    (">>pt_br<< Who is a good translator? You are!."), 
    (">>pt_br<< I hope you are able to translate a big sentence, because people nowadays love texting. And I want to present this to my teacher and colleagues, so you have to work!"),
    (">>pt_br<< I really don't want to study tonight but I have to do it because I want to graduate and get a job and have a lot of money.")
)

# tokenizando as sentenças
encoded = tokenizer(batch_input_str, return_tensors = 'pt', padding = True).to(device)

# traduzindo
translated = model.generate(**encoded)

# preparando a saída
tokenizer.batch_decode(translated, skip_special_tokens = True)

['Por favor, não me falhem agora.',
 'Quem é um bom tradutor? Você é!.',
 'Espero que vocês consigam traduzir uma frase importante, porque hoje as pessoas adoram mensagens de texto. E eu quero apresentar isso ao meu professor e colegas, então vocês têm que trabalhar!',
 'Eu realmente não quero estudar hoje, mas eu tenho que fazer isso porque eu quero me formar e conseguir um emprego e ter muito dinheiro.']