### Notes 

T5 Paper: https://arxiv.org/pdf/1910.10683.pdf

T5 Tokenizer: https://github.com/huggingface/transformers/blob/master/src/transformers/tokenization_t5.py

Important Tasks: https://docs.google.com/document/d/1weIZM6QTlnitpPQmpg-WeV2RW70TnYmDuogBQPr5mB0/edit

In [1]:
#installation step
!pip install transformers
!pip install sentencepiece
#creating the folders 
!mkdir data/
!mkdir data/AD_NMT-master
!mkdir data/train
!mkdir data/test
!mkdir data/val
#fetching the pkl files
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V9crCmqvgQcv0Sx2MCNWB9AET2j6M6FW' -O data/AD_NMT-master/english-Arabic-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V8_tp8ZlWUYaX7QQL46t0uSRNrVehSf1' -O data/AD_NMT-master/english-Arabic-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1V7X0qtuDIyjTHY0wh-ZNoVwsiF4lId2e' -O data/AD_NMT-master/english-Arabic-train.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UzL4cOWTMCee83KBUh2QO_H62AFVpDQV' -O data/AD_NMT-master/LAV-MSA-2-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UpfCbkxhztof7dvNjeAs1bHjD4SER6h3' -O data/AD_NMT-master/LAV-MSA-2-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UlAZGtYsSfXzK7hrC_PbxQFqTSXD0DMw' -O data/AD_NMT-master/LAV-MSA-2-train.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UjDX7cCG2S23SPfSHxSPdVayMTxB5Y16' -O data/AD_NMT-master/Magribi_MSA-both.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UaVWIqRXo0rxuxDF4KArA4bEK1TaLX3l' -O data/AD_NMT-master/Magribi_MSA-test.pkl
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UYvlhdYAdfa4riP_4hn3-IEVd1ZUXVTQ' -O data/AD_NMT-master/Magribi_MSA-train.pkl

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |▋                               | 10kB 17.1MB/s eta 0:00:01[K     |█▏                              | 20kB 1.8MB/s eta 0:00:01[K     |█▊                              | 30kB 2.6MB/s eta 0:00:01[K     |██▎                             | 40kB 1.7MB/s eta 0:00:01[K     |███                             | 51kB 2.1MB/s eta 0:00:01[K     |███▌                            | 61kB 2.5MB/s eta 0:00:01[K     |████                            | 71kB 2.9MB/s eta 0:00:01[K     |████▋                           | 81kB 2.3MB/s eta 0:00:01[K     |█████▎                          | 92kB 2.5MB/s eta 0:00:01[K     |█████▉                          | 102kB 2.8MB/s eta 0:00:01[K     |██████▍                         | 112kB 2.8MB/s eta 0:00:01[K     |███████                         | 122kB 2.8M

In [0]:
#James Chartouni
#Joey Park
#Raef Khan

import torch
import pandas as pd
import numpy as np
import pickle
import os, io, glob

import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader

from torchtext.datasets.translation import TranslationDataset
from torchtext.data import Field, BucketIterator

import sentencepiece as spm

from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config, T5PreTrainedModel


from spacy.tokenizer import Tokenizer
from spacy.vocab import Vocab

from sklearn.model_selection import train_test_split


In [3]:
ls data/AD_NMT-master

english-Arabic-both.pkl   LAV-MSA-2-both.pkl   Magribi_MSA-both.pkl
english-Arabic-test.pkl   LAV-MSA-2-test.pkl   Magribi_MSA-test.pkl
english-Arabic-train.pkl  LAV-MSA-2-train.pkl  Magribi_MSA-train.pkl


In [0]:
file_path = 'data/AD_NMT-master/'

with open(file_path + "english-Arabic-train.pkl", 'rb') as handle:
    en_msa_pre_train_ds = pickle.load(handle) 
    
with open(file_path + "english-Arabic-test.pkl", 'rb') as handle:
    en_msa_test_ds = pickle.load(handle) 
    
with open(file_path + "LAV-MSA-2-train.pkl", 'rb') as handle:
    lav_msa_pre_train_ds = pickle.load(handle)
    
with open(file_path + "LAV-MSA-2-test.pkl", 'rb') as handle:
    lav_msa_test_ds = pickle.load(handle)    
    
with open(file_path + "LAV-MSA-2-both.pkl", 'rb') as handle:
    data_LAV_MSA = pickle.load(handle) 

with open(file_path + "english-Arabic-both.pkl", 'rb') as handle:
    data_English_MSA = pickle.load(handle) 
    
with open(file_path + "Magribi_MSA-both.pkl", 'rb') as handle:
    data_Magribi_MSA = pickle.load(handle) 
    

In [5]:
print(en_msa_pre_train_ds[0:5])
print(en_msa_test_ds[0:5])

[['Tom was also there', 'كان توم هنا ايضا'], ['That old woman lives by herself', 'تلك المراة العجوز تسكن بمفردها'], ['He went abroad for the purpose of studying English', 'سافر خارج البلد ليتعلم الانجليزية'], ['There is a fork missing', 'هناك شوكة ناقصة'], ["I don't know this game", 'لا اعرف هذه اللعبة']]
[["Where's your money?", 'اين مالك؟'], ['Be prepared', 'كن مستعدا'], ["I figured you'd be impressed", 'توقعت انك ستنبهر'], ['May I come in?', 'هل بامكاني الدخول؟'], ['Read through the article', 'اقرا المقالة']]


## Prepare Datasets

example: https://iwslt2010.fbk.eu/node/32/

We need to take our training and test sets from the pkl files and create new .txt files that are formatted so that the standard torchtext Dataset class can read them

Data format:
each line consists of three fields divided by the character '\'
sentences consisting of words divided by single spaces
format: <SENTENCE_ID>\<PARAPHRASE_ID>\<TEXT>
Field_1: sentence ID
Field_2: paraphrase ID
Field_3: MT develop sentence / reference translation
Text input example:
DEV_001\01\This is the first develop sentence.
DEV_002\01\This is the second develop sentence.
Reference translation example:
DEV_001\01\1st reference translation for 1st input
DEV_001\02\2nd reference translation for 1st input
...
DEV_002\01\1st reference translation for 2nd input
DEV_002\02\2nd reference translation for 2nd input
...
Languages:
Arabic-English
CSTAR03 testset: 506 sentences, 16 reference translations
IWSLT04 testset: 500 sentences, 16 reference translations
IWSLT05 testset: 506 sentences, 16 reference translations
IWSLT07 testset: 489 sentences, 6 reference translations
IWSLT08 testset: 507 sentences, 16 reference translations
French-English
CSTAR03 testset: 506 sentences, 16 reference translations
IWSLT04 testset: 500 sentences, 16 reference translations
IWSLT05 testset: 506 sentences, 16 reference translations
Turkish-English
CSTAR03 testset: 506 sentences, 16 reference translations
IWSLT04 testset: 500 sentences, 16 reference translations


In [6]:
ls data/AD_NMT-master/

english-Arabic-both.pkl   LAV-MSA-2-both.pkl   Magribi_MSA-both.pkl
english-Arabic-test.pkl   LAV-MSA-2-test.pkl   Magribi_MSA-test.pkl
english-Arabic-train.pkl  LAV-MSA-2-train.pkl  Magribi_MSA-train.pkl


In [0]:
file_path = 'data/'

def pytorch_format(ds, src='en', trg='msa',datatype=''):
    src_formatted = datatype + '.' + src
    trg_formatted = datatype + '.' + trg
    
    with open(file_path + datatype + "/" + src_formatted, 'wt') as srctxt, open(file_path + datatype + "/" + trg_formatted, 'wt') as trgtxt:
        for i, arr in enumerate(ds):
            srctxt.write(datatype.upper() + '_' + str(i).zfill( len(str(len(ds))) - len(str(i))) + '\\01\\' + arr[0] + '\n')
            trgtxt.write(datatype.upper() + '_' + str(i).zfill( len(str(len(ds))) - len(str(i))) + '\\01\\' + arr[1] + '\n')

In [0]:
#splits the datatsets into train and validation sets 
en_msa_train_ds, en_msa_val_ds = train_test_split(en_msa_pre_train_ds, test_size=.2)
lav_msa_train_ds, lav_msa_val_ds = train_test_split(lav_msa_pre_train_ds, test_size=.2)

In [59]:
print(len(en_msa_train_ds))
print(len(en_msa_val_ds))

7200
1800


In [0]:
pytorch_format(en_msa_train_ds, 'en', 'msa', 'train')
pytorch_format(en_msa_val_ds, 'en', 'msa', 'val')
pytorch_format(en_msa_test_ds, 'en', 'msa', 'test')

pytorch_format(lav_msa_train_ds, 'lav', 'msa', 'train')
pytorch_format(lav_msa_val_ds, 'lav', 'msa', 'val')
pytorch_format(lav_msa_test_ds, 'lav', 'msa', 'test')

In [0]:
cat data/train/train.en

In [0]:
cat data/train/train.msa

## Build Vocabulary 

Sentence Piece Google Colab
https://github.com/google/sentencepiece/blob/master/python/sentencepiece_python_module_example.ipynb

In [0]:
len(data_English_MSA)

In [0]:
"""
Create a text file with all the MSA vocab available for SentencePiece to create a library 
"""

text_file_en = open("data/english_data.txt", "wt")
text_file_msa = open("data/arabic_data.txt", "wt")

MSA_text = ""
EN_text = ""

for line in data_English_MSA:
        english = line[0]
        english_words = english.split(" ")
        for count, word in enumerate(english_words):
            text_file_en.write(word)
        text_file_en.write("\n")
        
        arabic = line[1]
        arabic_words = arabic.split(" ")
        for count, word in enumerate(arabic_words):
            text_file_msa.write(word)
        text_file_msa.write("\n")


text_file_en.close()
text_file_msa.close()

In [29]:
spm.SentencePieceTrainer.train('--input=data/arabic_data.txt --model_prefix=data/msa --vocab_size=2000')
spm.SentencePieceTrainer.train('--input=data/english_data.txt --model_prefix=data/en --vocab_size=2000')

True

In [30]:
sp = spm.SentencePieceProcessor()
sp.load('data/en.model')

True

In [0]:
ls

In [0]:
print(sp.encode_as_pieces('This is a test'))
print(sp.encode_as_ids('This is a test'))

## Spacy Tokenizer

In [0]:
ls data/

In [0]:
text_file_en = open("data/en.vocab", "r")
text_file_ar = open("data/msa.vocab", "r")

en_vocab_list = []
ar_vocab_list = []
for line in text_file_en.readlines():
    en_vocab_list.append(line.split("\t")[0])

for line in text_file_ar.readlines():
    # print(line)
    # print(line.split("\t")[0])
    # print("---------")
    ar_vocab_list.append(line.split("\t")[0])

In [0]:
ar_vocab_list[0:200]

In [0]:
text_file_en = open("data/en.vocab", "r")
text_file_ar = open("data/msa.vocab", "r")

line = text_file_en.readlines()

In [0]:
en_vocab = Vocab(strings=en_vocab_list)
spacy_en_tokenizer = Tokenizer(en_vocab)

ar_vocab = Vocab(strings=ar_vocab_list)
spacy_msa_tokenizer = Tokenizer(ar_vocab)

## TF Tokenizer

https://huggingface.co/transformers/model_doc/t5.html#t5tokenizer

In [0]:
ls data/

In [0]:
msa_tokenizer = T5Tokenizer("data/msa.model")
en_tokenizer = T5Tokenizer("data/en.model")

In [0]:
input_ids = msa_tokenizer.encode('هل بامكاني الدخول؟ </s>', return_tensors='pt')

In [0]:
input_ids

In [0]:
#decode to make sure you can go back and forth between the encoding properly 
#@raef can you verify the Arabic?

In [0]:
ls data/train

## Pytorch Data Set and Data Loader

https://github.com/google-research/text-to-text-transfer-transformer

pytorch dataset: https://pytorch.org/text/_modules/torchtext/datasets/translation.html

pytorch dataset documentation: https://torchtext.readthedocs.io/en/latest/datasets.html#iwslt

example dataset: https://iwslt2010.fbk.eu/node/32/

Field API: https://pytorch.org/text/data.html#torchtext.data.Field
Field Source: https://pytorch.org/text/_modules/torchtext/data/field.html#Field

https://pytorch.org/text/_modules/torchtext/data/utils.html#get_tokenizer
    
we need to pass our tokenizer as a function     

In [0]:
SRC = Field(tokenize = spacy_en_tokenizer,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = False)

TRG = Field(tokenize = spacy_msa_tokenizer,
            init_token = '<sos>',
            eos_token = '<eos>',
            lower = False)

In [0]:
train_dataset = TranslationDataset(path='data/train/train.', exts=('en', 'msa'), fields=(SRC, TRG))

In [0]:
SRC.build_vocab(dataset)
TRG.build_vocab(dataset)

In [0]:
ex = dataset[0]

In [64]:
print(ex.src) 
print(ex.trg)

TRAIN_000\01\They are proud of their daughter
TRAIN_0000\01\هذه الرسالة منقولة يا ايمان


In [65]:
ls data/train

train.en  train.lav  train.msa


In [0]:
#train, validation and test are probably wrong.  
#https://github.com/pytorch/text/blob/master/torchtext/datasets/translation.py

'''
Arguments:
            exts: A tuple containing the extension to path for each language.
            fields: A tuple containing the fields that will be used for data
                in each language.
            path (str): Common prefix of the splits' file paths, or None to use
                the result of cls.download(root).
            root: Root dataset storage directory. Default is '.data'.
            train: The prefix of the train data. Default: 'train'.
            validation: The prefix of the validation data. Default: 'val'.
            test: The prefix of the test data. Default: 'test'.
            Remaining keyword arguments: Passed to the splits method of
                Dataset.

'''


train_data, valid_data, test_data = dataset.splits(path= 'data/', train='train/train', validation='val/val', test='test/test', exts=('.en', '.msa'),
                                                    fields = (SRC, TRG))

In [0]:
BATCH_SIZE =32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
    device = device)

## Model Training

https://huggingface.co/transformers/model_doc/t5.html#training

In [0]:
configuration = T5Config()

In [0]:
model = T5PreTrainedModel.from_pretrained('t5-base')

In [96]:
model
model.init_weights()
optimizer = optim.AdamW(model.parameters())
pad_idx = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

0


ValueError: ignored

In [0]:
def train(model:nn.Module, 
          iterator:BucketIterator, 
          optimizer: optim.Optimizer,
          criterion:nnCrossEntropyLoss, 
          clip: float):
  model.train()
  epoch_loss = 0
  for i, batch in enumerate(train_iterator):
    src = batch.src
    trg = batch.trg

    optimizer.zero_grad()

    output = model(src, trg)
    loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1))
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    epoch_loss += loss.item()
  return epoch_loss / len(iterator)

In [0]:
def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):

    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg
            output = model(src, trg, 0) #turn off teacher forcing
            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)
            loss = criterion(output, trg)
            epoch_loss += loss.item()
    return epoch_loss / len(iterator)

In [0]:
import time

In [0]:
def epoch_time(start_time: int,
               end_time: int):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

## Model Inference

https://github.com/huggingface/transformers/blob/master/examples/translation/t5/evaluate_wmt.py