In [1]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
from datasets import load_dataset, load_dataset_builder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup
import tqdm
from tqdm import tqdm_notebook

In [3]:
os.environ['CUDA_VISIBLE_DEVICES'] ='0'

In [4]:
## Model and Tokenizer Download

In [5]:
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained('Davlan/mt5_base_yor_eng_mt')

In [6]:
model = model.cuda()

In [7]:
# max sequence length allowed by model
max_seq_len = model.config.max_length

In [8]:
test_string = "Akọni ajìjàgbara obìnrin tó sun àtìmalé torí owó orí"

In [9]:
inputs = tokenizer.encode(test_string, return_tensors="pt")

tokens = tokenizer.convert_ids_to_tokens(inputs[0])

# tokenized string
inputs

tensor([[ 3994,  2807,   266,  1479,  1135, 16816,   318, 10102,   999, 56302,
          2280, 31029,  5693,   259,   369,  9164,  3130,   361,   288, 28821,
           259,   268, 39574,   631,   420,     1]])

In [10]:
#token ids
tokens

['▁Ak',
 'ọn',
 'i',
 '▁aj',
 'ì',
 'jà',
 'g',
 'bara',
 '▁ob',
 'ìn',
 'rin',
 '▁tó',
 '▁sun',
 '▁',
 'à',
 'tì',
 'mal',
 'é',
 '▁to',
 'rí',
 '▁',
 'o',
 'wó',
 '▁or',
 'í',
 '</s>']

In [11]:
# Dataset Source - https://huggingface.co/datasets/menyo20k_mt
dataset = load_dataset('menyo20k_mt')
dataset



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 10070
    })
})

In [12]:
train_dataset = dataset['train']

In [13]:
train_dataset[0]['translation']

{'en': 'Unit 1: What is Creative Commons?',
 'yo': '\ufeffÌdá 1: Kín ni Creative Commons?'}

In [14]:
MAPPING = {
    'en': "<en>",
    'yo': "<yo>"
}

In [15]:
token_ids = tokenizer.encode(
    test_string, return_tensors='pt',
    padding='max_length', truncation=True,
    max_length=max_seq_len
)

In [16]:
token_ids

tensor([[ 3994,  2807,   266,  1479,  1135, 16816,   318, 10102,   999, 56302,
          2280, 31029,  5693,   259,   369,  9164,  3130,   361,   288,     1]])

In [17]:
def encode_input(text, target_lang, tokenizer,
                 seq_len, lang_token_map=MAPPING):
  
  # Accessing target language token from dict
  target_lang_token = lang_token_map[target_lang]

  # Tokenize
  # tokenize the text using a padding or truncating if need be
  input = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding='max_length',
      truncation=True,
      max_length = max_seq_len
  )

  return input[0]



def encode_target(
    text, tokenizer,
    seq_len, lang_token_map=MAPPING):
  
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      max_length = max_seq_len,
      truncation = True,
      padding = 'max_length'
  )
  return token_ids[0]


def format_data(translation, token_map, tokenizer, seq_len=128):
  langs = tuple(token_map.keys())

  input_lang, target_lang = langs

  input_text = translation[input_lang]
  target_text = translation[target_lang]

  if input_text is None or target_text is None:
    return None


  input_tokens = encode_input(
      input_text,
      target_lang,
      tokenizer,
      seq_len,
      token_map
  )

  target_tokens = encode_target(
      target_text,
      tokenizer,
      seq_len,
      token_map
  )

  return input_tokens, target_tokens

def batch_transform(batch, token_map, tokenizer):

  inputs = []
  targets = []

  for _set in batch['translation']:
    formatted_data = format_data(
        _set,
        token_map,
        tokenizer 
    )
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data

    # Returns a new tensor with a dimension of size one inserted
    # at the specified position.
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))

    batch_input_ids = torch.cat(inputs).cuda()
    batch_target_ids = torch.cat(targets).cuda()

    # batch_input_ids, batch_target_ids

    return batch_input_ids, batch_target_ids

# data generator
def data_generator(dataset, token_map, tokenizer, batch_size=20):
  dataset = dataset.shuffle()

  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i: i + batch_size]
    yield batch_transform(raw_batch, token_map, tokenizer)

In [18]:
input_ids, target_ids = format_data(
    train_dataset[1]['translation'],
    MAPPING,
    tokenizer
)

In [19]:
input_ids

tensor([ 1042,  1651,   669, 13673,  2404,   339, 45505,   285,  1711,   259,
          262, 37669, 54854,   298, 85834, 21973,  4265, 71006,   260,     1])

In [20]:
target_ids

tensor([  336, 40067,  3213,   259,   276,  1135,   420,   414,   369,  4542,
        21665,  3213,   259,   369, 40067, 37669, 54854,   298, 85834,     1])

In [21]:
" ".join(tokenizer.convert_ids_to_tokens(input_ids))

'▁< yo > This ▁work ▁is ▁license d ▁under ▁ a ▁Creative ▁Commons ▁A ttribution ▁4.0 ▁International ▁License . </s>'

In [22]:
" ".join(tokenizer.convert_ids_to_tokens(target_ids))

'▁I ṣẹ ́ ▁ y ì í ▁w à ▁lá bẹ ́ ▁ à ṣẹ ▁Creative ▁Commons ▁A ttribution </s>'

In [23]:
data_gen = data_generator(train_dataset, MAPPING, tokenizer, 8)

data_batch = next(data_gen)

In [24]:
data_batch

(tensor([[  1042,   1651,    669,   2594,    287,    359, 167342,    304,  55324,
             263,   5224,  78191,  23694,   2733,    484,   5666,   1371,    910,
            1001,      1]], device='cuda:0'),
 tensor([[ 78191,  23694,    871,    742,    369,   7640,    369,   1444,    259,
             369,  34891,  28569,    259,   9360,   9251,    259, 113229,    261,
             259,      1]], device='cuda:0'))

In [25]:
n_epochs = 5
batch_size = 16
print_freq = 50
lr = 5e-4
n_batches = int(np.ceil(len(train_dataset) / batch_size))
total_steps = n_epochs * n_batches
n_warmup_steps = int(total_steps * 0.01)

In [26]:
# model Optimizer
optimizer = torch.optim.AdamW(
    model.parameters(), lr=lr)

scheduler = get_linear_schedule_with_warmup(
    optimizer, n_warmup_steps, total_steps
)

In [27]:
losses = []

In [28]:
for epoch_idx in range(n_epochs):
  data_gen = data_generator(train_dataset, MAPPING,
                            tokenizer, batch_size)

  for batch_idx, (input_batch, target_batch) in tqdm_notebook(enumerate(data_gen),total=n_batches):

    optimizer.zero_grad()

    # forward pass
    model_out = model.forward(
        input_ids = input_batch,
        labels = target_batch
    )

    loss = model_out.loss
    losses.append(loss.item())

    loss.backward()
    optimizer.step()
    scheduler.step()

    # printing training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print(f"Epoch: {epoch_idx + 1} | Step: {batch_idx + 1} | Avg.loss{avg_loss:.3f} | lr: {scheduler.get_last_lr()[0]}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/630 [00:00<?, ?it/s]

Epoch: 1 | Step: 50 | Avg.loss5.164 | lr: 0.0004969541519717858
Epoch: 1 | Step: 100 | Avg.loss4.483 | lr: 0.0004889387624238538
Epoch: 1 | Step: 150 | Avg.loss3.761 | lr: 0.0004809233728759218
Epoch: 1 | Step: 200 | Avg.loss3.690 | lr: 0.00047290798332798973
Epoch: 1 | Step: 250 | Avg.loss3.582 | lr: 0.00046489259378005774
Epoch: 1 | Step: 300 | Avg.loss3.358 | lr: 0.0004568772042321257
Epoch: 1 | Step: 350 | Avg.loss3.445 | lr: 0.0004488618146841937
Epoch: 1 | Step: 400 | Avg.loss3.839 | lr: 0.0004408464251362616
Epoch: 1 | Step: 450 | Avg.loss3.509 | lr: 0.0004328310355883296
Epoch: 1 | Step: 500 | Avg.loss3.432 | lr: 0.00042481564604039755
Epoch: 1 | Step: 550 | Avg.loss3.308 | lr: 0.00041680025649246555
Epoch: 1 | Step: 600 | Avg.loss3.405 | lr: 0.00040878486694453356


  0%|          | 0/630 [00:00<?, ?it/s]

Epoch: 2 | Step: 50 | Avg.loss3.106 | lr: 0.0003959602436678423
Epoch: 2 | Step: 100 | Avg.loss2.963 | lr: 0.00038794485411991025
Epoch: 2 | Step: 150 | Avg.loss3.089 | lr: 0.0003799294645719782
Epoch: 2 | Step: 200 | Avg.loss3.130 | lr: 0.00037191407502404616
Epoch: 2 | Step: 250 | Avg.loss2.985 | lr: 0.00036389868547611416
Epoch: 2 | Step: 300 | Avg.loss2.981 | lr: 0.0003558832959281821
Epoch: 2 | Step: 350 | Avg.loss2.934 | lr: 0.00034786790638025006
Epoch: 2 | Step: 400 | Avg.loss2.685 | lr: 0.00033985251683231807
Epoch: 2 | Step: 450 | Avg.loss5.204 | lr: 0.000331837127284386
Epoch: 2 | Step: 500 | Avg.loss5.193 | lr: 0.00032382173773645403
Epoch: 2 | Step: 550 | Avg.loss5.077 | lr: 0.0003158063481885219
Epoch: 2 | Step: 600 | Avg.loss4.805 | lr: 0.00030779095864058993


  0%|          | 0/630 [00:00<?, ?it/s]

Epoch: 3 | Step: 50 | Avg.loss4.622 | lr: 0.00029496633536389873
Epoch: 3 | Step: 100 | Avg.loss4.602 | lr: 0.0002869509458159666
Epoch: 3 | Step: 150 | Avg.loss4.377 | lr: 0.00027893555626803463
Epoch: 3 | Step: 200 | Avg.loss4.199 | lr: 0.00027092016672010264
Epoch: 3 | Step: 250 | Avg.loss3.836 | lr: 0.0002629047771721706
Epoch: 3 | Step: 300 | Avg.loss4.515 | lr: 0.00025488938762423854
Epoch: 3 | Step: 350 | Avg.loss3.782 | lr: 0.0002468739980763065
Epoch: 3 | Step: 400 | Avg.loss4.167 | lr: 0.0002388586085283745
Epoch: 3 | Step: 450 | Avg.loss3.461 | lr: 0.00023084321898044245
Epoch: 3 | Step: 500 | Avg.loss3.519 | lr: 0.00022282782943251043
Epoch: 3 | Step: 550 | Avg.loss3.581 | lr: 0.00021481243988457838
Epoch: 3 | Step: 600 | Avg.loss3.577 | lr: 0.00020679705033664636


  0%|          | 0/630 [00:00<?, ?it/s]

Epoch: 4 | Step: 50 | Avg.loss3.194 | lr: 0.00019397242705995513
Epoch: 4 | Step: 100 | Avg.loss3.423 | lr: 0.00018595703751202308
Epoch: 4 | Step: 150 | Avg.loss3.306 | lr: 0.00017794164796409106
Epoch: 4 | Step: 200 | Avg.loss3.557 | lr: 0.00016992625841615904
Epoch: 4 | Step: 250 | Avg.loss3.334 | lr: 0.00016191086886822701
Epoch: 4 | Step: 300 | Avg.loss3.267 | lr: 0.00015389547932029496
Epoch: 4 | Step: 350 | Avg.loss3.179 | lr: 0.00014588008977236294
Epoch: 4 | Step: 400 | Avg.loss3.226 | lr: 0.0001378647002244309
Epoch: 4 | Step: 450 | Avg.loss3.185 | lr: 0.00012984931067649887
Epoch: 4 | Step: 500 | Avg.loss3.189 | lr: 0.00012183392112856685
Epoch: 4 | Step: 550 | Avg.loss3.064 | lr: 0.00011381853158063482
Epoch: 4 | Step: 600 | Avg.loss3.261 | lr: 0.0001058031420327028


  0%|          | 0/630 [00:00<?, ?it/s]

Epoch: 5 | Step: 50 | Avg.loss3.116 | lr: 9.297851875601154e-05
Epoch: 5 | Step: 100 | Avg.loss3.150 | lr: 8.496312920807952e-05
Epoch: 5 | Step: 150 | Avg.loss3.081 | lr: 7.694773966014748e-05
Epoch: 5 | Step: 200 | Avg.loss3.198 | lr: 6.893235011221545e-05
Epoch: 5 | Step: 250 | Avg.loss3.024 | lr: 6.0916960564283426e-05
Epoch: 5 | Step: 300 | Avg.loss3.076 | lr: 5.29015710163514e-05
Epoch: 5 | Step: 350 | Avg.loss2.939 | lr: 4.488618146841936e-05
Epoch: 5 | Step: 400 | Avg.loss2.999 | lr: 3.687079192048734e-05
Epoch: 5 | Step: 450 | Avg.loss3.072 | lr: 2.8855402372555306e-05
Epoch: 5 | Step: 500 | Avg.loss3.060 | lr: 2.0840012824623278e-05
Epoch: 5 | Step: 550 | Avg.loss2.821 | lr: 1.2824623276691248e-05
Epoch: 5 | Step: 600 | Avg.loss2.976 | lr: 4.809233728759218e-06


In [None]:
# train_dataset

In [42]:
# english = []
# yoruba = []

In [43]:
# for i in train_dataset['translation']:
#   yoruba.append(i['yo'])
#   english.append(i['en'])

In [47]:
# yoruba