In [5]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 33.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 57.9 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 66.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 35.3 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 69.8 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp

In [6]:
from datasets import load_dataset, load_dataset_builder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

In [None]:
## Model and Tokenizer Download

In [7]:
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained('Davlan/mt5_base_yor_eng_mt')

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/673 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [8]:
max_seq_len = model.config.max_length

In [9]:
test_string = "Akọni ajìjàgbara obìnrin tó sun àtìmalé torí owó orí"

In [10]:
inputs = tokenizer.encode(test_string, return_tensors="pt")

tokens = tokenizer.convert_ids_to_tokens(inputs[0])

In [11]:
tokens

['▁Ak',
 'ọn',
 'i',
 '▁aj',
 'ì',
 'jà',
 'g',
 'bara',
 '▁ob',
 'ìn',
 'rin',
 '▁tó',
 '▁sun',
 '▁',
 'à',
 'tì',
 'mal',
 'é',
 '▁to',
 'rí',
 '▁',
 'o',
 'wó',
 '▁or',
 'í',
 '</s>']

In [12]:
# Dataset Source - https://huggingface.co/datasets/menyo20k_mt
dataset = load_dataset('menyo20k_mt')
dataset

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading and preparing dataset menyo20k_mt/menyo20k_mt (download: 2.38 MiB, generated: 2.43 MiB, post-processed: Unknown size, total: 4.81 MiB) to /root/.cache/huggingface/datasets/menyo20k_mt/menyo20k_mt/1.0.0/96c9c82d2a5afc5726b868d436c0b8ae3eb7cbeea393e76b70cb3ded479d0376...


Downloading data:   0%|          | 0.00/822k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10070 [00:00<?, ? examples/s]

Dataset menyo20k_mt downloaded and prepared to /root/.cache/huggingface/datasets/menyo20k_mt/menyo20k_mt/1.0.0/96c9c82d2a5afc5726b868d436c0b8ae3eb7cbeea393e76b70cb3ded479d0376. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 10070
    })
})

In [13]:
train_dataset = dataset['train']

In [29]:
dataset['train']['translation'][:8000]

[{'en': 'Unit 1: What is Creative Commons?',
  'yo': '\ufeffÌdá 1: Kín ni Creative Commons?'},
 {'en': 'This work is licensed under a Creative Commons Attribution 4.0 International License.',
  'yo': 'Iṣẹ́ yìí wà lábẹ́ àṣẹ Creative Commons Attribution 4.0 International License.'},
 {'en': 'Creative Commons is a set of legal tools, a nonprofit organization, as well as a global network and a movement — all inspired by people’s willingness to share their creativity and knowledge, and enabled by a set of open copyright licenses.',
  'yo': 'Creative Commons jẹ́ àwọn ọ̀kan-ò-jọ̀kan ohun-èlò ajẹmófin, iléeṣẹ́ àìlérèlórí, àti àjọ àwọn ènìyàn eléròǹgbà kan náà kárí àgbáńlá ayé— tí í ṣe ìmísí àwọn ènìyànkan tí ó ní ìfẹ́ tinútinú láti pín àwọn iṣẹ́-àtinúdá àti ìmọ̀ wọn èyí tí ó ní àtìlẹ́yìn àwọn ọ̀kan-ò-jọ̀kan àṣẹ ìṣísílẹ̀-gbangba-wálíà fún àtúnlò.'},
 {'en': 'Creative Commons began in response to an outdated global copyright legal system.',
  'yo': 'Creative Commons bẹ̀rẹ̀ láti wá wọ̀rọ̀kọ̀ fi ṣ

In [14]:
train_dataset[0]['translation']

{'en': 'Unit 1: What is Creative Commons?',
 'yo': '\ufeffÌdá 1: Kín ni Creative Commons?'}

In [15]:
MAPPING = {
    'en': "<en>",
    'yo': "<yo>"
}

In [16]:
model.config.max_length

20

In [17]:
token_ids = tokenizer.encode(
    test_string, return_tensors='pt',
    padding='max_length', truncation=True,
    max_length=max_seq_len
)

In [18]:
token_ids

tensor([[ 3994,  2807,   266,  1479,  1135, 16816,   318, 10102,   999, 56302,
          2280, 31029,  5693,   259,   369,  9164,  3130,   361,   288,     1]])

In [19]:
def encode_input(text, target_lang, tokenizer,
                 seq_len, lang_token_map=MAPPING):
  
  # Accessing target language token from dict
  target_lang_token = lang_token_map[target_lang]

  # Tokenize
  # tokenize the text using a padding or truncating if need be
  input = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding='max_length',
      truncation=True,
      max_length = max_seq_len
  )

  return input[0]



def encode_target(
    text, tokenizer,
    seq_len, lang_token_map=MAPPING):
  
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      max_length = max_seq_len,
      truncation = True,
      padding = 'max_length'
  )
  return token_ids[0]


def format_data(translation, token_map, tokenizer, seq_len=128):
  langs = list(token_map.keys())

  input_lang, target_lang = np.random.choice(langs, size=2, replace=False)

  input_text = translation[input_lang]
  target_text = translation[target_lang]

  if input_text is None or target_text is None:
    return None


  input_tokens = encode_input(
      input_text,
      target_lang,
      tokenizer,
      seq_len,
      token_map
  )

  target_tokens = encode_target(
      target_text,
      tokenizer,
      seq_len,
      token_map
  )

  return input_tokens, target_tokens

def batch_transform(batch, token_map, tokenizer):

  inputs = []
  targets = []

  for _set in batch['translation']:
    formatted_data = format_data(
        _set,
        token_map,
        tokenizer 
    )
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data

    # Returns a new tensor with a dimension of size one inserted
    # at the specified position.
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))

    batch_input_ids = torch.cat(inputs).cuda()
    batch_target_ids = torch.cat(targets).cuda()

    batch_input_ids, batch_target_ids

    return batch

# data generator
def data_generator(dataset, token_map, tokenizer, batch_size=20):
  dataset = dataset.shuffle()

  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i: i + batch_size]
    yield batch_transform(raw_batch, token_map, tokenizer)

In [20]:
input_ids, target_ids = format_data(
    train_dataset[1]['translation'],
    MAPPING,
    tokenizer
)

In [21]:
input_ids

tensor([ 1042,  1651,   669, 13673,  2404,   339, 45505,   285,  1711,   259,
          262, 37669, 54854,   298, 85834, 21973,  4265, 71006,   260,     1])

In [22]:
target_ids

tensor([  336, 40067,  3213,   259,   276,  1135,   420,   414,   369,  4542,
        21665,  3213,   259,   369, 40067, 37669, 54854,   298, 85834,     1])

In [23]:
" ".join(tokenizer.convert_ids_to_tokens(input_ids))

'▁< yo > This ▁work ▁is ▁license d ▁under ▁ a ▁Creative ▁Commons ▁A ttribution ▁4.0 ▁International ▁License . </s>'

In [24]:
" ".join(tokenizer.convert_ids_to_tokens(target_ids))

'▁I ṣẹ ́ ▁ y ì í ▁w à ▁lá bẹ ́ ▁ à ṣẹ ▁Creative ▁Commons ▁A ttribution </s>'

In [25]:
data_gen = data_generator(train_dataset, MAPPING, tokenizer, 8)

data_batch = next(data_gen)

In [26]:
data_batch

{'translation': [{'en': '“In Canada, we have never had anything like that, so sometimes we can forget that we are living in the last days.',
   'yo': 'Lórílẹ̀-èdè Kánádà tá a wà yìí, kò sóhun tó jọ bẹ́ẹ̀, téèyàn ò bá sì ṣọ́ra èèyàn lè má rántí pé ọjọ́ ìkẹyìn la wà yìí.'},
  {'en': 'It is for both of us, if one uses it, he/she lives it for the other person to use it.',
   'yo': 'Àwa méjì la jọ ni í, bí ẹnìkíní bá lò ó, á á gbéeélẹ̀ kẹ́nìkejì ó lò ó.'},
  {'en': 'All users are already enrolled in this class',
   'yo': 'Gbogbo àwọn olùṣàmúlò ni a ti forúkọ wọn sílẹ̀ ní yàrá ìkẹ́ẹ̀kọ́ yìí'},
  {'en': 'One does not use a sword to kill a snail.',
   'yo': 'A kì í fi idà pa ìgbín.'},
  {'en': '"""The president of Kazakhstan, Nursultan Nazarbayev, pardoned Teymur Akhmedov, and he was released from custody on April 4, 2018."',
   'yo': '"""Nursultan Nazarbayev tó jẹ́ ààrẹ orílẹ̀-èdè Kazakhstan ní kí wọ́n dá Teymur Akhmedov sílẹ̀, wọ́n sì mú un kúrò látìmọ́lé ní April 4, 2018."'},
  {'en': "Don'