<a href="https://colab.research.google.com/github/XVI-IX/yoruba-translate/blob/david/trial/trial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers sentencepiece datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.2-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 12.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 60.4 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.5.2-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 51.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.5 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.10.0-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 59.6 MB/s 
Collecting responses<0.19
  Downloading respon

In [15]:
from datasets import load_dataset, load_dataset_builder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook

In [3]:
tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")
model = MT5ForConditionalGeneration.from_pretrained('Davlan/mt5_base_yor_eng_mt')

Downloading:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/376 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/673 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [28]:
max_seq_len = model.config.max_length

In [4]:
test_string = "Akọni ajìjàgbara obìnrin tó sun àtìmalé torí owó orí"

In [9]:
inputs = tokenizer.encode(test_string, return_tensors="pt")

tokens = tokenizer.convert_ids_to_tokens(inputs[0])

In [10]:
tokens

['▁Ak',
 'ọn',
 'i',
 '▁aj',
 'ì',
 'jà',
 'g',
 'bara',
 '▁ob',
 'ìn',
 'rin',
 '▁tó',
 '▁sun',
 '▁',
 'à',
 'tì',
 'mal',
 'é',
 '▁to',
 'rí',
 '▁',
 'o',
 'wó',
 '▁or',
 'í',
 '</s>']

In [16]:
# Dataset Source - https://huggingface.co/datasets/menyo20k_mt
dataset = load_dataset('menyo20k_mt')
dataset

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

Downloading and preparing dataset menyo20k_mt/menyo20k_mt (download: 2.38 MiB, generated: 2.43 MiB, post-processed: Unknown size, total: 4.81 MiB) to /root/.cache/huggingface/datasets/menyo20k_mt/menyo20k_mt/1.0.0/96c9c82d2a5afc5726b868d436c0b8ae3eb7cbeea393e76b70cb3ded479d0376...


Downloading data:   0%|          | 0.00/822k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10070 [00:00<?, ? examples/s]

Dataset menyo20k_mt downloaded and prepared to /root/.cache/huggingface/datasets/menyo20k_mt/menyo20k_mt/1.0.0/96c9c82d2a5afc5726b868d436c0b8ae3eb7cbeea393e76b70cb3ded479d0376. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 10070
    })
})

In [20]:
train_dataset = dataset['train']

In [23]:
train_dataset[0]['translation']

{'en': 'Unit 1: What is Creative Commons?',
 'yo': '\ufeffÌdá 1: Kín ni Creative Commons?'}

In [24]:
MAPPING = {
    'en': "<en>",
    'yo': "<yo>"
}

In [27]:
model.config.max_length

20

In [31]:
token_ids = tokenizer.encode(
    test_string, return_tensors='pt',
    padding='max_length', truncation=True,
    max_length=max_seq_len
)

In [32]:
token_ids

tensor([[ 3994,  2807,   266,  1479,  1135, 16816,   318, 10102,   999, 56302,
          2280, 31029,  5693,   259,   369,  9164,  3130,   361,   288,     1]])

In [36]:
def encode_input(text, target_lang, tokenizer,
                 seq_len, lang_token_map=MAPPING):
  
  # Accessing target language token from dict
  target_lang_token = lang_token_map[target_lang]

  # Tokenize
  # tokenize the text using a padding or truncating if need be
  input = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding='max_length',
      truncation=True,
      max_length = max_seq_len
  )

  return input[0]



def encode_target(
    text, tokenizer,
    seq_len, lang_token_map=MAPPING):
  
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      max_length = max_seq_len,
      truncation = True,
      padding = 'max_length'
  )
  return token_ids[0]


def format_data(translation, token_map, tokenizer, seq_len=128):
  langs = list(token_map.keys())

  input_lang, target_lang = np.random.choice(langs, size=2, replace=False)

  input_text = translation[input_lang]
  target_text = translation[target_lang]

  if input_text is None or target_text is None:
    return None


  input_tokens = encode_input(
      input_text,
      target_lang,
      tokenizer,
      seq_len,
      token_map
  )

  target_tokens = encode_target(
      target_text,
      tokenizer,
      seq_len,
      token_map
  )

  return input_tokens, target_tokens

In [53]:
input_ids, target_ids = format_data(
    train_dataset[1]['translation'],
    MAPPING,
    tokenizer
)

In [54]:
input_ids

tensor([ 1042,   278,   669,   566, 40067,  3213,   259,   276,  1135,   420,
          414,   369,  4542, 21665,  3213,   259,   369, 40067, 37669,     1])

In [55]:
target_ids

tensor([ 1494,  2404,   339, 45505,   285,  1711,   259,   262, 37669, 54854,
          298, 85834, 21973,  4265, 71006,   260,     1,     0,     0,     0])

In [56]:
" ".join(tokenizer.convert_ids_to_tokens(input_ids))

'▁< en > I ṣẹ ́ ▁ y ì í ▁w à ▁lá bẹ ́ ▁ à ṣẹ ▁Creative </s>'

In [57]:
" ".join(tokenizer.convert_ids_to_tokens(target_ids))

'▁This ▁work ▁is ▁license d ▁under ▁ a ▁Creative ▁Commons ▁A ttribution ▁4.0 ▁International ▁License . </s> <pad> <pad> <pad>'