In [23]:
!pip install transformers sentencepiece datasets
!pip install matplotlib




In [2]:
from datasets import load_dataset
from IPython.display import display
from IPython.html import widgets
#import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from torch import optim
from torch.nn import functional as F
from transformers import AdamW, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm_notebook
import pandas as pd

sns.set()

  warn("The `IPython.html` package has been deprecated since IPython 4.0. "


In [3]:
model_repo = 'google/mt5-small'
model_path = 'C:/Users/rohit/Documents/Coding/en-mr/models/mt5_translation.pt'
max_seq_len = 20

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_repo)

In [5]:
# Model description: https://huggingface.co/google/mt5-base
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
model = model.cuda()

In [6]:
token_ids = tokenizer.encode(
    '<mr> This will be translated to Japanese! (hopefully)',
    return_tensors='pt').cuda()
print(token_ids)

model_out = model.generate(token_ids)
print(model_out)

output_text = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(model_out[0]))
print(output_text)

tensor([[  1042,    282,    286,    669,   1494,    898,    390,  37194,    285,
            288,  30865,    309,    274, 116024,  11994,    271,      1]],
       device='cuda:0')
tensor([[     0, 250099,      1]], device='cuda:0')
<pad> <extra_id_0></s>


In [7]:
example_input_str = '<mr> This is just a test nbuig.'
# example_input_str = 'これは普通のテスト'
input_ids = tokenizer.encode(example_input_str, return_tensors='pt')
print('Input IDs:', input_ids)

tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print('Tokens:', tokens)

Input IDs: tensor([[1042,  282,  286,  669, 1494,  339, 1627,  259,  262, 2978,  259,  272,
         1982, 1315,  260,    1]])
Tokens: ['▁<', 'm', 'r', '>', '▁This', '▁is', '▁just', '▁', 'a', '▁test', '▁', 'n', 'bu', 'ig', '.', '</s>']


In [8]:
#sorted(tokenizer.vocab.items(), key=lambda x: x[1])[1000:10000]

In [9]:
# Source: https://huggingface.co/datasets/alt
#dataset = load_dataset('alt')

In [10]:
"""train_dataset = dataset['train']
test_dataset = dataset['test']"""

"train_dataset = dataset['train']\ntest_dataset = dataset['test']"

In [11]:
LANG_TOKEN_MAPPING = {
    'en': '<en>',
    'mr': '<mr>'
}

In [12]:
special_tokens_dict = {'additional_special_tokens': list(LANG_TOKEN_MAPPING.values())}
tokenizer.add_special_tokens(special_tokens_dict)
model.resize_token_embeddings(len(tokenizer))

Embedding(250102, 512)

In [13]:
example_input_str

'<mr> This is just a test nbuig.'

In [14]:
train_df = pd.read_csv("en-mr/train.tsv", sep="\t").astype(str)

#train_df["prefix"] = ""
train_df=train_df[::2]
train_df=train_df.rename(columns = {'input_text':'mr','target_text':'en'})

In [15]:
token_ids = tokenizer.encode(
    example_input_str, return_tensors='pt', padding='max_length',
    truncation=True, max_length=max_seq_len)
print(token_ids)

tokens = tokenizer.convert_ids_to_tokens(token_ids[0])
print(tokens)

tensor([[250101,   1494,    339,   1627,    259,    262,   2978,    259,    272,
           1982,   1315,    260,      1,      0,      0,      0,      0,      0,
              0,      0]])
['<mr>', '▁This', '▁is', '▁just', '▁', 'a', '▁test', '▁', 'n', 'bu', 'ig', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']


In [16]:
def encode_input_str(text, target_lang, tokenizer, seq_len,
                     lang_token_map=LANG_TOKEN_MAPPING):
  target_lang_token = lang_token_map[target_lang]

  # Tokenize and add special tokens
  input_ids = tokenizer.encode(
      text = target_lang_token + text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)

  return input_ids[0]
  
def encode_target_str(text, tokenizer, seq_len,
                      lang_token_map=LANG_TOKEN_MAPPING):
  token_ids = tokenizer.encode(
      text = text,
      return_tensors = 'pt',
      padding = 'max_length',
      truncation = True,
      max_length = seq_len)
  
  return token_ids[0]

def format_translation_data(translations, lang_token_map,
                            tokenizer, seq_len=128):
  # Choose a random 2 languages for in i/o
  langs = list(lang_token_map.keys())
  input_lang, target_lang = np.random.choice(langs, size=2, replace=False)

  # Get the translations for the batch
  input_text = translations[input_lang]
  target_text = translations[target_lang]

  if input_text is None or target_text is None:
    return None

  input_token_ids = encode_input_str(
      input_text, target_lang, tokenizer, seq_len, lang_token_map)
  
  target_token_ids = encode_target_str(
      target_text, tokenizer, seq_len, lang_token_map)

  return input_token_ids, target_token_ids

def transform_batch(batch, lang_token_map, tokenizer):
  inputs = []
  targets = []
  for i in range(0,len(batch)*2,2):
    #print(batch.loc[i])
    formatted_data = format_translation_data(
        batch.loc[i], lang_token_map, tokenizer, max_seq_len)
    
    if formatted_data is None:
      continue
    
    input_ids, target_ids = formatted_data
    inputs.append(input_ids.unsqueeze(0))
    targets.append(target_ids.unsqueeze(0))
    
  batch_input_ids = torch.cat(inputs).cuda()
  batch_target_ids = torch.cat(targets).cuda()

  return batch_input_ids, batch_target_ids

def get_data_generator(dataset, lang_token_map, tokenizer, batch_size=32):
  for i in range(0, len(dataset), batch_size):
    raw_batch = dataset[i:i+batch_size]
    yield transform_batch(raw_batch, lang_token_map, tokenizer)

In [17]:
# Testing `data_transform`
in_ids, out_ids = format_translation_data(
    train_df.loc[0], LANG_TOKEN_MAPPING, tokenizer)

#print(' '.join(tokenizer.convert_ids_to_tokens(in_ids)))
#print(' '.join(tokenizer.convert_ids_to_tokens(out_ids)))

# Testing data generator
data_gen = get_data_generator(train_df, LANG_TOKEN_MAPPING, tokenizer, 8)
data_batch = next(data_gen)
print('Input shape:', data_batch[0].shape)
print('Output shape:', data_batch[1].shape)

Input shape: torch.Size([8, 20])
Output shape: torch.Size([8, 20])


In [18]:
# Constants
n_epochs = 8
batch_size = 4
print_freq = 50
checkpoint_freq = 1000
lr = 5e-4
n_batches = int(np.ceil(len(train_df) / batch_size))
total_steps = n_epochs * n_batches
n_warmup_steps = int(total_steps * 0.01)

In [19]:
# Optimizer
optimizer = AdamW(model.parameters(), lr=lr)
scheduler = get_linear_schedule_with_warmup(
    optimizer, n_warmup_steps, total_steps)

In [20]:
losses = []

In [21]:
def eval_model(model, gdataset, max_iters=8):
  test_generator = get_data_generator(gdataset, LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
  eval_losses = []
  for i, (input_batch, label_batch) in enumerate(test_generator):
    if i >= max_iters:
      break

    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)
    eval_losses.append(model_out.loss.item())

  return np.mean(eval_losses)

In [22]:
for epoch_idx in range(1):
  # Randomize data order
  data_generator = get_data_generator(train_df[0:10000], LANG_TOKEN_MAPPING,
                                      tokenizer, batch_size)
                
  for batch_idx, (input_batch, label_batch) \
      in tqdm_notebook(enumerate(data_generator), total=n_batches):
    optimizer.zero_grad()

    # Forward pass
    model_out = model.forward(
        input_ids = input_batch,
        labels = label_batch)

    # Calculate loss and update weights
    loss = model_out.loss
    losses.append(loss.item())
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Print training update info
    if (batch_idx + 1) % print_freq == 0:
      avg_loss = np.mean(losses[-print_freq:])
      print('Epoch: {} | Step: {} | Avg. loss: {:.3f} | lr: {}'.format(
          epoch_idx+1, batch_idx+1, avg_loss, scheduler.get_last_lr()[0]))
      
    if (batch_idx + 1) % checkpoint_freq == 0:
      test_loss = eval_model(model, test_dataset)
      print('Saving model with test loss of {:.3f}'.format(test_loss))
      torch.save(model.state_dict(), model_path)

torch.save(model.state_dict(), model_path)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  in tqdm_notebook(enumerate(data_generator), total=n_batches):


HBox(children=(FloatProgress(value=0.0, max=739997.0), HTML(value='')))




RuntimeError: CUDA out of memory. Tried to allocate 490.00 MiB (GPU 0; 6.00 GiB total capacity; 4.07 GiB already allocated; 0 bytes free; 4.32 GiB reserved in total by PyTorch)

In [None]:
torch.save(model.state_dict(), model_path)

In [None]:
train_df=train_df[::2]
train_df.rename(columns = {'input_text':'mr','target_text':'en'})

In [None]:
train_df=train_df.rename(columns = {'input_text':'mr','target_text':'en'})

In [None]:
a=train_df.loc[0]

In [None]:
print(a)

In [None]:
train_df

In [None]:
for a in train_df[:1000]:
    print(A)

In [None]:
model.load_state_dict(torch.load(model_path))