pip install galore (Your boring dependency installation)

In [1]:
# Install the third-party packages used by the cells below (no versions are pinned here).
# vncorenlp, transformers and pytorch-lightning are imported directly further down;
# sentencepiece is presumably required by the pretrained tokenizer - TODO confirm.
!pip3 install vncorenlp
!pip3 install transformers
!pip3 install sentencepiece
!pip3 install pytorch-lightning

Collecting vncorenlp
  Downloading vncorenlp-1.0.3.tar.gz (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 34.0 MB/s 
Building wheels for collected packages: vncorenlp
  Building wheel for vncorenlp (setup.py) ... [?25l[?25hdone
  Created wheel for vncorenlp: filename=vncorenlp-1.0.3-py3-none-any.whl size=2645951 sha256=74ee8b6433c6ea7b66a52b838d81f5df6154eeea91a8266692c734ed4cdf2c3a
  Stored in directory: /root/.cache/pip/wheels/0c/d8/f2/d28d97379b4f6479bf51247c8dfd57fa00932fa7a74b6aab29
Successfully built vncorenlp
Installing collected packages: vncorenlp
Successfully installed vncorenlp-1.0.3
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 27.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x8

Download VNCoreNLP (word segmenter feature only)

In [2]:
# Create the directory layout that the "./VnCoreNLP/VnCoreNLP-1.1.1.jar" path used later in this notebook points into
!mkdir -p VnCoreNLP/models/wordsegmenter
# Download the jar and the two word-segmenter model files into the current working directory
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
!wget https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/wordsegmenter.rdr
# Move each download into its place inside the VnCoreNLP/ tree
!mv VnCoreNLP-1.1.1.jar VnCoreNLP/ 
!mv vi-vocab VnCoreNLP/models/wordsegmenter/
!mv wordsegmenter.rdr VnCoreNLP/models/wordsegmenter/

--2022-05-22 06:59:42--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/VnCoreNLP-1.1.1.jar
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27412575 (26M) [application/octet-stream]
Saving to: ‘VnCoreNLP-1.1.1.jar’


2022-05-22 06:59:42 (246 MB/s) - ‘VnCoreNLP-1.1.1.jar’ saved [27412575/27412575]

--2022-05-22 06:59:42--  https://raw.githubusercontent.com/vncorenlp/VnCoreNLP/master/models/wordsegmenter/vi-vocab
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 526544 (514K) [application/octet-stream]
Saving to: ‘vi-voc

Mount my Google Drive to this machine (To load training data and save/load checkpoint)

In [3]:
from google.colab import drive

# Attach Google Drive under /content/gdrive; an existing mount is reused rather than remounted.
drive.mount('/content/gdrive', force_remount=False)

# Every project file (training data, checkpoints) is addressed relative to base_dir.
root_dir = "/content/gdrive/MyDrive/"
base_dir = f"{root_dir}ElainaModel/"

Mounted at /content/gdrive


Now the fun bit (Setting up the model as a PyTorch Lightning model)

In [4]:
import torch
from transformers import AutoModel, AutoTokenizer
from transformers import MBartForConditionalGeneration, AdamW, BartConfig, BartTokenizer, MBartTokenizer
from vncorenlp import VnCoreNLP

from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import torch.nn.functional as F
import pytorch_lightning as lightning
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

import math
import random
import re
import argparse

class ElainaModel(lightning.LightningModule):
    ''' PyTorch Lightning wrapper that fine-tunes a seq2seq model and generates text with it.

        Args: learning_rate - learning rate handed to the Adam optimizer
              tokenizer - tokenizer paired with the model; its pad_token_id is used for shifting the
                          decoder inputs, masking the loss and starting generation
              model - the conditional-generation model being wrapped
              hparams - mapping merged into self.hparams; 'freeze_encoder' and 'freeze_embeds' are read here
    '''
    def __init__(self, learning_rate, tokenizer, model, hparams):
      super().__init__()
      self.tokenizer = tokenizer
      self.model = model
      self.learning_rate = learning_rate
      self.hparams.update(hparams)

      if self.hparams.freeze_encoder:
        freeze_params(self.model.get_encoder())

      if self.hparams.freeze_embeds:
        self.freeze_embeds()

      print('constructor end')
    
    def freeze_embeds(self):
      ''' Freeze the shared embedding plus the positional and token embeddings of both the
          encoder and the decoder; adapted from finetune.py '''
      freeze_params(self.model.model.shared)
      for d in [self.model.model.encoder, self.model.model.decoder]:
        freeze_params(d.embed_positions)
        freeze_params(d.embed_tokens)

    # Do a forward pass through the model
    def forward(self, input_ids, **kwargs):
      return self.model(input_ids, **kwargs)
    
    def configure_optimizers(self):
      ''' Optimise every parameter of the module with Adam at self.learning_rate '''
      optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
      return optimizer

    def _compute_loss(self, batch):
      ''' Cross-entropy between the model logits and the target ids of one batch.
          batch is indexed positionally: [0] source ids, [1] source attention mask, [2] target ids.
          Shared by training_step and validation_step so both score batches the same way. '''
      src_ids, src_mask = batch[0], batch[1]
      tgt_ids = batch[2]
      pad_token_id = self.tokenizer.pad_token_id

      # Shift the decoder tokens right (but NOT the tgt_ids, which stay un-shifted as the labels)
      decoder_input_ids = shift_tokens_right(tgt_ids, pad_token_id)

      # Run the model and get the logits
      outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
      lm_logits = outputs[0]

      # Padding positions in the targets are excluded from the loss
      ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
      return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

    def training_step(self, batch, batch_idx):
      ''' Return the training loss of one batch under the 'loss' key '''
      loss = self._compute_loss(batch)
      return {'loss':loss}

    def validation_step(self, batch, batch_idx):
      ''' Compute the validation loss of one batch, log it as 'val_loss' and return it under the 'loss' key '''
      val_loss = self._compute_loss(batch)
      # Without an explicit log call the validation loss is computed but never reported
      self.log('val_loss', val_loss, prog_bar=True)
      return {'loss': val_loss}
    
    # Method that generates text using the wrapped model's generate() method
    def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40):
      ''' Function to generate text
          Args: text - mapping with "input_ids" and "attention_mask" entries (e.g. the tokenizer output)
                eval_beams - number of beams for beam search
                early_stopping - forwarded to generate()
                max_len - forwarded to generate() as max_length
          Returns a list with one decoded string per generated sequence (special tokens skipped)
      '''
      generated_ids = self.model.generate(
          text["input_ids"],
          attention_mask=text["attention_mask"],
          use_cache=True,
          decoder_start_token_id = self.tokenizer.pad_token_id,
          num_beams= eval_beams,
          max_length = max_len,
          early_stopping = early_stopping
      )
      return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes its parameters for faster training
      adapted from finetune.py
      Args: model - any object exposing parameters() (e.g. a torch.nn.Module)
  '''
  for param in model.parameters():
    # The attribute autograd reads is `requires_grad`; any other spelling just attaches an
    # unrelated attribute to the tensor and leaves the parameter trainable.
    param.requires_grad = False

def shift_tokens_right(input_ids, pad_token_id):
  """ Build decoder inputs from target ids: every row is moved one position to the right and the
      row's last non-pad token (usually <eos>) is wrapped around into position 0.
      The input tensor is left untouched; a new tensor is returned.
      This is taken directly from modeling_bart.py
  """
  # Column of the last non-pad token in each row, kept 2-D so it can drive gather()
  last_token_col = input_ids.ne(pad_token_id).sum(dim=1).sub(1).unsqueeze(-1)
  wrapped_tokens = input_ids.gather(1, last_token_col).squeeze()

  shifted = input_ids.clone()
  shifted[:, 1:] = input_ids[:, :-1]
  shifted[:, 0] = wrapped_tokens
  return shifted



Sentence encoding function to turn all text things into number things (using BART's tokenizer)

In [5]:
def _segment_and_tokenize(tokenizer, sentence, max_length, pad_to_max_length, return_tensors, rdrsegmenter):
  ''' Optionally word-segment one sentence, then run it through the tokenizer and return the tokenizer output '''
  if rdrsegmenter is not None:
    # The segmenter output is iterated as a list of token lists; flatten it back into one
    # space-separated string before tokenizing.
    sentence = ' '.join(' '.join(tokens) for tokens in rdrsegmenter.tokenize(sentence))
  return tokenizer(
      sentence,
      max_length=max_length,
      # False (not None) is the "do not pad" value of the tokenizer's padding argument
      padding="max_length" if pad_to_max_length else False,
      truncation=True,
      return_tensors=return_tensors,
  )

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=32, pad_to_max_length=True, return_tensors="pt", rdrsegmenter=None):
  ''' Function that tokenizes source and target sentences one by one and stacks the results
      Args: tokenizer - the BART tokenizer (called once per sentence)
            source_sentences / target_sentences - iterables of source and target strings
            max_length - truncation length (and padding length when pad_to_max_length is True)
            pad_to_max_length - pad every sentence to max_length; when False the per-sentence tensors
                                must already share a length for the concatenation to succeed
            return_tensors - forwarded to the tokenizer; the results are concatenated with torch.cat
            rdrsegmenter - optional word segmenter exposing tokenize(); applied before tokenizing
      Returns: Dictionary with keys: input_ids, attention_mask (both from the sources) and labels
               (the un-shifted input ids of the targets)
  '''
  source_encodings = [
      _segment_and_tokenize(tokenizer, sentence, max_length, pad_to_max_length, return_tensors, rdrsegmenter)
      for sentence in source_sentences
  ]
  target_encodings = [
      _segment_and_tokenize(tokenizer, sentence, max_length, pad_to_max_length, return_tensors, rdrsegmenter)
      for sentence in target_sentences
  ]

  batch = {
      "input_ids": torch.cat([enc['input_ids'] for enc in source_encodings], dim = 0),
      "attention_mask": torch.cat([enc['attention_mask'] for enc in source_encodings], dim = 0),
      # Targets are stored un-shifted; shifting for the decoder happens at training time
      "labels": torch.cat([enc['input_ids'] for enc in target_encodings], dim = 0),
  }

  return batch

Data loading class where we read the .csv file and (after the totally optional word segmenting task) encode it into tensors (lots of 'em)

In [6]:
# Create a dataloading module as per the PyTorch Lightning Docs
class SummaryDataModule(lightning.LightningDataModule):
  ''' Data module that reads a .csv file with 'source' and 'target' columns, splits it 60/20/20 into
      train/validation/test sets and serves each set as tensors through a DataLoader.

      Args: tokenizer - tokenizer handed to encode_sentences
            data_file - path of the .csv file
            batch_size - batch size of every DataLoader
            num_examples - only the first num_examples rows of the file are used
            use_segmenter - when True, sentences are word-segmented with VnCoreNLP before tokenizing
  '''
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 20000, use_segmenter = False):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples

    # prepare_data() and setup() may each be invoked more than once (e.g. fitting and then testing
    # with the same data module); these flags make the second invocation a no-op.
    self._is_split = False
    self._is_encoded = False

    if use_segmenter:
      self.rdrsegmenter = VnCoreNLP("./VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')
    else:
      self.rdrsegmenter = None
  
  # Loads and splits the data into training, validation and test sets with a 60/20/20 split
  def prepare_data(self):
    if self._is_split:
      # Re-splitting would reshuffle the rows and mix examples between the sets
      return
    self.data = pd.read_csv(self.data_file)[:self.num_examples]
    num_rows = len(self.data)
    self.train, self.validate, self.test = np.split(self.data.sample(frac=1), [int(.6*num_rows), int(.8*num_rows)])
    self._is_split = True

  def _encode_split(self, frame):
    ''' Tokenize the 'source' and 'target' columns of one split '''
    return encode_sentences(self.tokenizer, frame['source'], frame['target'], rdrsegmenter = self.rdrsegmenter)

  # encode the sentences using the tokenizer  
  def setup(self, stage = None):
    if self._is_encoded:
      # The splits were already replaced by their encoded form; encoding them again would fail
      # because they no longer have 'source'/'target' columns.
      return
    self.train = self._encode_split(self.train)
    self.validate = self._encode_split(self.validate)
    self.test = self._encode_split(self.test)
    self._is_encoded = True

  @staticmethod
  def _as_dataset(encoded):
    ''' Wrap one encoded split as a TensorDataset of (input_ids, attention_mask, labels) '''
    return TensorDataset(encoded['input_ids'], encoded['attention_mask'], encoded['labels'])

  # Load the training, validation and test sets in Pytorch Dataset objects
  def train_dataloader(self):
    dataset = self._as_dataset(self.train)
    # Only the training set is sampled in random order
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data

  def val_dataloader(self):
    val_data = DataLoader(self._as_dataset(self.validate), batch_size = self.batch_size)
    return val_data

  def test_dataloader(self):
    test_data = DataLoader(self._as_dataset(self.test), batch_size = self.batch_size)
    return test_data


The part where we can actually use our model, which includes the noise generator (for less predictable behavior) and the generation function

In [12]:

def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  '''
  Function that noises a sentence by swapping randomly chosen words for a replacement token
  Args: sentence_ - the sentence to noise (words are split on single spaces)
        percent_words - the fraction of words to replace; the count is rounded up using math.ceil
                        and capped at the number of eligible words
        replacement_token - the token written in place of each noised word
  Returns a noised sentence in which runs of spaces and runs of adjacent replacement tokens
  are collapsed into one
  '''
  words = sentence_.split(' ')

  # Don't include the last word as an option because in the case of lyrics that word is often a
  # rhyming word and plays an important role in song construction. A one-word sentence keeps
  # position 0 as its only candidate.
  num_candidates = max(1, len(words) - 1)

  # Cap the request at the candidate count (and at 0 from below): random.sample raises when asked
  # for more items than the population holds.
  num_words = min(num_candidates, max(0, math.ceil(len(words) * percent_words)))

  # random.sample needs a sequence; sampling from a set is rejected by newer Python versions
  words_to_noise = random.sample(range(num_candidates), num_words)

  # Swap out words, but not full stops
  for pos in words_to_noise:
    if words[pos] != '.':
      words[pos] = replacement_token

  # Remove redundant spaces
  sentence = re.sub(r' {2,}', ' ', ' '.join(words))

  # Combine any run of consecutive replacement tokens into a single token. The token is escaped so
  # that custom tokens containing regex metacharacters work, and the substitution goes through a
  # function so backslashes in the token are not treated as escapes.
  token_pattern = re.escape(replacement_token)
  sentence = re.sub(r'(?:%s )+%s' % (token_pattern, token_pattern), lambda _match: replacement_token, sentence)
  return sentence

def generate_response(seed_line, num_lines, model_, noise_percent = 0.25, multiple_lines = False, max_line_history = 3, rdrsegmenter = None):
  ''' Function that generates lines based on a seed line and previously collected lines
      Args: seed_line - a line to start off the machine
            num_lines - the number of loop iterations to run (one generation each)
            model_ - the model used to generate the text; must expose tokenizer and generate_text()
            noise_percent - fraction of prompt words replaced by <mask> before every generation
            multiple_lines - whether the model generates based on multiple previous lines or just one line
            max_line_history - the maximum number of previous lines used in the current input
            rdrsegmenter - optional word segmenter applied to seed_line before anything else
      Returns a list of num_lines + 1 strings: the seed line, the seed line again (cleaned), then the
      generated lines. The line generated in the final iteration is not part of the list.
  '''
  # Put the model on eval mode
  model_.to(torch.device('cpu'))
  model_.eval()
  # Tokenize with the tokenizer that belongs to the model being queried rather than a notebook global
  tokenizer_ = model_.tokenizer

  def encode_prompt(text):
    # Noise the prompt so repeated calls with the same text can produce different outputs
    return tokenizer_(noise_sentence(text, noise_percent), max_length = 32, return_tensors = "pt", truncation = True)

  if rdrsegmenter is not None:
    seed_line = ' '.join([' '.join(x) for x in rdrsegmenter.tokenize(seed_line)])
  dialog = []
  dialog.append(seed_line)
  prompt_line_tokens = encode_prompt(seed_line)

  # Loop through the number of lines generating a new line based on the old
  line = [seed_line]
  for i in range(num_lines):
    # Record the line produced by the previous iteration (the seed line on the first pass)
    entry = line[0].strip().replace('< s >', '').replace('< / s >', '')
    dialog.append(entry)
    line = model_.generate_text(prompt_line_tokens, eval_beams = 4)
    # This deals with an artefact in the training data that I had an issue cleaning
    if ":" in line[0]:
      line[0] = re.sub(r'[A-Z]+: ', '', line[0])
    # This allows the model to generate a new line conditioned on more than one line
    if multiple_lines:
      start_line = max(0, i - max_line_history)
      end_line = i
      prompt_line = ' '.join(dialog[start_line:end_line]) # Going to end_line is fine because it is non-inclusive
    else:
      prompt_line = dialog[i]
    prompt_line_tokens = encode_prompt(prompt_line)

  return dialog

This is the main part: we first set up our tokenizer and the base BARTPho model

In [7]:
# Options merged into the model's hparams; the two freeze_* flags are read by ElainaModel.__init__.
hparams = dict(
    freeze_encoder = True,
    freeze_embeds = True,
    eval_beams = 4,
)

# Tokenizer and pretrained weights come from the same 'vinai/bartpho-word' checkpoint.
print('Setting up tokenizer...')
tokenizer = AutoTokenizer.from_pretrained('vinai/bartpho-word')

print('Setting up BARTPho pretrained model...')
bart_model = MBartForConditionalGeneration.from_pretrained('vinai/bartpho-word')


Downloading:   0%|          | 0.00/866 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

Select whether you want to train a new model or use a saved model (once you train a new model, make sure you have a bucket ton of RAM)

In [None]:
# Ask (in Vietnamese) whether to train a new model; anything other than 'y' loads a saved checkpoint.
do_train = input("Bạn muốn huấn luyện mô hình mới hay không? (y/n):")

if do_train == 'y':
  print("Setting up training / validating data...")
  summary_data = SummaryDataModule(tokenizer, 
                                  data_file = base_dir + 'bart_data.csv',
                                  batch_size = 8, num_examples = 800, use_segmenter = True)

  print("Initializing new model...")
  model = ElainaModel(
      learning_rate = 2e-5, 
      tokenizer = tokenizer, 
      model = bart_model, 
      hparams = hparams,
  )

  # Write checkpoints to Google Drive, in the same folder the loading branch below reads from
  checkpoint = ModelCheckpoint(dirpath=base_dir + 'checkpoint_files/')
  print("Setting up trainer...")
  trainer = lightning.Trainer(
      gpus = 1,
      max_epochs = 5,
      min_epochs = 1,
      auto_lr_find = False,
      # A ModelCheckpoint instance has to be registered through `callbacks`; `checkpoint_callback`
      # is a boolean switch, so an instance passed there does not get its dirpath honoured.
      callbacks = [checkpoint],
      progress_bar_refresh_rate = 500
  )

  print("Initiate training process.")
  # Prone to Out of Memory error
  trainer.fit(model, summary_data)
else:
  filename = "checkpoint_files/epoch=4-step=300.ckpt"

  print("Loading checkpoint for fine-tuned model")
  # The constructor arguments are passed again because they are needed to rebuild the module
  model = ElainaModel.load_from_checkpoint(
    base_dir + filename, 
    learning_rate = 2e-5, 
    tokenizer = tokenizer, 
    model = bart_model, 
    hparams = hparams
  )

Explained in the comment

In [10]:
# uncomment this line to backup trained checkpoints

# !cp lightning_logs/version_0/checkpoints/epoch=4-step=300.ckpt gdrive/MyDrive/ElainaModel/checkpoint_files

This is where we can actually test our poor model

In [13]:
# testing env

# Word segmenter applied to every user message before it reaches the model
rdrsegmenter = VnCoreNLP("./VnCoreNLP/VnCoreNLP-1.1.1.jar", annotators="wseg", max_heap_size='-Xmx500m')

# Simple chat loop: read a message, print the model's reply, stop when the user types "exit"
while True:
  message = input(">>> ")
  if message == "exit":
    break
  new_dialog = generate_response(seed_line = message, num_lines = 2, model_ = model,
                           noise_percent = 0.2, multiple_lines = False, max_line_history = 1, rdrsegmenter=rdrsegmenter)
  # Entry 2 is the first generated line; underscores are swapped back to spaces before printing
  print(new_dialog[2].replace("_", " "))

print("Testing stopped")

>>> chào bạn
Hãy nghĩ về tôi như một trợ lý ảo 
>>> Tại sao?
 Tôi có thể được huấn luyện để trở nên hữu dụng hơn. Nhà phát triển sẽ tiếp tục huấn luyện cho tôi. Mong bạn thông cảm 
>>> Tôi tin tưởng bạn
Tôi là trợ lý ảo, không phải người thật. Tôi là trợ lý ảo, không phải người thật 
>>> Đừng buồn
Tôi rất vui. Có thật nhiều điều thú vị ngoài kia 
>>> exit
Testing stopped
