In [None]:
!pip install -q pytorch-lightning
!pip install -q transformers

[K     |████████████████████████████████| 829kB 10.6MB/s 
[K     |████████████████████████████████| 276kB 21.5MB/s 
[K     |████████████████████████████████| 829kB 32.6MB/s 
[K     |████████████████████████████████| 112kB 52.0MB/s 
[K     |████████████████████████████████| 1.3MB 52.6MB/s 
[K     |████████████████████████████████| 296kB 53.9MB/s 
[K     |████████████████████████████████| 143kB 52.7MB/s 
[?25h  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
  Building wheel for future (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.9MB 14.0MB/s 
[K     |████████████████████████████████| 3.2MB 29.2MB/s 
[K     |████████████████████████████████| 890kB 51.1MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

import math
import random
import re
import argparse

In [None]:
# Mount Google Drive at /content/gdrive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)
# Root of the mounted drive and the BART sub-folder inside it
root_dir = '/content/gdrive/My Drive/'
base_dir = '/content/gdrive/My Drive/BART/'

Mounted at /content/gdrive


In [None]:
class BModel(pl.LightningModule):
  """LightningModule wrapper that trains a wrapped seq2seq model with token-level cross entropy.

  Args:
    learning_rate: Learning rate handed to the Adam optimizer.
    tokenizer: Tokenizer; its ``pad_token_id``, ``convert_tokens_to_ids`` and ``decode`` are used.
    model: The wrapped generation model. ``get_encoder()`` is called when ``freeze_encoder`` is set,
      ``model.shared`` / ``model.encoder`` / ``model.decoder`` are read when ``freeze_embeds`` is set,
      and ``generate()`` is used by ``generate_text``.
    hparams: Namespace-like object with boolean ``freeze_encoder`` and ``freeze_embeds`` attributes.
  """
  def __init__(self, learning_rate, tokenizer, model, hparams):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    # NOTE(review): direct assignment to `hparams` appears to be rejected by newer Lightning
    # releases - confirm against the installed version before upgrading.
    self.hparams = hparams
    if self.hparams.freeze_encoder:
      freeze_params(self.model.get_encoder())
    if self.hparams.freeze_embeds:
      self.freeze_embeds()
  
  def forward(self, input_ids, **kwargs):
    """Delegate the forward pass to the wrapped model."""
    return self.model(input_ids, **kwargs)
  
  def configure_optimizers(self):
    """Return an Adam optimizer over all parameters of this module."""
    return torch.optim.Adam(self.parameters(), lr = self.learning_rate)

  def _compute_loss(self, batch):
    """Run a teacher-forced forward pass on ``batch`` and return the cross-entropy loss.

    ``batch`` is indexed as (source ids, source attention mask, target ids).
    """
    sourceIds, sourceMask, targetIds = batch[0], batch[1], batch[2]
    # Use the tokenizer owned by this module rather than a notebook-level global
    pad_token_id = self.tokenizer.pad_token_id
    decoder_input_ids = shift_tokens_right(targetIds, pad_token_id)
    outputs = self(sourceIds, attention_mask=sourceMask, decoder_input_ids=decoder_input_ids, use_cache=False)
    logits = outputs[0]
    # Padding positions in the targets do not contribute to the loss
    CElossFunc = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
    return CElossFunc(logits.view(-1, logits.shape[-1]), targetIds.view(-1))

  def training_step(self, batch, batch_idx):
    """Compute the training loss for one batch."""
    return {'loss': self._compute_loss(batch)}

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss for one batch (returned under the 'loss' key)."""
    return {'loss': self._compute_loss(batch)}
  
  # Generates text using the wrapped model's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40, startT = None):
    """Generate and decode text for already-tokenized input.

    Args:
      text: Mapping with "input_ids" and "attention_mask" entries.
      eval_beams: Number of beams passed to ``generate``.
      early_stopping: Forwarded to ``generate``.
      max_len: Maximum generated length, forwarded as ``max_length``.
      startT: Optional token (string) to start decoding from; when None the pad token id is used.

    Returns:
      A list with one decoded string per generated sequence.
    """
    if startT is None:
      dstartT = self.tokenizer.pad_token_id
    else:
      dstartT = self.tokenizer.convert_tokens_to_ids(startT)
    generated_ids = self.model.generate(
        text["input_ids"],
        attention_mask=text["attention_mask"],
        use_cache=True,
        decoder_start_token_id = dstartT,
        num_beams= eval_beams,
        max_length = max_len,
        early_stopping = early_stopping
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

  def freeze_embeds(self):
    """Freeze the shared embedding and the encoder/decoder position and token embeddings; from finetune.py"""
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

def freeze_params(model):
  """Freeze every parameter of ``model`` so it receives no gradient updates; from finetune.py

  Args:
    model: Any object exposing ``parameters()`` (e.g. a ``torch.nn.Module``).
  """
  for param in model.parameters():
    # The attribute is `requires_grad`; a misspelt name is silently accepted and freezes nothing
    param.requires_grad = False

In [None]:
class DataLoader(pl.LightningDataModule):
  """LightningDataModule that reads source/target sentence pairs from a CSV and serves encoded batches.

  NOTE: this class name shadows ``torch.utils.data.DataLoader`` imported at file level, so the
  torch loader is always referenced by its fully qualified name inside the class.

  Args:
    tokenizer: Tokenizer passed through to ``encode_sentences``.
    data_file: Path of a CSV file with ``source`` and ``target`` columns.
    batch_size: Batch size used by all three loaders.
    num_examples: Only the first ``num_examples`` rows of the CSV are used.
  """
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 30000):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples
    # Tracks whether the splits currently hold encoded tensors instead of raw frames
    self._encoded = False
  
  # Load and split the data
  def prepare_data(self):
    """Read the CSV, shuffle it and split it 60/20/20 into train/validation/test frames."""
    self.data = pd.read_csv(self.data_file)[:self.num_examples]
    self.train, self.validate, self.test = np.split(self.data.sample(frac=1), [int(.6*len(self.data)), int(.8*len(self.data))])
    self._encoded = False

  # Encode the sentences  
  def setup(self, stage = None):
    """Tokenize the three splits in place.

    Safe to call more than once: a repeated call would otherwise index ``['source']`` on
    splits that have already been replaced by encoded dicts.
    """
    if self._encoded:
      return
    self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
    self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])
    self.test = encode_sentences(self.tokenizer, self.test['source'], self.test['target'])
    self._encoded = True

  def _make_loader(self, encoded, shuffle = False):
    """Wrap one encoded split in a torch DataLoader; ``shuffle`` selects a RandomSampler."""
    dataset = TensorDataset(encoded['input_ids'], encoded['attention_mask'], encoded['labels'])
    sampler = RandomSampler(dataset) if shuffle else None
    # Fully qualified on purpose: the bare name `DataLoader` resolves to this data module
    return torch.utils.data.DataLoader(dataset, sampler = sampler, batch_size = self.batch_size)

  # Load the training, validation and test sets
  def train_dataloader(self):
    """Randomly sampled loader over the training split."""
    return self._make_loader(self.train, shuffle = True)
  def val_dataloader(self):
    """Sequential loader over the validation split."""
    return self._make_loader(self.validate)
  def test_dataloader(self):
    """Sequential loader over the test split."""
    return self._make_loader(self.test)

In [None]:
def shift_tokens_right(input_ids, pad_token_id):
  """Shift token ids one position to the right and wrap a token into position 0. from modeling_bart.py

  The wrapped token is the one at index (number of non-pad tokens - 1) of each row, i.e. the
  last non-pad token when padding sits on the right. ``input_ids`` itself is not modified.
  """
  # Per-row index of the token that gets wrapped around to the front
  non_pad_count = input_ids.ne(pad_token_id).sum(dim=1)
  wrap_index = (non_pad_count - 1).unsqueeze(-1)
  wrapped_tokens = input_ids.gather(1, wrap_index).squeeze()

  shifted = input_ids.clone()
  # Both writes read from the untouched input, so their order does not matter
  shifted[:, 1:] = input_ids[:, :-1]
  shifted[:, 0] = wrapped_tokens
  return shifted

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=32, pad_to_max_length=True, return_tensors="pt"):
  """Tokenize parallel source/target sentences into model-ready tensors.

  Args:
    tokenizer: Callable tokenizer; it is called once per side with the whole list of sentences.
    source_sentences: Iterable of source strings.
    target_sentences: Iterable of target strings.
    max_length: Sequences are truncated to this many tokens.
    pad_to_max_length: When True every row is padded to ``max_length``; when False rows are
      padded to the longest sequence of their side so they still form a rectangular batch.
    return_tensors: Tensor format forwarded to the tokenizer.

  Returns:
    Dict with "input_ids" and "attention_mask" of the sources and "labels" (target token ids).
  """
  # `None` is not a valid padding strategy, and un-padded rows could not be stacked anyway
  padding = "max_length" if pad_to_max_length else "longest"

  def _encode(sentences):
    # One batched call instead of one call per sentence followed by torch.cat
    return tokenizer(
        list(sentences),
        max_length=max_length,
        padding=padding,
        truncation=True,
        return_tensors=return_tensors,
        add_prefix_space = True
    )

  source_encoding = _encode(source_sentences)
  target_encoding = _encode(target_sentences)
  batch = {
      "input_ids": source_encoding["input_ids"],
      "attention_mask": source_encoding["attention_mask"],
      "labels": target_encoding["input_ids"],
  }
  return batch


def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  """Noise a sentence by replacing a random share of its words with ``replacement_token``.

  Args:
    sentence_: Input sentence; it is split on single spaces.
    percent_words: Fraction of words to replace (rounded up); values outside [0, 1] are clamped.
    replacement_token: Token written in place of the selected words.

  Returns:
    The noised sentence, with runs of spaces collapsed and every run of adjacent
    replacement tokens merged into a single token. Words equal to '.' are never replaced.
  """
  words = sentence_.split(' ')

  # Clamp so that a fraction outside [0, 1] cannot request more (or fewer) words than exist
  num_words = min(len(words), max(0, math.ceil(len(words) * percent_words)))
  # Sample positions from a range: random.sample() rejects sets on recent Python versions
  for pos in random.sample(range(len(words)), num_words):
    if words[pos] != '.':
      words[pos] = replacement_token

  sentence = re.sub(r' {2,}', ' ', ' '.join(words))

  # Combine any run of adjacent replacement tokens into a single token
  token = re.escape(replacement_token)
  return re.sub(token + '(?: ' + token + ')+', lambda match: replacement_token, sentence)
  

In [None]:
# Load the pretrained tokenizer and model weights
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig, BartModel

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', add_prefix_space=True)
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Flags read by BModel (freeze_encoder / freeze_embeds) plus the beam count
hparams = argparse.Namespace(
    freeze_encoder=True,
    freeze_embeds=True,
    eval_beams=4,
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1553.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=557941479.0, style=ProgressStyle(descri…




In [None]:
# Trainer configuration: one GPU, between 2 and 4 epochs, no automatic learning-rate search
trainer = pl.Trainer(
    gpus=1,
    max_epochs=4,
    min_epochs=2,
    auto_lr_find=False,
    progress_bar_refresh_rate=500,
)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
# Fit the instantiated model to the data
# NOTE(review): `model` and `summary_data` are not defined in any cell visible here -
# presumably a BModel and a data module; confirm they are created before this cell runs.
trainer.fit(model, summary_data)


  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M 
-------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [None]:
def generate_lyrics(seed_line, num_lines, model_, startW = None, noise_percent = 0.25, max_line_history = 3):
  """Generate lyrics line by line, prompting each new line with noised previous lines.

  Relies on the notebook-level ``tokenizer`` and ``noise_sentence``.

  Args:
    seed_line: First line of the song; also the first prompt.
    num_lines: Number of lines to generate after the seed line.
    model_: Model exposing ``eval()`` and ``generate_text(tokens, eval_beams=..., startT=...)``.
    startW: Optional start token forwarded to ``generate_text`` as ``startT``.
    noise_percent: Fraction of prompt words masked before each generation after the first.
    max_line_history: How many of the most recent lines form the next prompt (at least one).

  Returns:
    List of ``num_lines + 1`` strings: the seed line followed by every generated line.
  """
  model_.eval()
  lyrics = [seed_line]
  history = max(1, max_line_history)
  prompt_line_tokens = tokenizer(noise_sentence(seed_line, 0.2), max_length = 32, return_tensors = "pt", truncation = True)
  # Loop through the number of lines generating a new line based on the old
  for _ in range(num_lines):
    # Generate with the model that was passed in, not a notebook-level global
    line = model_.generate_text(prompt_line_tokens, eval_beams = 4, startT = startW)[0]

    # Strip upper-case "NAME: " style prefixes from the generated line
    if ":" in line:
      line = re.sub(r'[A-Z]+: ', '', line)

    # Record the line before building the next prompt so the newest line is part of it
    lyrics.append(line)
    prompt_line = ' '.join(lyrics[-history:])
    prompt_line_tokens = tokenizer(noise_sentence(prompt_line, noise_percent), max_length = 32, return_tensors = "pt", truncation = True)
  return lyrics

In [None]:
# Generate four lines from a seed line, using up to four previous lines as the prompt
# NOTE(review): `model` is not defined in any cell visible here - confirm it exists before running.
new_song = generate_lyrics(seed_line = "I want you to know", num_lines = 4, model_ = model, noise_percent = 0.25, max_line_history = 4)

I want you to know
That you're the only one
E-mail me your name
What you're doing to me
