##Setup



In [None]:
!pip install -q pytorch-lightning
!pip install -q transformers

[K     |████████████████████████████████| 829kB 9.4MB/s 
[K     |████████████████████████████████| 112kB 25.5MB/s 
[K     |████████████████████████████████| 829kB 14.4MB/s 
[K     |████████████████████████████████| 276kB 51.4MB/s 
[K     |████████████████████████████████| 1.3MB 47.9MB/s 
[K     |████████████████████████████████| 143kB 52.1MB/s 
[K     |████████████████████████████████| 296kB 50.0MB/s 
[?25h  Building wheel for future (setup.py) ... [?25l[?25hdone
  Building wheel for PyYAML (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.9MB 13.0MB/s 
[K     |████████████████████████████████| 890kB 51.4MB/s 
[K     |████████████████████████████████| 3.2MB 47.6MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np

import pytorch_lightning as pl
import torch.nn.functional as F
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

from transformers import AutoModelWithLMHead, AutoTokenizer, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

import ast
import math
import random
import re
import argparse
from tqdm import tqdm, trange
import io

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
from google.colab import files
uploaded = files.upload()
uploaded1 = files.upload()
uploaded2 = files.upload()

Saving dict.txt to dict.txt


Saving sciryl.csv to sciryl.csv


##BART Part

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)
root_dir = '/content/gdrive/My Drive/'
base_dir = '/content/gdrive/My Drive/BART/'

Mounted at /content/gdrive


In [None]:
class BModel(pl.LightningModule):
  def __init__(self, learning_rate, tokenizer, model, hparams):
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate
    self.hparams = hparams
    if self.hparams.freeze_encoder:
      freeze_params(self.model.get_encoder())
    if self.hparams.freeze_embeds:
      self.freeze_embeds()
  
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)
  
  def configure_optimizers(self):
    return torch.optim.Adam(self.parameters(), lr = self.learning_rate)

  def training_step(self, batch, batch_idx):
    sourceIds, sourceMask = batch[0], batch[1]
    targetIds = batch[2]
    decoder_input_ids = shift_tokens_right(targetIds, tokenizer.pad_token_id)
    outputs = self(sourceIds, attention_mask=sourceMask, decoder_input_ids=decoder_input_ids, use_cache=False)
    CElossFunc = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
    loss = CElossFunc(outputs[0].view(-1, outputs[0].shape[-1]), targetIds.view(-1))
    return {'loss':loss}

  def validation_step(self, batch, batch_idx):
    sourceIds, sourceMask = batch[0], batch[1]
    targetIds = batch[2]
    decoder_input_ids = shift_tokens_right(targetIds, tokenizer.pad_token_id)
    outputs = self(sourceIds, attention_mask=sourceMask, decoder_input_ids=decoder_input_ids, use_cache=False)
    CElossFunc = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
    valLoss = CElossFunc(outputs[0].view(-1, outputs[0].shape[-1]), targetIds.view(-1))
    return {'loss': valLoss}
  
  # Generates text using the BartForConditionalGeneration's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 40, startT = None):
    if startT == None:
      dstartT = self.tokenizer.pad_token_id
    else:
      dstartT = tokenizer.convert_tokens_to_ids(startT)
    generated_ids = self.model.generate(
        text["input_ids"],
        attention_mask=text["attention_mask"],
        use_cache=True,
        decoder_start_token_id = dstartT,
        num_beams= eval_beams,
        max_length = max_len,
        early_stopping = early_stopping
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

  def freeze_embeds(self):
    #freeze the positional embedding parameters of the model; from finetune.py
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

def freeze_params(model):
  #Freezes the layers for faster training; from finetune.py 
  for layer in model.parameters():
    layer.requires_grade = False

In [None]:
class DataLoader(pl.LightningDataModule):
  # Create a dataloading module as in https://pytorch-lightning.readthedocs.io/en/
  def __init__(self, tokenizer, data_file, batch_size, num_examples = 30000):
    super().__init__()
    self.tokenizer = tokenizer
    self.data_file = data_file
    self.batch_size = batch_size
    self.num_examples = num_examples
  
  # Load and split the data
  def prepare_data(self):
    self.data = pd.read_csv(self.data_file)[:self.num_examples]
    self.train, self.validate, self.test = np.split(self.data.sample(frac=1), [int(.6*len(self.data)), int(.8*len(self.data))])

  # Encode the sentences  
  def setup(self, stage):
    self.train = encode_sentences(self.tokenizer, self.train['source'], self.train['target'])
    self.validate = encode_sentences(self.tokenizer, self.validate['source'], self.validate['target'])
    self.test = encode_sentences(self.tokenizer, self.test['source'], self.test['target'])

  # Load the training, validation and test sets
  def train_dataloader(self):
    dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])                          
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data
  def val_dataloader(self):
    dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels']) 
    val_data = DataLoader(dataset, batch_size = self.batch_size)                       
    return val_data
  def test_dataloader(self):
    dataset = TensorDataset(self.test['input_ids'], self.test['attention_mask'], self.test['labels']) 
    test_data = DataLoader(dataset, batch_size = self.batch_size)                   
    return test_data

In [None]:
def shift_tokens_right(input_ids, pad_token_id):
  # Shift input ids one token to the right, and wrap the last non pad token. from modeling_bart.py
  prev_output_tokens = input_ids.clone()
  index_of_eos = (input_ids.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
  prev_output_tokens[:, 0] = input_ids.gather(1, index_of_eos).squeeze()
  prev_output_tokens[:, 1:] = input_ids[:, :-1]
  return prev_output_tokens

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=32, pad_to_max_length=True, return_tensors="pt"):
  # Tokenize a sentence 
  input_ids = []
  attention_masks = []
  target_ids = []
  tokenized_sentences = {}
  for sentence in source_sentences:
    encoded_dict = tokenizer(
          sentence,
          max_length=max_length,
          padding="max_length" if pad_to_max_length else None,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
  for sentence in target_sentences:
    encoded_dict = tokenizer(
          sentence,
          max_length=max_length,
          padding="max_length" if pad_to_max_length else None,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )
    target_ids.append(encoded_dict['input_ids'])
  input_ids = torch.cat(input_ids, dim = 0)
  attention_masks = torch.cat(attention_masks, dim = 0)
  target_ids = torch.cat(target_ids, dim = 0)
  batch = {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
      "labels": target_ids,
  }
  return batch


def noise_sentence(sentence_, percent_words, replacement_token = "<mask>"):
  # Noise a sentence by adding <mask> tokens
  sentence_ = sentence_.split(' ')
  sentence = sentence_.copy()

  num_words = math.ceil(len(sentence) * percent_words)
  sample_tokens = set(np.arange(0, np.maximum(1, len(sentence))))
  words_to_noise = random.sample(sample_tokens, num_words)
  
  for pos in words_to_noise:
      if sentence[pos] != '.':
          sentence[pos] = replacement_token
  
  sentence = re.sub(r' {2,5}', ' ', ' '.join(sentence))
  
  # Combine <mask> tokens into a single token
  sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
  sentence = re.sub(r'<mask> <mask>', "<mask>", sentence)
  return sentence
  

In [None]:
# Load the model
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig, BartModel

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', add_prefix_space=True)
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

hparams = argparse.Namespace()
hparams.freeze_encoder = True
hparams.freeze_embeds = True
hparams.eval_beams = 4

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1553.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=557941479.0, style=ProgressStyle(descri…




In [None]:
# Load the data into the model for training
summary_data = DataLoader(tokenizer, '/content/gdrive/My Drive/lyrics_noised.csv', batch_size = 16, num_examples = 100000)

# Load the model from a pre-saved checkpoint
model = BModel(learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model, hparams = hparams)

In [None]:
trainer = pl.Trainer(gpus = 1,
                     max_epochs = 4,
                     min_epochs = 2,
                     auto_lr_find = False,
                     
                     progress_bar_refresh_rate = 500)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores


In [None]:
# Fit the instantiated model to the data
trainer.fit(model, summary_data)


  | Name  | Type                         | Params
-------------------------------------------------------
0 | model | BartForConditionalGeneration | 139 M 
-------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
557.682   Total estimated model params size (MB)


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [None]:
def generate_lyrics(seed_line, num_lines, model_, startW = None, noise_percent = 0.25, max_line_history = 3):
  model_.eval()
  lyrics = []
  lyrics.append(seed_line)
  prompt_line_tokens = tokenizer(noise_sentence(seed_line, 0.2), max_length = 32, return_tensors = "pt", truncation = True)
  # Loop through the number of lines generating a new line based on the old
  line = [seed_line]
  for i in range(num_lines):
    lyrics.append(line[0])
    line = model.generate_text(prompt_line_tokens, eval_beams = 4, startT = startW)

    if line[0].find(":") != -1:
      line[0] = re.sub(r'[A-Z]+: ', '', line[0])

    if max_line_history > 1:
      start_line = np.maximum(0, i - max_line_history)
      end_line = i
      prompt_line = ' '.join(lyrics[start_line:end_line]) # Going to end_line is fine because it is non-inclusive
    else:
      prompt_line = lyrics[i]
    prompt_line_tokens = tokenizer(noise_sentence(prompt_line, noise_percent), max_length = 32, return_tensors = "pt", truncation = True)
  return lyrics

## XLnet Part

In [None]:
df = pd.read_csv("sciryl.csv", delimiter='\t', header=None, names=['sentence_source'], encoding="latin-1")
df.shape

df2 = df.replace({',':''}, regex=True)
df2.size

test_df = df2.sample(10000)
print(test_df)

                                          sentence_source
53648         niggas street ain't y'all nigga beef That's
130827  quarter a end to beater buzzer a with Jericho Hit
30399            "window his down rollin' fast up Comin'"
16240   "niggas no with trappin' wasn't tip dope bitch...
117445               script no star one green-lit is Life
...                                                   ...
120218         life my in fortune a they're them show And
56798   motherfuckers Black billion a organizing is mi...
54001      right eat gotta still I leftovers it's if Even
17677   me?! on cheated wife my me tell gon' then dead...
94684   you like and you support that niggas house on ...

[10000 rows x 1 columns]


In [None]:
# Preprocess data
sentences = test_df.sentence_source.values
sentences = [sentence + " [SEP]" for sentence in sentences]
sentences[-1] = sentences[-1] + " [CLS]"
XLtokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=True)

tokenized_texts = [XLtokenizer.tokenize(sent) for sent in sentences]
#test_text = [XLtokenizer(sent, return_tensors="pt", add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[1])
MAX_LEN = 64
input_ids = [XLtokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

#train_inputs = torch.tensor(test_text)
train_input_ids = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…


Tokenize the first sentence:
['▁quarter', '▁a', '▁end', '▁to', '▁beat', 'er', '▁buzz', 'er', '▁a', '▁with', '▁Jericho', '▁Hit', '▁[', 'S', 'EP', ']']


In [None]:
batch_size = 32
#train_data = TensorDataset(tokenized_texts, train_input_ids, train_masks)
train_data = TensorDataset(train_input_ids, train_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

In [None]:
# Train loop
XLmodel = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [None]:
param_optimizer = list(XLmodel.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  # Training
  # Set our XLmodel to training mode (as opposed to evaluation mode)
  XLmodel.train()
  XLmodel = XLmodel.to(device)
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    print("Step: %s"%(step))
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # outputs = XLmodel(**inputs, labels=inputs["input_ids"])
    outputs = XLmodel(input_ids=b_input_ids, labels=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    loss = outputs.loss
    logits = outputs.logits
    if(loss is not None):
      train_loss_set.append(loss.item())  
      loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    # Update tracking variables
    if(loss is not None):
      tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 107
Step: 108
Step: 109
Step: 110


Epoch:  25%|██▌       | 1/4 [00:54<02:43, 54.38s/it]

Train loss: 0.04317759538735873
Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 10

Epoch:  50%|█████     | 2/4 [01:48<01:48, 54.35s/it]

Step: 312
Train loss: 0.0029377766125002346
Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 

Epoch:  75%|███████▌  | 3/4 [02:42<00:54, 54.35s/it]

Train loss: 0.001444932500520846
Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 1

Epoch: 100%|██████████| 4/4 [03:37<00:00, 54.33s/it]

Step: 312
Train loss: 0.0008024863426676972





# Rhyming Part

In [None]:
STRESSES = {'AA1', 'AE1', 'AH1', 'AO1', 'AW1', 'AY1', 'EH1', 'ER1', 'EY1', 'IH1', 'IY1', 'OW1', 'OY1', 'UH1', 'UW1'}

file = open("dict.txt", "r")

contents = file.read()
PHODICT = ast.literal_eval(contents)

file.close()

def isSubList(listA, listB):
    if len(listA) < len(listB):
        return isSubList(listB, listA)
    n = len(listB)
    for start in range(len(listA)-n+1):
        if all(listA[start+i] == listB[i] for i in range(n)):
            return True
    return False

def getRhymes(word):
    print(type(word))
    sounds = PHODICT[word]
    for index,sound in enumerate(reversed(sounds)):
        if sound in STRESSES:
            ending = sounds[-index-1:]
            break
    yielded = set()
    for wordB, soundsB in PHODICT.items():
        if (ending == soundsB[-index-1:]) and (soundsB not in yielded) and (not isSubList(sounds, soundsB)):
            yielded.add(soundsB)
            yield wordB



#Generation Part

In [None]:
def rhymeFinder(inWord):
  lastWord = inWord
  sameRhyme = []
  for word in getRhymes(lastWord.upper()):
    sameRhyme.append(word.lower())
  outWord = random.choice(sameRhyme)
  return outWord

def reverseSentence(inList):
  words = inList[0].split()
  outStr = ' '.join(reversed(words))
  return outStr

def reverseW(line):
  words = line.split()
  reversed_words = ' '.join(reversed(words))
  return reversed_words


In [None]:
firstSeed = "You and me forever "
allLyrics = []

BART_sentence = generate_lyrics(seed_line = firstSeed, num_lines = 2, model_ = model, startW = None, noise_percent = 0.25, max_line_history = 1)
allLyrics.append(BART_sentence[-1])

startword = rhymeFinder(BART_sentence[-1].split()[-1])

prompt = " " + startword
tpmorp = reverseW(prompt)
XLinputs = XLtokenizer.encode(reverseW(BART_sentence[-1]) + ' ' + prompt, add_special_tokens=False, return_tensors="pt")
XLinputs = XLinputs.to(device)
prompt_length = len(XLtokenizer.decode(XLinputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
XLoutputs = XLmodel.generate(XLinputs, max_length=70, do_sample=True, top_p=0.99, top_k=25, repetition_penalty  = 6.0, length_penalty = 0.01)

XLgenerated = XLtokenizer.decode(XLoutputs[0])[prompt_length:] + prompt
allLyrics.append(XLgenerated)
print(allLyrics)

You and me forever
You and me
<class 'str'>
[' You and me ', '. The Big honoree']


In [None]:
!pip install rake-nltk
from rake_nltk import Rake

r = Rake()
r.extract_keywords_from_sentences(allLyrics)
keywords = r.get_ranked_phrases()

keySentence = ''
for i in keywords:
  keySentence += i
  keySentence += ' '



In [None]:
BART_sentence1 = generate_lyrics(seed_line = keySentence, num_lines = 2, model_ = model, startW = None, noise_percent = 0.25, max_line_history = 1)
allLyrics.append(BART_sentence1[-1])

startword = rhymeFinder(BART_sentence1[-1].split()[-1])

prompt = " " + startword
tpmorp = reverseW(prompt)
XLinputs = XLtokenizer.encode(reverseW(BART_sentence[-1]) + prompt, add_special_tokens=False, return_tensors="pt")
XLinputs = XLinputs.to(device)
prompt_length = len(XLtokenizer.decode(XLinputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
XLoutputs = XLmodel.generate(XLinputs, max_length=70, do_sample=True, top_p=0.99, top_k=25, repetition_penalty  = 6.0, length_penalty = 0.01)

XLgenerated = XLtokenizer.decode(XLoutputs[0])[prompt_length:] + prompt
allLyrics.append(XLgenerated)
print(allLyrics)

whoever save one forever
I don't give a fuck
<class 'str'>
[" You're the only one that can save me forever", ' he, they are whoever', " I don't give a fuck what you think", ' by your own personal link', " I don't give a fuck", ' my own belt. pluck', " I don't give a fuck", ' to go to the duck']
