## XLnet rap generator

*(based on https://mccormickml.com/2019/09/19/XLNet-fine-tuning/)*

---


Setup dependencies

In [1]:
!pip install transformers
import torch
import transformers
from transformers import AutoModelWithLMHead, AutoTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split



from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 8.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 35.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 49.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=3fcb28c3b82

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
from google.colab import files

uploaded = files.upload()

Saving lyrics.csv to lyrics.csv


Ingest data


In [10]:
df = pd.read_csv("lyrics.csv", delimiter='\t', header=None, names=['sentence_source'], encoding='latin-1')
df.shape
#df.sample(10)
#remove commas from end of lines
df2 = df.replace({',':''}, regex=True)
df2.size
#FOR TESTING
test_df = df2.sample(80000)


preprocess data for xl-net


In [11]:
sentences = test_df.sentence_source.values
sentences = [sentence + " [SEP] [CLS]" for sentence in sentences]
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased", do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
#test_text = [tokenizer(sent, return_tensors="pt", add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[1])
MAX_LEN = 64
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

#train_inputs = torch.tensor(test_text)
train_input_ids = torch.tensor(input_ids)
train_masks = torch.tensor(attention_masks)


Tokenize the first sentence:
['▁Ball', '▁on', '▁automatic', '▁start', '▁[', 'S', 'EP', ']', '▁[', 'CL', 'S', ']']


In [12]:
batch_size = 32
#train_data = TensorDataset(tokenized_texts, train_input_ids, train_masks)
train_data = TensorDataset(train_input_ids, train_masks)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


Train loop

In [13]:
model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
#model.cuda()




In [14]:
from transformers import AdamW
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=2e-5)

In [15]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  # Training
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    print("Step: %s"%(step))
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    # outputs = model(**inputs, labels=inputs["input_ids"])
    model.to(device)
    outputs = model(input_ids=b_input_ids, labels=b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    #print(outputs)
    loss = outputs.loss
    logits = outputs.logits
    if(loss is not None):
      train_loss_set.append(loss.item())
      # Backward pass
      loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    if(loss is not None):
      tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))


Epoch:   0%|          | 0/2 [00:00<?, ?it/s][A

Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 107
Step: 108
Step: 109
Step: 110



Epoch:  50%|█████     | 1/2 [11:43<11:43, 703.61s/it][A

Train loss: 0.00390900774752663
Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 10


Epoch: 100%|██████████| 2/2 [23:26<00:00, 703.31s/it]

Train loss: 0.00014012650178447074





In [16]:
#DEBUG TRAIN LOOP
print(tokenized_texts)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Generate using finetuned model


In [21]:
PADDING_TEXT = """They tryna be crazy 
They tryna be crazy 
She wanna meet Carti 
That bitch is a Barbie 
I'ma fuck these hoes 
I'm on 730
Got a brand new pack like Kid Cudi 
I smoke dope like Kid Cudi
Push up and get the slugs from me 
I'm with all the shits 
She wanna meet Carti 
That bitch is a Barbie <eod> </s> <eos>"""

prompt = "All my friends are  "
inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to(device)

prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.99, top_k=25, repetition_penalty  = 6.0, length_penalty = 0.01)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
print(generated)


All my friends are  iends are in Hell, but here they come and take down those bastard hellholes I need to give them something for everything else. Now they look around and see that there must have been one or two people standing at the same window as every last one of it.<eop> A damn fucking goddamn thing, if you want to say it by their face, let’ll call someone this. No. You can name everyone except your favorite person! You can use a lot more words than you could. Dont even know who exactly was right behind her; they have a good idea how she feels about you. This way: You can talk to her while they were outside, so just ask what they thought would go bad? If anything happens inside after you leave-you only have to keep an eye on her.


Generate with out the box xl-net

In [23]:
from transformers import AutoModelWithLMHead, AutoTokenizer

model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
# Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology

PADDING_TEXT = """They tryna be cray (Mm, yeah)
They tryna be cray (Mm, yeah)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the slugs from me (Slime slime)
I'm with all the shits (Slime slime)
She wanna meet Carti (Carti)
That bitch is a Barbie (Yeah)
I'ma fuck these hoes (Ooh)
I'm on 730
Got a brand new pack like Kid Cudi (Brand new)
I smoke dope like Kid Cudi
Push up and get the<eod> </s> <eos>"""

prompt = "All my friends are "
inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
inputs = inputs.to(device)

prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))

model = model.to(device)
outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.99, top_k=25, repetition_penalty  = 6.0, length_penalty = 0.01)
generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]



In [24]:
print(generated)

All my friends are iends are already in your blog for an equal amount of time. As far as you know; I do not have any money! I donвt pay anything but just go work that way until we can live together. The good news though: I haven’
