In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Libraries

In [2]:
!pip install transformers
!pip install syllabipy

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 4.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 95.2 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 58.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [3]:
import numpy as np
import pandas as pd 

import random
import time
import datetime

import torch
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

from tqdm import tqdm

from syllabipy.sonoripy import SonoriPy

from tokenizers import ByteLevelBPETokenizer
from tokenizers import BertWordPieceTokenizer

from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from transformers import BertTokenizer

# Train Tokenizer

In [4]:
# Train tokenizer

paths = ["/content/gdrive/MyDrive/pr/limericks_end_with_[SEP]_sep_with_-_and_$.txt"]

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

special_tokens=['[SEP]', '-', '$']
print(special_tokens)

# Customize training
tokenizer.train(files=paths, vocab_size=30_000, min_frequency=2, special_tokens=special_tokens)

# special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
# tokenizer.add_special_tokens(special_tokens_dict)

# Save files to disk
tokenizer.save_model("/content/gdrive/MyDrive/pr/tokenizers", "tokenizerSyllables")

['[SEP]', '-', '$']


['/content/gdrive/MyDrive/pr/tokenizers/tokenizerSyllables-vocab.txt']

# Load Data

In [5]:
poem_df = pd.read_csv("/content/gdrive/MyDrive/pr/limericks_end_with_[SEP]_sep_with_-_and_$.txt")
poem_df = poem_df.fillna("")
print(poem_df)

      capn $ jack $ was $ was hed $ o ver $ the $ si de - his $ crew $ searc hed $ but $ found $ not $ hair $ nor $ hi de - no $ lon ger $ the $ helm - but $ the $ deep $ bent hic $ realm - is $ whe re $ jack $ will $ fo re ver $ re si de [SEP]- 
0      as $ a $ soup $ bis que $ is $ best $ when $ s...                                                                                                                                                                                              
1      sim ply $ add $ to $ the $ grasp $ of $ a $ rh...                                                                                                                                                                                              
2      a beds $ whe re $ yo u $ sleep $ in $ the $ ni...                                                                                                                                                                                              
3      a $ s

# Hyperparameters

In [6]:
random_seed = 73
batch_size = 32
epochs = 8
max_len = 200

learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

device = torch.device('cuda')

In [7]:
torch.cuda.manual_seed_all(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

<torch._C.Generator at 0x7ff912e91e90>

In [8]:
tokenizer = BertTokenizer.from_pretrained("/content/gdrive/MyDrive/pr/tokenizers/tokenizerSyllables-vocab.txt")
print("len(tokenizer) = ", len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


len(tokenizer) =  15112


# Dataset

In [9]:
class PoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, max_length=max_len):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        encodings_dict = self.tokenizer(self.data[idx],
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding='max_length'
                                        )
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attention_mask = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attention_mask

poem_dataset = PoemDataset(poem_df.iloc[:, 0].values, tokenizer, max_len)

# Dataloader

In [10]:
poem_dataloader = DataLoader(poem_dataset, batch_size=batch_size)

# Model Definition

In [11]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=max_len)
poem_model = GPT2LMHeadModel(config=configuration)
poem_model.resize_token_embeddings(len(tokenizer))

optimizer = AdamW(poem_model.parameters(), lr=learning_rate, eps=eps)

total_steps = len(poem_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)

poem_model = poem_model.to(device)

In [22]:
def generate_poems(poem_model):
    prompt = "[CLS]"
    generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
    generated = generated.to(device)

    poem_model.eval()
    sample_outputs = poem_model.generate(
                                    generated, 
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length=max_len,
                                    top_p=0.95, 
                                    num_return_sequences=3
                                    )

    for i, sample_output in enumerate(sample_outputs):
        sample_output = tokenizer.decode(sample_output, skip_special_tokens=True)
        sample_output = sample_output.replace(" ", "").replace("$", " ").replace("-", " - ")
        print("{}: {}\n\n".format(i, sample_output))

In [24]:
model_weights_file = '/content/gdrive/MyDrive/pr/model_weights_20211127_1_epoch7.pth'
poem_model.load_state_dict(torch.load(model_weights_file))

for epoch in range(8, 15):

    print(f'Epoch {epoch + 1} of {epochs}')

    total_train_loss = 0
    poem_model.train()

    with tqdm(poem_dataloader) as t:
      for step, batch in enumerate(t):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        poem_model.zero_grad()        

        outputs = poem_model(b_input_ids,
                            labels=b_labels,
                            token_type_ids=None,
                            attention_mask=b_masks)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_dataloader)       

    print(f'Average Training Loss: {avg_train_loss}.')

    torch.save(poem_model.state_dict(), '/content/gdrive/MyDrive/pr/model_weights_20211127_1_epoch%d.pth' % epoch)

    generate_poems(poem_model)

Epoch 9 of 8


100%|██████████| 2766/2766 [35:56<00:00,  1.28it/s]


Average Training Loss: 0.8333827336025859.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  in the breeze - was the corol i hate - but i never could wait - the colonos he said was degrees - 


1:  - made me shine like a fine disarray - not a rope but a lot - the names mine for a lot - an a favorite every fine day


2:  economys african tree - and the colors these likely should be - youre delicious with ease - and as hard as it please - that this genus is not very free


Epoch 10 of 8


100%|██████████| 2766/2766 [35:55<00:00,  1.28it/s]


Average Training Loss: 0.7890202964421036.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  as baseball can be - useful baseballs use column youll see - but a navy among - its a double hue hung - to a heaviest avenue  - 


1:  with decolate names - from the glaciers all over his aims - to make music like this - though a copycat miss - and the absence of tales like the aims


2:  in darkness ive seen - and the highest of leaves on the screen - ive been growing my style - to relieve in the nile - it seems strange that ive always been seen


Epoch 11 of 8


100%|██████████| 2766/2766 [35:55<00:00,  1.28it/s]


Average Training Loss: 0.736409771920802.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  sunday weve built a new dale - where the world isnt easily stale - we are forced to the skies - but the timehonored guys - say our honeymoon salesmen are stale


1:  - i keep there on his paneline my day - there is no need to laugh - only fools all like half - and he finds as he passes one way - 


2:  - and the languish they think is the way - wheres the ague or syrup - they think that theyre teeming  - onions a consequence of prey - 


Epoch 12 of 8


100%|██████████| 2766/2766 [35:55<00:00,  1.28it/s]


Average Training Loss: 0.6846185267316373.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:    a biteaus - i think ill make a - change to get things youll see - a good buyer though widely desire - 


1:  haline fish in the sea  - are they sharp overstation - wheres high devoration  - oh where every item will be - 


2:  you must see - is the fruit that fits straight from the tree - from a species of mine - its the amazon sign - a mirage agaric that be - 


Epoch 13 of 8


100%|██████████| 2766/2766 [35:55<00:00,  1.28it/s]


Average Training Loss: 0.6367506664125108.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  was well like a mile  - not yet linen again - he was hopin and then - hed opine you to make a sweet smile lexicographers - 


1:  will get wise - you think it excites us - when you add when youre knight - an antistress it helps to divide us - 


2:  on the fish fish - if i have it with ease - ill just have a disease - wheres my basic a coupler of these - 


Epoch 14 of 8


100%|██████████| 2766/2766 [35:56<00:00,  1.28it/s]


Average Training Loss: 0.5955480743450094.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  to thinki goodbye - with no onion or ring - you can see whats athing - if you please see a good one dont fly - 


1:  distractions taboose - like burgeoning plants - paramesation astroys - and a term that enhances the treatments i use - 


2:  in the green fishing trees - will grow on the roots - if they dont give you these - if you learn that the best guarantees - 


Epoch 15 of 8


100%|██████████| 2766/2766 [35:54<00:00,  1.28it/s]


Average Training Loss: 0.5651384732340525.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0:  - called fishermen called on the isle - though the wind in its tribe - might bring disrespected libe  - headless airlines a very good style - 


1:  - is a concert in the uk - he was given to stop  - but at least for the cop - its deception  ill bet this ill pay - 


2:  - would rap in the mexican skies - on the beach as theyre pressed - to the sand on their breast - given moisture with maximal prize - 


