# 1. Environment setup 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install syllabipy



In [3]:
import numpy as np
import pandas as pd 
import random
import time
import datetime
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from syllabipy.sonoripy import SonoriPy
from transformers import BertTokenizer
from tqdm import tqdm

# 2. Load data

In [4]:
poem_df = pd.read_csv("/content/drive/MyDrive/GPT-2/limricks_end_with_[SEP]_sep_with_-.csv")
poem_df = poem_df.fillna("")

# 3. Process Text and Create Dataset

In [5]:
batch_size = 32
epochs = 20
max_len = 60
device = torch.device('cuda')
learning_rate = 1e-4
eps = 1e-8

In [6]:
word_tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/GPT-2/tokenizer[SEP]--vocab.txt")
syl_tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/GPT-2/fre_1_syllables-vocab.txt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
print("Length of word_tokenizer : {}".format(len(word_tokenizer)))
print("Length of syl_tokenizer : {}". format(len(syl_tokenizer)))

Length of word_tokenizer : 30003
Length of syl_tokenizer : 22383


In [8]:
class PoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, max_length=max_len):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data
        
    def __len__(self):
        
        return len(self.data)
    
    def __getitem__(self, idx):
        encodings_dict = self.tokenizer(self.data[idx],
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding='max_length'
                                        )
        input_ids = encodings_dict['input_ids']
        if None in input_ids:
          input_ids = torch.zeros(self.max_length)
          attention_mask = torch.zeros(self.max_length)
        else:
          input_ids = torch.tensor(encodings_dict['input_ids'])
          attention_mask = torch.tensor(encodings_dict['attention_mask'])
        input_ids = input_ids.type(torch.LongTensor)
        return input_ids, attention_mask        

In [9]:
poem_dataset = PoemDataset(poem_df.iloc[:, 0].values, word_tokenizer, max_len)

# 4. Train/Validation

# 5. Instantiate DataLoaders and Define Model Creation Function

In [10]:
poem_dataloader = DataLoader(poem_dataset, batch_size=batch_size, shuffle=True)

# 6. Create Poem Model

## 6.1 Sylliabification Embedding

## 6.2 Modify gpt2 architecture

In [12]:
configuration = GPT2Config(vocab_size=len(word_tokenizer), n_positions=max_len, n_embd=768)
model = GPT2LMHeadModel(config=configuration)
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(30003, 768)
    (wpe): Embedding(60, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
  

# 6.3 Train

In [None]:
model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
# loaded = torch.load("/content/drive/MyDrive/GPT-2/mmodel.pth")
# model.load_state_dict(loaded)
# optimizer.load_state_dict(loaded['optimizer_state_dict'])
total_steps = len(poem_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=total_steps)
start_time = time.time()
model = model.to(device)
for epoch_i in range(100):

    print('Epoch: ', epoch_i, '\tlr: ', optimizer.param_groups[0]['lr'])

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for batch in tqdm(poem_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_dataloader)       
    training_time = (time.time() - t0)
    torch.save({'epoch': epoch_i,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, '/content/drive/MyDrive/GPT-2/baseline768/epoch' + str(epoch_i) 
        + 'lr' + str('%.8f' % optimizer.param_groups[0]['lr']) + 'loss' + str('%.4f' % avg_train_loss) + '.pth')
    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

Epoch:  0 	lr:  0.0


100%|██████████| 2766/2766 [12:41<00:00,  3.63it/s]


Average Training Loss: 3.8434493883054324. Epoch Training Time: 761.8387937545776
Epoch:  1 	lr:  9.508594174054641e-05


100%|██████████| 2766/2766 [12:41<00:00,  3.63it/s]


Average Training Loss: 3.414946602551039. Epoch Training Time: 761.5912117958069
Epoch:  2 	lr:  9.008141849104396e-05


100%|██████████| 2766/2766 [12:41<00:00,  3.63it/s]


Average Training Loss: 3.1899254303088846. Epoch Training Time: 761.9366409778595
Epoch:  3 	lr:  8.507689524154153e-05


100%|██████████| 2766/2766 [12:42<00:00,  3.63it/s]


Average Training Loss: 3.0058184178601293. Epoch Training Time: 762.7969658374786
Epoch:  4 	lr:  8.007237199203909e-05


  4%|▍         | 116/2766 [00:32<12:07,  3.64it/s]

In [17]:
state_dict = torch.load('/content/drive/MyDrive/GPT-2/baseline/epoch20lr0.00000000loss1.5441.pth')
model.load_state_dict(state_dict['model_state_dict'])

<All keys matched successfully>

# 7. Generate Poem Stanzas

In [19]:
prompt = "[CLS]"
generated = torch.tensor(word_tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
model.cuda()
model.eval()
sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=25, 
                                max_length=60,
                                top_p=0.95, 
                                num_return_sequences=50)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n".format(i, word_tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: 

1: ##al need - or slow comprehension - the topic prevention - or worse a delusion in greed

2: or vision - or any notion - or merely a guise or contusion

3: 

4: ##ic direction - its mass should suffice - or the whole bit of ice - are these

5: which you ive got - point just two point nine - or one is a pair - count the samples would have they been shot

6: as these days - should be much to be seen - wearing only a scene - or a parnasi plenty of phase

7: 

8: ##ic youd found - or these matters exciting - or given a positive sound

9: ##ic surprise

10: 

11: 

12: that rise - or small pains to please - its the point of a sneeze - would ensue if i hadnt a prize

13: 

14: not far too near - your new cranium died - by mere matter the point that i fear

15: ##al law meant - due to smelling the sound - the extent of ones mind - what we havent been shaved so he went

16: 

17: ##al strain - a suggestion - or logic or mystic - or serious matter in pain

18: which tend to be blurred - 