# 1. Environment setup 

In [1]:
!pip install transformers
!pip install syllabipy

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 8.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 51.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.9 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 53.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting 

In [2]:
import numpy as np
import pandas as pd 
import random
import time
import datetime
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from syllabipy.sonoripy import SonoriPy
from transformers import BertTokenizer

# 2. Load data

In [3]:
poem_df = pd.read_csv("/content/limricks_end_with_[SEP]_sep_with_-.csv")
poem_df = poem_df.fillna("")

# 3. Process Text and Create Dataset

In [16]:
batch_size = 2
epochs = 8
max_len = 120
device = torch.device('cuda')
learning_rate = 1e-4
eps = 1e-8

In [5]:
word_tokenizer = BertTokenizer.from_pretrained("/content/tokenizer[SEP]--vocab.txt")
syl_tokenizer = BertTokenizer.from_pretrained("/content/fre_1_syllables-vocab.txt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
print("Length of word_tokenizer : {}".format(len(word_tokenizer)))
print("Length of syl_tokenizer : {}". format(len(syl_tokenizer)))

Length of word_tokenizer : 30003
Length of syl_tokenizer : 22383


In [99]:
max_poem_length = max([len(word_tokenizer.encode(poem)) for poem in poem_df.iloc[:, 0]])

In [100]:
print(max_poem_length)

105


In [7]:
class PoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, max_length=max_len):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = data
        
    def __len__(self):
        
        return len(self.data)
    
    def __getitem__(self, idx):
        encodings_dict = self.tokenizer(self.data[idx],
                                        truncation=True,
                                        max_length=self.max_length,
                                        padding='max_length'
                                        )
        input_ids = torch.tensor(encodings_dict['input_ids'])
        attention_mask = torch.tensor(encodings_dict['attention_mask'])
        return input_ids, attention_mask        

In [8]:
poem_dataset = PoemDataset(poem_df.iloc[:, 0].values, word_tokenizer, max_len)

# 4. Train/Validation

# 5. Instantiate DataLoaders and Define Model Creation Function

In [9]:
poem_dataloader = DataLoader(poem_dataset, batch_size=batch_size)

# 6. Create Poem Model

In [10]:
configuration = GPT2Config(vocab_size=len(word_tokenizer), n_positions=max_len)
model = GPT2LMHeadModel(config=configuration)

## 6.1 Sylliabification Embedding

In [11]:
class translater():
  def __init__(self, word_tokenizer, syl_tokenizer):
    self.word_tokenizer = word_tokenizer
    self.syl_tokenizer = syl_tokenizer

  def word_to_syl(self, x):
    
    """ 
    Convert word(tokenizer) to syllable(tokenizer) list  
    args:
      x : interger tokenzier 
    return:
      syl_list : list of syllable tokenizer
    """
    input_id = torch.tensor([x])
    word = word_tokenizer.decode(input_id)
    syllable_list = SonoriPy(word)  # a list of syllables
    result = [syl_tokenizer.encode(i)[1] for i in syllable_list]
    return result

In [12]:
class syl_embedding(nn.Module):
  def __init__(self, vocab_size, syl_size, syl_embed_len, word_embed_dim, word_to_syl, syl_embed_dim=1):
    super(syl_embedding, self).__init__()
    self.vocab_size = vocab_size
    self.word_embed_dim = word_embed_dim
    self.syl_embed_dim = syl_embed_dim
    self.syl_embed_len = syl_embed_len
    self.word_embedding = nn.Embedding(vocab_size, word_embed_dim)
    self.syl_embedding  = nn.Embedding(syl_size, syl_embed_dim)
    self.word_to_syl = word_to_syl
  def forward(self, x):
    word_embedding = self.word_embedding(x)
    syl_embedding = torch.zeros(word_embedding.shape[0], word_embedding.shape[1], self.syl_embed_len)
    for i in range(x.shape[0]):
      for j in range(x.shape[1]):
        syls = self.word_to_syl(x[i, j])
        for k in range(len(syls)):
          syl = syls[k]
          if(syl == None):
            syl_embedding[i, j, k] = 0
          else:
            syl = torch.tensor(syl).to(device)
            syl_embedding[i, j, k] = self.syl_embedding(syl).item()
    syl_embedding = syl_embedding.to(device)
    final_embed = torch.cat((word_embedding, syl_embedding), dim=2)
    return final_embed 

## 6.2 Modify gpt2 architecture

In [15]:
trans = translater(word_tokenizer, syl_tokenizer)
syl_embed = syl_embedding(len(word_tokenizer), len(syl_tokenizer), 10, word_embed_dim=758, word_to_syl = trans.word_to_syl)
model.transformer.wte = syl_embed
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): syl_embedding(
      (word_embedding): Embedding(30003, 758)
      (syl_embedding): Embedding(22383, 1)
    )
    (wpe): Embedding(120, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=

# 6.3 Train

In [None]:
model.cuda()
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)
total_steps = len(poem_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=50,
                                            num_training_steps=total_steps)

start_time = time.time()
model = model.to(device)

for epoch_i in range(0, epochs):

    print(f'Epoch {epoch_i + 1} of {epochs}')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(poem_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks,
                        token_type_ids=None)

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(poem_dataloader)       
    training_time = format_time(time.time() - t0)

    print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

Epoch 1 of 8


# 7. Generate Poem Stanzas

In [23]:
prompt = "[CLS]"
generated = torch.tensor(word_tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
model.eval()
sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length=max_len,
                                top_p=0.95, 
                                num_return_sequences=3)
for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, word_tokenizer.decode(sample_output, skip_special_tokens=False)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: [CLS] [CLS] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


1: [CLS] [CLS] [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [22]:
sample_outputs

tensor([[30001, 30001,     0, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000,
         30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000, 30000],
        [30001, 30001,     0, 30000, 30000, 30000, 

# Test

In [7]:
word_tokenizer = BertTokenizer.from_pretrained("/content/tokenizer[SEP]--vocab.txt")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
word_tokenizer.encode("capn jack was washed over the side-")

[30001, 18641, 1892, 143, 6846, 551, 57, 1192, 1, 0]

In [14]:
word_tokenizer.decode(torch.tensor([1]))

'-'

In [96]:
trans.word_to_syl(1)

[]

In [74]:
syl_embed = syl_embedding(len(word_tokenizer), len(syl_tokenizer), 10, word_embed_dim=758, word_to_syl = trans.word_to_syl)

In [113]:
for i, batch in enumerate(poem_dataloader):
  input_id = batch[0].to(device)
  labels   = batch[0].to(device)
  b_masks = batch[1].to(device)
  model = model.to(device)
  output = model(input_id,
                 labels = labels,
                 attention_mask = b_masks)
  loss = output[0]
  print(loss)


RuntimeError: ignored

In [112]:
torch.cuda.empty_cache()