<a href="https://colab.research.google.com/github/Thikkar/music-generator/blob/main/notebooks/PoemsGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Set-up

1.1 Scripts

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 6.3/6.3 MB 42.8 MB/s eta 0:00:00
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 190.3/190.3 KB 20.6 MB/s eta 0:00:00
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.6/7.6 MB 94.9 MB/s eta 0:00:00
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


1.2 Imports

In [8]:
import pandas as pd
import numpy as np
import os
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

1.3 Device

In [9]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

## 2. Data Pre-processing

2.1 a) Load Pre-Processed Data

In [3]:
# Read the pre-processed poems and discard the "Unnamed: 0" column
# (presumably the index that was written out with the CSV -- verify against
# the export step). The bare name on the last line displays the frame.
poems_generator_df = pd.read_csv("/content/poems-df.csv").drop(columns="Unnamed: 0")
poems_generator_df

Unnamed: 0,poem,label
0,As a decrepit father takes delight\nTo see his...,family
1,She hears me strike the board and say\nThat sh...,family
2,"My father, who I hardly knew,\nWas never one t...",family
3,We are born as sinners\nYet every child is inn...,family
4,'I know where I came from\nand I know where I ...,family
...,...,...
8454,Let me take you hold your body in my arms\nKis...,love
8455,"BARDS of Passion and of Mirth,\nYe have left y...",love
8456,My beast has been awakened.\nI shiver from its...,love
8457,Words hovering around the fringes of my mind\n...,love


2.2 Poem Dataset

In [10]:
class PoemsDataset(Dataset):
  """Tokenized poems served as ``(input_ids, attention_mask)`` tensor pairs.

  Every entry of ``poems_df["poem"]`` is passed through ``tokenizer`` with
  ``truncation=True``, ``padding="max_length"`` and the given ``max_length``.
  The resulting ``input_ids`` and ``attention_mask`` are stored as tensors.

  Args:
    poems_df: mapping/frame whose ``"poem"`` entry iterates over poem texts.
    tokenizer: callable returning a dict with ``'input_ids'`` and
      ``'attention_mask'``; also kept on the instance as ``self.tokenizer``.
    gpt2_type: accepted but not used by this class.
    max_length: forwarded to the tokenizer as ``max_length``.
  """

  def __init__(self, poems_df, tokenizer, gpt2_type="gpt2", max_length=768):
    self.tokenizer = tokenizer

    # Tokenize everything first, then split the encodings into the two lists.
    encoded_poems = [
        tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
        for text in poems_df["poem"]
    ]
    self.input_ids = [torch.tensor(enc['input_ids']) for enc in encoded_poems]
    self.attn_masks = [torch.tensor(enc['attention_mask']) for enc in encoded_poems]

  def __len__(self):
    """Number of tokenized poems."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
    ids, mask = self.input_ids[idx], self.attn_masks[idx]
    return ids, mask

2.3 Initialize Parameters

In [14]:
class Params:
    """Plain attribute bag: every keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])


# Training hyper-parameters, read elsewhere as attributes (e.g. params.batch_size).
params = Params(
    batch_size=1,
    epochs=5,
    learning_rate=0.0005,
    warmup_steps=0.01,
    epsilon=1e-8,
)

## 3. Model

3.1 Transformer: GPT-2 (Poems Generator)

In [15]:
# GPT-2 ships without a padding token; register one so poems can be padded.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', pad_token='<|pad|>')

# dataset and dataloader
poems_dataset = PoemsDataset(poems_generator_df, tokenizer)
# Shuffle every epoch: the loaded frame appears to be ordered by label
# (family ... love), and iterating it in that order would feed the model
# long single-topic runs.
poems_dataloader = DataLoader(poems_dataset,
                              batch_size=params.batch_size,
                              shuffle=True)

# model
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration).to(device)
# The tokenizer gained a pad token, so the embedding matrix is resized to match.
model.resize_token_embeddings(len(tokenizer))

# optimizer
optimizer = AdamW(model.parameters(),
                  lr = params.learning_rate,
                  eps = params.epsilon
                )

# One optimizer/scheduler step is taken per batch, for every epoch.
total_steps = len(poems_dataloader) * params.epochs

# scheduler
# `num_warmup_steps` is a number of steps. A configured value strictly between
# 0 and 1 (such as 0.01) is treated as a fraction of `total_steps`; passing it
# through unchanged would leave the schedule with effectively no warmup.
if 0 < params.warmup_steps < 1:
  num_warmup_steps = int(params.warmup_steps * total_steps)
else:
  num_warmup_steps = int(params.warmup_steps)

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = num_warmup_steps,
                                            num_training_steps = total_steps)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



3.2 Train

In [18]:
def train(model, dataloader, optimizer, scheduler, epochs):
  """Fine-tune ``model`` for ``epochs`` passes over ``dataloader`` and plot the loss.

  Args:
    model: called as ``model(input_ids, labels=..., attention_mask=...)``; the
      first element of its output is used as the loss.
    dataloader: yields ``(input_ids, attention_mask)`` batches.
    optimizer: stepped once per processed batch.
    scheduler: learning-rate scheduler, stepped once per processed batch.
    epochs: number of passes over ``dataloader``.

  Returns:
    List with the average loss of each epoch (NaN for an epoch in which no
    batch was processed).
  """
  train_losses = []
  num_batches = len(dataloader)

  for epoch in range(epochs):
    print(f"Epoch {epoch+1} //////////////////////////////////////////")
    total_loss = 0.0
    batches_used = 0

    model.train(True)
    for batch_idx, (input_ids, masks) in enumerate(dataloader):
      input_ids = input_ids.to(device)
      masks = masks.to(device)

      # The LM loss predicts token t from the tokens before it, so only
      # positions >= 1 can be targets. If none of them holds a real token the
      # loss would be NaN once padding is masked out -- skip such a batch.
      if not bool(masks[..., 1:].any()):
        continue

      # Padding positions get the label -100 so the loss ignores them;
      # otherwise most of the loss would come from predicting pad tokens.
      labels = input_ids.masked_fill(masks == 0, -100)

      model.zero_grad()

      outputs = model(input_ids, labels=labels, attention_mask=masks)

      loss = outputs[0]
      total_loss += loss.item()
      batches_used += 1

      if batch_idx % 1000 == 0:
        print(f"At Batch {batch_idx+1}/{num_batches}, Loss = {loss.item()}")

      loss.backward()
      optimizer.step()
      scheduler.step()

    model.train(False)
    # Average over the batches that actually contributed a loss.
    av_loss = total_loss / batches_used if batches_used else float("nan")
    train_losses.append(av_loss)

    print(f"Average Loss = {av_loss}\n")

  # Plot Loss vs. Epoch
  plt.plot(range(1, epochs+1), train_losses)
  plt.xlabel("Epochs")
  plt.ylabel("Loss")
  plt.show()

  return train_losses

3.3 Generate

In [1]:
def generate(model, prompt, tokenizer):
  """Sample three continuations of ``prompt``, print them and return them.

  Args:
    model: object exposing ``eval()`` and ``generate(...)``.
    prompt: text to continue.
    tokenizer: object exposing ``encode(text)`` and
      ``decode(ids, skip_special_tokens=True)``.

  Returns:
    List with the decoded text of each generated sequence.
  """
  # Sample in inference mode so that dropout is disabled.
  model.eval()

  tokenized_prompt = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  tokenized_prompt = tokenized_prompt.to(device)

  generated_outputs = model.generate(
                                tokenized_prompt,
                                # The single prompt has no padding: attend to every token.
                                attention_mask=torch.ones_like(tokenized_prompt),
                                do_sample=True,
                                top_k=50,
                                max_length = 300,
                                top_p=0.95,
                                num_return_sequences=3
                                )

  poems = [tokenizer.decode(sample_output, skip_special_tokens=True)
           for sample_output in generated_outputs]

  for i, poem in enumerate(poems):
    print("{}: {}\n\n".format(i, poem))

  return poems

## 4. Run

## 5. Notes