###Note that this Jupyter notebook was built on the notebook created by Scott Duda for this article:
https://scottmduda.medium.com/generating-an-edgar-allen-poe-styled-poem-using-gpt-2-289801ded82c

In [None]:
#Setup the environment
!pip install transformers

import numpy as np
import pandas as pd 

import random
import time
import datetime

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

from google.colab import drive
# Mount Google Drive so the dataset and checkpoints under home_directory are reachable.
drive.mount('/content/drive')

# Drive folder that holds the poem CSV and receives the per-epoch checkpoints
# and the loss CSV files written by train().
home_directory = '/content/drive/MyDrive/Aps360 Project/MultiPoemDataset/emotions/happy'
# Seed applied to the random, numpy and torch RNGs in the model setup cell.
RANDOM_SEED = 73
# Number of poems per batch in the training and validation dataloaders.
BATCH_SIZE = 2
# Upper bound (exclusive) of the epoch index that train() iterates to.
EPOCHS = 8
# Token length every poem is padded/truncated to; also the generation max_length.
MAX_LEN = 1024

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#This class stores the tokenized version of every poem and allows you to
#Get a specific tokenized poem by its index in the list of poems.
class PoemDataset(Dataset):
    """Holds the tokenized form of every poem, addressable by poem index.

    Each poem is wrapped in '<BOS>' / '<EOS>' markers, then truncated or
    padded to ``max_length`` tokens by the supplied tokenizer.
    """

    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_LEN):
        """Tokenize every poem in ``data`` up front.

        Args:
            data: iterable of poem strings.
            tokenizer: callable tokenizer returning a mapping with
                'input_ids' and 'attention_mask' entries.
            gpt2_type: not read by this class; kept for interface compatibility.
            max_length: token length each poem is truncated/padded to.
        """
        self.tokenizer = tokenizer

        # Encode every poem first, then split the encodings into two
        # parallel lists that share the original poem order.
        encoded_poems = [
            tokenizer(
                '<BOS>' + poem + '<EOS>',
                truncation=True,
                max_length=max_length,
                padding='max_length',
            )
            for poem in data
        ]
        self.input_ids = [
            torch.tensor(encoded['input_ids']) for encoded in encoded_poems
        ]
        self.attn_masks = [
            torch.tensor(encoded['attention_mask']) for encoded in encoded_poems
        ]

    def __len__(self):
        """Return the number of tokenized poems."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the (input_ids, attention_mask) tensor pair for poem ``idx``."""
        token_ids = self.input_ids[idx]
        mask = self.attn_masks[idx]
        return token_ids, mask
        

In [None]:
#Helper functions
def get_train_val_size(split, dataset):
    """Return (train_size, val_size) for a dataset split.

    ``split`` is the fraction of ``dataset`` assigned to training; the
    training count is truncated to an integer and the validation set
    receives whatever remains.
    """
    total = len(dataset)
    n_train = int(split * total)
    return n_train, total - n_train
def format_time(elapsed):
    """Render a duration in seconds as a string, rounded to whole seconds."""
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


In [None]:
def train(poem_model, learning_rate=1e-4, eps=1e-8, warmup_steps=50, starting_epoch=0):
    """Fine-tune ``poem_model`` and report training/validation loss per epoch.

    Reads the module-level ``poem_train_dataloader``, ``poem_val_dataloader``,
    ``device``, ``EPOCHS`` and ``home_directory``. After every epoch the model
    state dict is saved to ``{home_directory}/model_happy_epoch_{epoch}``
    (zero-based epoch index); when all epochs are done the per-epoch average
    losses are written to ``model_happy_train_loss.csv`` and
    ``model_happy_val_loss.csv`` in the same directory.

    Args:
        poem_model: language model called with ``labels`` and
            ``attention_mask``; the first element of its output is the loss.
        learning_rate: AdamW learning rate.
        eps: AdamW epsilon.
        warmup_steps: number of warmup steps of the linear schedule.
        starting_epoch: zero-based epoch index to start from; epochs
            ``starting_epoch .. EPOCHS - 1`` are run.
    """
    optimizer = AdamW(poem_model.parameters(), lr=learning_rate, eps=eps)
    num_train_batches = len(poem_train_dataloader)
    num_val_batches = len(poem_val_dataloader)

    # NOTE(review): the schedule is sized for all EPOCHS even when
    # starting_epoch > 0, and optimizer/scheduler state is not restored, so a
    # resumed run restarts warmup and never decays fully -- confirm intent.
    total_steps = num_train_batches * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    start_time = time.time()
    train_loss = []
    val_loss = []
    for epoch_i in range(starting_epoch, EPOCHS):
        print(f'Epoch {epoch_i + 1} of {EPOCHS}')
        t0 = time.time()

        # Train the model
        total_train_loss = 0
        poem_model.train()
        for batch in poem_train_dataloader:
            # The labels are the same tensor as the input: GPT2LMHeadModel
            # shifts the labels by one internally, so the target for each
            # input token is the next input token. Reusing the tensor avoids
            # copying the same batch to the device twice.
            # NOTE(review): padded positions are therefore part of the labels
            # too; presumably they contribute to the loss -- confirm whether
            # they should be masked out.
            b_input_ids = batch[0].to(device)
            b_labels = b_input_ids
            b_masks = batch[1].to(device)

            poem_model.zero_grad()
            outputs = poem_model(b_input_ids,
                                 labels=b_labels,
                                 attention_mask=b_masks,
                                 token_type_ids=None)
            loss = outputs[0]
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        # An empty dataloader yields NaN rather than a ZeroDivisionError.
        avg_train_loss = (total_train_loss / num_train_batches
                          if num_train_batches else float('nan'))
        training_time = format_time(time.time() - t0)
        print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

        # Evaluate the model (no gradients needed for the whole pass)
        poem_model.eval()
        total_eval_loss = 0
        with torch.no_grad():
            for batch in poem_val_dataloader:
                b_input_ids = batch[0].to(device)
                b_masks = batch[1].to(device)

                outputs = poem_model(b_input_ids,
                                     attention_mask=b_masks,
                                     labels=b_input_ids)
                total_eval_loss += outputs[0].item()

        avg_val_loss = (total_eval_loss / num_val_batches
                        if num_val_batches else float('nan'))

        train_loss.append(avg_train_loss)
        val_loss.append(avg_val_loss)
        # Checkpoint after every epoch so training can be resumed later.
        torch.save(poem_model.state_dict(),
                   f"{home_directory}/model_happy_epoch_{epoch_i}")
        print(f'Average Validation Loss: {avg_val_loss}')

    np.savetxt(f"{home_directory}/model_happy_train_loss.csv", train_loss)
    np.savetxt(f"{home_directory}/model_happy_val_loss.csv", val_loss)
    print(f'Total Training Time: {format_time(time.time()-start_time)}')



###Loading the data

In [None]:
# Load the poems we want to train our model with; missing cells become
# empty strings so every entry can be concatenated with the BOS/EOS markers.
poem_df = pd.read_csv(home_directory + '/poki-happy-with-header.csv')
poem_df = poem_df.fillna('')

# Load the GPT2 tokenizer that will be used by PoemDataset to encode the poems.
# Register the BOS, EOS and PAD tokens as special tokens so that the markers
# placed around each poem (and the padding) are known to the tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
tokenizer.add_special_tokens(special_tokens_dict)

# Create a PoemDataset object that holds an ordered list of the tokenized
# version of each poem.
poem_dataset = PoemDataset(poem_df['poems'].values, tokenizer, max_length=MAX_LEN)

# Split the poem dataset into a training set and a validation set.
# The split uses its own generator seeded with RANDOM_SEED so the same poems
# land in the validation set in every session; otherwise resuming from a saved
# checkpoint could validate on poems the model was already trained on.
poem_train_size, poem_val_size = get_train_val_size(split=0.8, dataset=poem_dataset)
split_generator = torch.Generator().manual_seed(RANDOM_SEED)
poem_train_dataset, poem_val_dataset = random_split(
    poem_dataset,
    [poem_train_size, poem_val_size],
    generator=split_generator)
poem_train_dataloader = DataLoader(poem_train_dataset,
                              sampler=RandomSampler(poem_train_dataset),
                              batch_size=BATCH_SIZE)
poem_val_dataloader = DataLoader(poem_val_dataset,
                            sampler=SequentialSampler(poem_val_dataset),
                            batch_size=BATCH_SIZE)

##Setup the model

In [None]:
# Seed every RNG in use.
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Set load_previous_state_dict to True to start from the weights saved by an
# earlier training run (file name relative to home_directory); set it to False
# to start from the pretrained GPT2 weights.
load_previous_state_dict = True
previous_state_dict_location = "model_happy_epoch_7"


# Set up the pretrained GPT2 model.
# from_pretrained is a classmethod, so the configuration comes entirely from
# the 'gpt2' checkpoint (plus output_hidden_states); the vocabulary is grown
# afterwards by resize_token_embeddings to make room for the added tokens.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=True)
poem_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_model.resize_token_embeddings(len(tokenizer))

if load_previous_state_dict:
    checkpoint_path = "{}/{}".format(home_directory, previous_state_dict_location)
    # Load onto the CPU first; the whole model is moved to `device` below.
    poem_model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))

poem_model = poem_model.to(device)

##Train the model, if desired

In [None]:
# Hyperparameters handed to train().
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

# Epoch to resume from when training is enabled: the epoch you left off at
# last time, or 0 if no training has been done yet.
starting_epoch = 0

# Switch to True to (re)train the model; leave False to skip training.
training_desired = False

if training_desired:
    train(poem_model,
          learning_rate=learning_rate,
          eps=eps,
          warmup_steps=warmup_steps,
          starting_epoch=starting_epoch)


##Use the model to generate poems

In [None]:
# Seed prompt for text generation, encoded as a single-row batch of token ids
# and placed on the same device as the model.
prompt = "<BOS> I love pizza"
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)

In [None]:
# NOTE(review): this name is not read anywhere in this cell; the weights were
# already loaded in the model setup cell. Kept in case later cells rely on it.
previous_state_dict_location = "model_happy_epoch_7"
poem_model.eval()

# Sample three continuations of the prompt. The end-of-sequence and padding
# ids are taken from the tokenizer because '<EOS>' and '<PAD>' were added to it
# after the pretrained configuration was created; without them generation does
# not stop at '<EOS>' and keeps sampling up to max_length.
sample_outputs = poem_model.generate(
    generated,
    do_sample=True,
    top_k=50,
    max_length=MAX_LEN,
    top_p=0.95,
    num_return_sequences=3,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Print each sample with its index, dropping the special tokens.
for i, sample_output in enumerate(sample_outputs):
    poem_text = tokenizer.decode(sample_output, skip_special_tokens=True)
    print(f"{i}: {poem_text}\n\n")