In [1]:
# Mount Google Drive so the dataset and saved model files under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Folder holding the input CSV, the saved state dict and the loss CSVs.
# The trailing slash matters: the CSV path is built by plain string concatenation.
home_directory = '/content/drive/MyDrive/Aps360 Project/Datasets/'
# Base file name for the saved state dict and the "<name>_train_loss.csv" /
# "<name>_val_loss.csv" files written by train().
model_name = 'model_article_summaries'


Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.0-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 69.4 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 81.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 88.4 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [3]:
import numpy as np
import pandas as pd 

import random
import time
import datetime
import gc

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler, Subset

In [4]:
# Start from the stock GPT-2 vocabulary.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Register the marker tokens used by this notebook; the return value is the
# number of tokens that were added to the vocabulary.
special_tokens_dict = {
    'bos_token': '<BOS>',
    'sep_token': '<SEP>',
    'eos_token': '<EOS>',
    'pad_token': '<PAD>',
}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




In [5]:
# Seed applied to the Python, NumPy and PyTorch RNGs in the model-setup cell.
RANDOM_SEED = 73
# Examples per batch for both the training and the validation DataLoader.
BATCH_SIZE = 20
# Upper bound of the epoch loop in train(); also used to size the LR schedule.
EPOCHS = 5
# Token budget per example: the dataset pads/truncates to this length and
# generation is capped at the same value.
MAX_LEN = 200

In [6]:
class PoemWithSummaryDataset(Dataset):
    """Tokenised ``input<SEP>target`` text pairs for language-model fine-tuning.

    Every example is the string ``input[i] + '<SEP>' + target[i]`` run through
    ``tokenizer`` with truncation and ``padding='max_length'``. The resulting
    ``input_ids`` and ``attention_mask`` are stored as tensors, in input order.
    """

    def __init__(self, input, target, tokenizer, gpt2_type='gpt2', max_length=MAX_LEN):
        """Encode all pairs eagerly.

        Args:
            input: sized iterable of prompt strings (e.g. a list, NumPy array
                or pandas Series).
            target: sized iterable of continuation strings, same length as
                ``input``.
            tokenizer: callable returning a mapping with ``'input_ids'`` and
                ``'attention_mask'`` entries; '<SEP>' is expected to be one of
                its special tokens.
            gpt2_type: unused; kept so existing callers keep working.
            max_length: length every encoding is truncated/padded to.

        Raises:
            ValueError: if ``input`` and ``target`` differ in length.
        """
        if len(input) != len(target):
            raise ValueError(
                "input and target must have the same length, got {} and {}".format(
                    len(input), len(target)))

        self.tokenizer = tokenizer
        self.data = []
        self.attn_masks = []

        # zip pairs the texts by position, so this also works for containers
        # whose [] operator is label-based (e.g. a filtered pandas Series).
        #
        # To choose max_length, tokenise '<BOS>' + text + '<EOS>' for the
        # inputs and the targets separately and inspect the token counts.
        for source_text, target_text in zip(input, target):
            encodings_dict = tokenizer(source_text + '<SEP>' + target_text,
                                       truncation=True,
                                       max_length=max_length,
                                       padding='max_length')
            self.data.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        """Number of encoded pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for example ``idx``."""
        return self.data[idx], self.attn_masks[idx]
        

In [7]:
#Helper functions
def get_train_val_size(split, dataset):
    """Return ``(train_size, val_size)`` for a ``split`` fraction of ``dataset``.

    The training size is ``split * len(dataset)`` truncated to an int; the
    validation set gets every remaining example, so the two sizes always add
    up to ``len(dataset)``.
    """
    total = len(dataset)
    n_train = int(split * total)
    return n_train, total - n_train
def format_time(elapsed):
    """Format a duration in seconds as ``str(timedelta)``, e.g. ``'1:01:01'``.

    The value is rounded to whole seconds before formatting.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [8]:
def train(poem_model,learning_rate=1e-4,eps=1e-8,warmup_steps=50,starting_epoch=0):
    """Fine-tune ``poem_model``, validating and saving it after every epoch.

    Reads the notebook globals ``poem_train_dataloader``,
    ``poem_val_dataloader``, ``device``, ``EPOCHS``, ``home_directory`` and
    ``model_name``.

    Args:
        poem_model: language model to optimise in place; called with
            ``labels`` so that the first element of its output is the loss.
        learning_rate: AdamW learning rate.
        eps: AdamW epsilon.
        warmup_steps: number of warm-up steps of the linear schedule.
        starting_epoch: first epoch index to run; epochs
            ``starting_epoch .. EPOCHS - 1`` are executed.

    Side effects:
        After every epoch the state dict is saved to
        ``home_directory/model_name`` (overwriting the previous one). When the
        loop ends, the per-epoch average losses are written to
        ``<model_name>_train_loss.csv`` and ``<model_name>_val_loss.csv``.

    NOTE(review): the schedule is always built for
    ``len(poem_train_dataloader) * EPOCHS`` steps and starts at step 0, even
    when ``starting_epoch > 0`` - confirm this is the intended resume behaviour.
    """
    optimizer = AdamW(poem_model.parameters(), lr=learning_rate, eps=eps)
    total_steps = len(poem_train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    start_time = time.time()
    train_loss = []
    val_loss = []
    for epoch_i in range(starting_epoch, EPOCHS):
        print(f'Epoch {epoch_i + 1} of {EPOCHS}')
        t0 = time.time()

        avg_train_loss = _train_one_epoch(poem_model, optimizer, scheduler)
        training_time = format_time(time.time() - t0)
        print(f'Average Training Loss: {avg_train_loss}. Epoch Training Time: {training_time}')

        avg_val_loss = _evaluate(poem_model)

        train_loss.append(avg_train_loss)
        val_loss.append(avg_val_loss)
        # Checkpoint every epoch so an interrupted session loses at most one epoch.
        torch.save(poem_model.state_dict(), "{}/{}".format(
                home_directory,model_name))
        print(f'Average Validation Loss: {avg_val_loss}')

    np.savetxt("{}/{}_train_loss.csv".format(home_directory, model_name), train_loss)
    np.savetxt("{}/{}_val_loss.csv".format(home_directory, model_name), val_loss)
    print(f'Total Training Time: {format_time(time.time()-start_time)}')


def _mean_loss(total_loss, num_batches):
    """Average a summed loss over ``num_batches``; NaN when there were no batches.

    Returning NaN instead of dividing by zero keeps an empty validation set
    from aborting the run before the epoch's checkpoint is written.
    """
    return total_loss / num_batches if num_batches else float('nan')


def _train_one_epoch(poem_model, optimizer, scheduler):
    """Run one optimisation pass over ``poem_train_dataloader``; return the mean batch loss."""
    poem_model.train()
    total_train_loss = 0
    for batch in poem_train_dataloader:
        # Note that the labels are the same as the input. This is because the
        # GPT2LMHeadModel that we are using shifts the labels by 1, meaning that
        # the label for each input token is the next input token. This is desired
        # when building a language model because we want the predicted output to
        # be the next most likely word in the sentence.
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)

        poem_model.zero_grad()
        outputs = poem_model(b_input_ids,
                             labels=b_input_ids,
                             attention_mask=b_masks,
                             token_type_ids=None)
        loss = outputs[0]
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

    return _mean_loss(total_train_loss, len(poem_train_dataloader))


def _evaluate(poem_model):
    """Return the mean batch loss of ``poem_model`` over ``poem_val_dataloader``."""
    poem_model.eval()
    total_eval_loss = 0
    # No gradients are needed for validation.
    with torch.no_grad():
        for batch in poem_val_dataloader:
            b_input_ids = batch[0].to(device)
            b_masks = batch[1].to(device)

            outputs = poem_model(b_input_ids,
                                 attention_mask=b_masks,
                                 labels=b_input_ids)
            total_eval_loss += outputs[0].item()

    return _mean_loss(total_eval_loss, len(poem_val_dataloader))


## Loading the data

In [9]:
# Load the headline/summary pairs the model is fine-tuned on.
# (An earlier experiment read 'poe_poems_with_summary_and_title.csv' and used
# its 'Summary' column as input and its 'Poem' column as target.)
poem_df = pd.read_csv(home_directory + 'InshortsCleanedData.csv')

# The headline is the conditioning text; the short article is the text to be
# generated. Append a slice such as [0:1000] to either line for a quick run.
inputs = poem_df['Headline'].values
targets = poem_df['Short'].values

# GPT-2 tokenizer used by PoemWithSummaryDataset to encode each pair. The BOS,
# EOS, SEP and PAD markers are registered as special tokens so the tokenizer
# maps each of them to a single id.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
special_tokens_dict = {
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'sep_token': '<SEP>',
    'pad_token': '<PAD>',
}
tokenizer.add_special_tokens(special_tokens_dict)

# Ordered collection of the tokenised "headline<SEP>short" examples.
poem_dataset = PoemWithSummaryDataset(inputs, targets, tokenizer)



In [10]:
# Split the dataset into a training set (80%) and a validation set (20%).
poem_train_size, poem_val_size = get_train_val_size(split=0.8, dataset=poem_dataset)

# FOR TESTING: swap the random split below for a fixed 800/200 slice:
#   poem_train_size = 800
#   poem_val_size = 200
#   poem_train_dataset = Subset(poem_dataset,range(0,800))
#   poem_val_dataset = Subset(poem_dataset,range(800,1000))

# Seed the split explicitly. The global RNG seeds are only set in a later
# cell, so without a dedicated generator the split changes on every kernel
# restart and a model resumed from a checkpoint would be validated on examples
# it has already trained on.
poem_train_dataset, poem_val_dataset = random_split(
    poem_dataset,
    [poem_train_size, poem_val_size],
    generator=torch.Generator().manual_seed(RANDOM_SEED))

# Training batches are drawn in random order, validation batches in fixed order.
poem_train_dataloader = DataLoader(poem_train_dataset,
                              sampler=RandomSampler(poem_train_dataset),
                              batch_size=BATCH_SIZE)
poem_val_dataloader = DataLoader(poem_val_dataset,
                            sampler=SequentialSampler(poem_val_dataset),
                            batch_size=BATCH_SIZE)

In [11]:
# Disabled scratch check, kept as a bare string so it never runs: it counts
# how many targets are longer than 150 to help choose the encoding length.
# NOTE(review): PoemWithSummaryDataset as defined in this notebook only stores
# `tokenizer`, `data` and `attn_masks`, so `poem_dataset.targets` would raise
# AttributeError if this were re-enabled as-is.
"""
#checking how large the encodings should be
num_over = 0
total = len(poem_dataset.targets)
for target in poem_dataset.targets:
  if len(target) > 150:
    num_over+=1
print(num_over)
print(total)
"""

'\n#checking how large the encodings should be\nnum_over = 0\ntotal = len(poem_dataset.targets)\nfor target in poem_dataset.targets:\n  if len(target) > 150:\n    num_over+=1\nprint(num_over)\nprint(total)\n'

## Set up the model

In [12]:
# Seed every RNG in use so sampling during generation is repeatable.
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Prefer the GPU, but fall back to the CPU so the notebook still runs on a
# runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Release memory left over from a previous run of this cell.
gc.collect()
torch.cuda.empty_cache()

# Set load_previous_state_dict to True to start from the weights saved at
# home_directory/previous_state_dict_location (e.g. to generate without
# retraining, or to continue training); set it to False to start from the
# stock GPT-2 weights.
load_previous_state_dict = True
previous_state_dict_location = "model_article_summaries"#"model_summary_poe_epoch_7"


# Set up the pretrained GPT-2 model.
# from_pretrained is a classmethod: it builds the configuration from the
# 'gpt2' checkpoint plus the overrides passed here. The vocabulary size is
# adjusted afterwards by resize_token_embeddings to cover the special tokens.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=True)
poem_model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
poem_model.resize_token_embeddings(len(tokenizer))

if load_previous_state_dict:
  # Load the tensors onto the CPU first: the model itself is still on the CPU
  # here, and this also lets a checkpoint saved on a GPU be read without one.
  poem_model.load_state_dict(
      torch.load("{}/{}".format(home_directory,previous_state_dict_location),
                 map_location='cpu'))

poem_model = poem_model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




## Train the model if desired

In [13]:
# Report the GPU attached to this session and its current memory use.
!nvidia-smi

Sat Jul 24 17:10:40 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    32W / 250W |   1451MiB / 16280MiB |     20%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [14]:
# Master switch: leave False to only generate with the loaded weights,
# set True to run train() below.
training_desired = False

# When training, the epoch index to continue from (0 for a fresh run).
starting_epoch = 0

# Optimiser hyperparameters handed to train().
learning_rate = 1e-4
eps = 1e-8
warmup_steps = 50

if training_desired:
  train(
      poem_model,
      learning_rate=learning_rate,
      eps=eps,
      warmup_steps=warmup_steps,
      starting_epoch=starting_epoch,
  )


## Use the model to generate article summaries

In [15]:
# Seed prompts for generation. Each one is a headline followed by '<SEP>',
# mirroring the "headline<SEP>short" layout of the training examples, so the
# model continues with the summary rather than with more headline text.
prompts = ["Supreme Court to go paperless in 6 months: CJI<SEP>",
           "Governer demands for more vaccines<SEP>",
           "Scientists say chocolate is good for you<SEP>",
           "Aus Open: Serena, Sharapova progress to round 3<SEP>"]


In [16]:
# Switch to inference behaviour before sampling.
poem_model.eval()

for prompt in prompts:
  # unsqueeze(0) adds the batch dimension generate() expects.
  encoded = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = encoded.to(device)
  sample_outputs = poem_model.generate(
                                  generated,
                                  do_sample=True,
                                  top_k=50,
                                  max_length=MAX_LEN,
                                  top_p=0.95,
                                  num_return_sequences=3,
                                  # Pad with the tokenizer's own '<PAD>' id
                                  # instead of letting generate() fall back to
                                  # the model's EOS id (and warn about it).
                                  pad_token_id=tokenizer.pad_token_id
                                  )
  print("\nPrompt: ", prompt, "\n Generated:\n")

  # Special tokens are stripped when decoding.
  for i, sample_output in enumerate(sample_outputs):
      print("{}: {}\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
  

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt:  Supreme Court to go paperless in 6 months: CJI<SEP> 
 Generated:

0: Supreme Court to go paperless in 6 months: CJIThe Supreme Court will go paperless in two months from now, said CJI Arup Rane. &#34;We will have to take up the problem of paperless notes with the states and have them made using the digital medium,&#34; he added. Currently, the Centre is on the verge of imposing the demonetisation policy if such an order are not carried out in time, he added.\n\n\n    \n\n\n\n\n\n\n\n\n\n\n     \n\n\n\n\n     \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

1: Supreme Court to go paperless in 6 months: CJIThe Supreme Court on Saturday told the Delhi High Court that it was working on the proposed Uniform Civil Code. The Law Ministry has said that the proposal for the Uniform Civil Code is being finalised in July and the government will submit the draft to the Delhi High Court within six months.A petition was filed by the Opposition in the Supreme Court demanding the government to &#34;pap

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt:  Governer demands for more vaccines<SEP> 
 Generated:

0: Governer demands for more vaccinesThe Centre on Monday requested the Centre for a total of ₹400 crore for the promotion of the safety of the vaccines. The response letter came following the National Vaccine Board recommendations which were found to be in violation of the Right to Life Act. Notably, India has now launched 23 vaccine-based drugs in the last five years.The Centre has asked all states, territories, union territories and union territories to provide every patient with ‘universal…virus-free’ vaccine.  \n\n\n\nThe government has directed all governments and Union Territories to provide the same. \n\n\n\n\n\n\n\n    \n\n\n\n\n  \

1: Governer demands for more vaccinesUnion Health Minister JP Nadda on Tuesday demanded that all new vaccines be put on an &#34;urgent&#34; list, calling for speedy screening of the entire country following the recent terror attack on India. He also said in the Parliament the governme

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Prompt:  Scientists say chocolate is good for you<SEP> 
 Generated:

0: Scientists say chocolate is good for youA Canadian research has suggested that chocolate may have an &#39;increased anti-aging&#39; activity. Further, scientists concluded that chocolate might increase heart health by promoting muscle-building and reducing the body&#39;s own cortisol production. Additionally, researchers found that chocolate-infused tea, which stimulates the production of hormones like cortisol, promotes heart health.The chocolate-flavoured beverage &#39;Chocolate Bar&#39; was first marketed by American coffee giant Starbucks in 2004. \n \n \n                  \n\n           \n           \n      \n  \n \n  \n \n \n

1: Scientists say chocolate is good for youIn an editorial published in the Indian Express, scientists have claimed that chocolate&#39;s beneficial properties in reducing cholesterol, cholesterol-lowering hormone and appetite have been scientifically proven. In one study, researchers t