In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

!pip install transformers
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
import json

import pandas as pd  



In [None]:
# Stock GPT-2 vocabulary extended with dedicated begin/end/padding markers.
# These three tokens are new to the vocabulary, so the model's embedding
# matrix has to be resized to len(tokenizer) before training.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2',  # gpt2-medium
    bos_token='<|BOS|>',
    eos_token='<|EOS|>',
    pad_token='<|pad|>',
)


class GPT2Dataset(Dataset):
  """Pre-tokenised text examples for causal language-model fine-tuning.

  Every text is wrapped in the ``<|BOS|>`` / ``<|EOS|>`` markers, encoded with
  the supplied tokenizer (truncated and padded to ``max_length``) and stored
  as a pair of tensors: token ids and attention mask.

  Args:
    txt_list: iterable of strings, one training example each.
    tokenizer: callable returning a mapping with ``'input_ids'`` and
      ``'attention_mask'`` when called as
      ``tokenizer(text, truncation=..., max_length=..., padding=...)``.
    gpt2_type: unused; kept so existing callers keep working.
    max_length: length every encoded example is truncated/padded to.
  """

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    bos_marker = '<|BOS|>'
    eos_marker = '<|EOS|>'

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    for txt in txt_list:
      # Only add the markers that are missing: texts that already carry
      # '<|BOS|>' (or '<|EOS|>') must not end up with the marker twice.
      text = txt if txt.startswith(bos_marker) else bos_marker + txt
      if not text.endswith(eos_marker):
        text = text + eos_marker

      # NOTE(review): with truncation=True an over-long text is cut to
      # max_length tokens, which presumably drops the trailing end marker --
      # confirm whether that matters for the data in use.
      encodings_dict = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

  def __len__(self):
    """Number of encoded examples."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` tensors for example ``idx``."""
    return self.input_ids[idx], self.attn_masks[idx]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
file = 'books_dataset.json'

# Decode explicitly as UTF-8 (the standard JSON interchange encoding) instead
# of relying on the platform's locale default.
with open(file, encoding='utf-8') as train_file:
    dict_train = json.load(train_file)

# One row per top-level JSON key; reset_index then moves those keys out of the
# index into the first column, leaving the values in the second column.
train = pd.DataFrame.from_dict(dict_train, orient='index').reset_index(level=0)

xx = train
print(xx.iloc[:, 1])


0      <|BOS|>The Time Machine<|SEP|>time faints his ...
1      <|BOS|>The War of the Worlds<|SEP|>envious eye...
2      <|BOS|>A Princess of Mars<|SEP|>lower atmosphe...
3      <|BOS|>Youth<|SEP|>space aliens tiny dead iden...
4      <|BOS|>2 B R 0 2 B<|SEP|>population control ac...
                             ...                        
140    <|BOS|>Rip Foster Rides the Gray Planet<|SEP|>...
141    <|BOS|>Eastern Standard Tribe<|SEP|>members ps...
142    <|BOS|>Man of Many Minds<|SEP|>minds a secret ...
143    <|BOS|>The Players<|SEP|>aliens no nonsense th...
144    <|BOS|>Rip Foster in Ride the Gray Planet<|SEP...
Name: 0, Length: 145, dtype: object


In [None]:


# Independent copy of the second column of `xx` (the text used for training),
# so later changes to `summaries` cannot touch the source frame.
summaries = xx.iloc[:, 1].copy(deep=True)

print(summaries)

0      <|BOS|>The Time Machine<|SEP|>time faints his ...
1      <|BOS|>The War of the Worlds<|SEP|>envious eye...
2      <|BOS|>A Princess of Mars<|SEP|>lower atmosphe...
3      <|BOS|>Youth<|SEP|>space aliens tiny dead iden...
4      <|BOS|>2 B R 0 2 B<|SEP|>population control ac...
                             ...                        
140    <|BOS|>Rip Foster Rides the Gray Planet<|SEP|>...
141    <|BOS|>Eastern Standard Tribe<|SEP|>members ps...
142    <|BOS|>Man of Many Minds<|SEP|>minds a secret ...
143    <|BOS|>The Players<|SEP|>aliens no nonsense th...
144    <|BOS|>Rip Foster in Ride the Gray Planet<|SEP...
Name: 0, Length: 145, dtype: object


In [None]:
# Tokenise every summary once, padded/truncated to 768 tokens.
dataset = GPT2Dataset(summaries, tokenizer, max_length=768)

# 70 % of the examples go to training, the remainder to validation.
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the same examples land in train/validation on every run;
# without a fixed generator the partition (and therefore the validation loss)
# differs each time the notebook is executed.
split_seed = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)


In [None]:
batch_size = 2

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation batches are read in dataset order.
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)

In [None]:
# Pretrained GPT-2 configuration; hidden states are not returned by the model.
configuration = GPT2Config.from_pretrained(
    'gpt2',
    output_hidden_states=False,
)

# Load the pretrained language-model weights under that configuration.
model = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    config=configuration,
)

# The tokenizer was extended with extra tokens (bos_token, etc.), so the
# embedding matrix must grow to the tokenizer's size; otherwise the new token
# ids would fall outside the model's embedding table.
model.resize_token_embeddings(len(tokenizer))


Embedding(50260, 768)

In [None]:
# Number of full passes over the training set.
epochs = 5

# Peak learning rate reached after warm-up. Same value as before (10e-4),
# written in normalised form so it is not misread as 1e-4.
# NOTE(review): 1e-3 looks high for fine-tuning a pretrained transformer --
# confirm this is the intended value.
learning_rate = 1e-3

# Number of scheduler steps spent ramping the learning rate up from zero.
# Kept as an int because it is a step count.
warmup_steps = 100

# Numerical-stability term passed to the optimizer.
epsilon = 1e-8

# Print a generated sample every `sample_every` training steps.
sample_every = 100

In [None]:
# AdamW over every model parameter, using the learning rate and epsilon
# chosen in the hyperparameter cell.
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_steps = epochs * len(train_dataloader)

# Linear warm-up to the peak learning rate, then linear decay over the
# remaining training steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)



In [None]:
import time
import datetime
import random

# Train on the GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA instead of crashing.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A single transfer is enough: .to(device) moves the model's parameters to
# the selected device.
model = model.to(device)

# Wall-clock start of the whole training run (taken after the model transfer).
total_t0 = time.time()

def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    The duration is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)
training_stats = []

# Label value that the language-model loss skips.
IGNORE_INDEX = -100


def _labels_ignoring_padding(input_ids, attention_mask):
    """Return a copy of ``input_ids`` with padded positions set to IGNORE_INDEX.

    Every example is padded to a fixed length, so without this the loss would
    be averaged mostly over padding targets instead of real text.
    """
    return input_ids.masked_fill(attention_mask == 0, IGNORE_INDEX)


# Make generate() stop on / pad with the tokens used during fine-tuning rather
# than the stock GPT-2 end-of-text id. Padding positions are no longer
# trained on, so generation has to terminate at the end marker.
for _gen_cfg in (model.config, getattr(model, "generation_config", None)):
    if _gen_cfg is not None:
        _gen_cfg.eos_token_id = tokenizer.eos_token_id
        _gen_cfg.pad_token_id = tokenizer.pad_token_id

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        # Each batch is an (input_ids, attention_mask) pair.
        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Causal LM objective: targets are the inputs themselves (the model
        # shifts them internally), minus the padded positions.
        b_labels = _labels_ignoring_padding(b_input_ids, b_masks)

        model.zero_grad()

        outputs = model(b_input_ids,
                        labels=b_labels,
                        attention_mask=b_masks)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()

        optimizer.step()

        scheduler.step()

        # Get sample every x batches. Done after the optimizer step so the
        # autograd graph of this batch is not kept alive during generation.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            # No prompt is given: generation starts from a randomly chosen
            # token id so that successive samples differ.
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,
                                    top_k=50,
                                    max_length = 200,
                                    top_p=0.95,
                                    num_return_sequences=1,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id
                                )
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            model.train()

    # average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:

        b_input_ids = batch[0].to(device)
        b_masks = batch[1].to(device)
        # Same target construction as in training so both losses are comparable.
        b_labels = _labels_ignoring_padding(b_input_ids, b_masks)

        # No gradients are needed for evaluation.
        with torch.no_grad():

            outputs = model(b_input_ids,
                            attention_mask=b_masks,
                            labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 9.25
  Training epoch took: 0:00:25

Running Validation...
  Validation Loss: 3.36
  Validation took: 0:00:04

Training...

  Average training loss: 2.65
  Training epoch took: 0:00:27

Running Validation...
  Validation Loss: 2.62
  Validation took: 0:00:04

Training...

  Average training loss: 2.05
  Training epoch took: 0:00:27

Running Validation...
  Validation Loss: 2.53
  Validation took: 0:00:04

Training...

  Average training loss: 1.74
  Training epoch took: 0:00:26

Running Validation...
  Validation Loss: 2.55
  Validation took: 0:00:04

Training...

  Average training loss: 1.55
  Training epoch took: 0:00:27

Running Validation...
  Validation Loss: 2.58
  Validation took: 0:00:04

Training complete!
Total training took 0:02:31 (h:mm:ss)


In [None]:
model.eval()

# Start generation from the begin-of-sequence marker used in the training text.
prompt = "<|BOS|>"

# Encode the prompt and add a leading batch dimension of size 1.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

# Sample three continuations with top-k / nucleus sampling. The end and
# padding ids are passed explicitly: otherwise generate() falls back to the
# model config's stock end-of-text id instead of the tokenizer's '<|EOS|>'
# token, so sequences would never stop at the fine-tuned end marker.
sample_outputs = model.generate(
                                generated,
                                do_sample=True,
                                top_k=5,
                                max_length = 300,
                                top_p=0.9,
                                num_return_sequences=3,
                                eos_token_id=tokenizer.eos_token_id,
                                pad_token_id=tokenizer.pad_token_id
                                )
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0: A Journey to the Centre of the Earth<|SEP|>The adventures of the protagonists are set in a parallel universe, with the protagonists being the protagonist, the protagonist being a young boy, and the protagonist being a woman.
The protagonist is a young woman who lives in a parallel universe to the protagonist's story. The protagonist is an experienced musician and musician, with a fascination with music, and the protagonist's love of classical music and classical music is expressed in a way reminiscent of a similar love story between the two protagonists. The story follows the protagonist, a musician, as he travels through the parallel universe, and learns how to make music.
The story follows the protagonist and the protagonist's adventures through the various worlds of the alternate universe.
The protagonist's love of classical music has influenced his behavior and has inspired him to become a musician. The protagonist's love of classical music has