In [7]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

!pip install transformers
from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
import json

import pandas as pd  



In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|BOS|>', eos_token='<|EOS|>', pad_token='<|pad|>') #gpt2-medium


class GPT2Dataset(Dataset):
  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):
    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []
    for txt in txt_list:
      encodings_dict = tokenizer('<|BOS|>'+ txt + '<|EOS|>', truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx] 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# import pandas as pd
# xx = pd.read_csv('summary.csv', encoding = "ISO-8859-1")
# print(xx)

file = 'books_dataset.json'
with open(file) as train_file:
    dict_train = json.load(train_file)

# converting json dataset from dictionary to dataframe
train = pd.DataFrame.from_dict(dict_train, orient='index')
train.reset_index(level=0, inplace=True)

xx=train
# data = json.loads(books_dataset)
# df = pd.json_normalize(data['results'])
# xx = df
#print(xx.iloc[:,1])


In [10]:


summaries = xx.iloc[:,1].copy()

print(summaries)

0      <|BOS|>The Time Machine<|SEP|>time faints his ...
1      <|BOS|>The War of the Worlds<|SEP|>envious eye...
2      <|BOS|>A Princess of Mars<|SEP|>lower atmosphe...
3      <|BOS|>Youth<|SEP|>space aliens tiny dead iden...
4      <|BOS|>2 B R 0 2 B<|SEP|>population control ac...
                             ...                        
280    <|BOS|>Rip Foster Rides the Gray Planet<|SEP|>...
281    <|BOS|>Eastern Standard Tribe<|SEP|>members ps...
282    <|BOS|>Man of Many Minds<|SEP|>minds a secret ...
283    <|BOS|>The Players<|SEP|>aliens not nonsense t...
284    <|BOS|>Rip Foster in Ride the Gray Planet<|SEP...
Name: 0, Length: 285, dtype: object


In [11]:
dataset = GPT2Dataset(summaries, tokenizer, max_length=768)
train_size = int(0.7 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [49]:
def preprocess_val_dataset(dataset):
  
  s = tokenizer.decode(dataset[0][0]).split('<|SEP|>')
  new_output = s[len(s)-1]
  new_output = s[len(s)-1]
  new_input = s[0]
  for i in range(1,len(s)-1):
    new_input = new_input+'<|SEP|>'+s[i]

  new_input = new_input + '<|SEP|>'
  print(new_input)
  input_ids = tokenizer.encode(new_input, return_tensors='pt')
  output_ids = tokenizer.encode(new_output, return_tensors='pt')

  return input_ids[0], torch.ones(len(input_ids[0])), output_ids

In [40]:
batch_size=2
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [13]:
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))


Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]

Embedding(50260, 768)

In [14]:
epochs = 5
learning_rate = 10e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 100

In [15]:
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )

total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)



In [50]:
import time
import datetime
import random
import tensorflow as tf

!pip install ipdb
device = torch.device("cuda")
model.cuda()
total_t0 = time.time()
model = model.to(device)

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))
training_stats = []

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        # cc = batch.copy()
        # labels = []
        # for en, ex in enumerate(cc[0]):
        #   import ipdb;ipdb.set_trace()
        #   ind_29 = (ex == 29).nonzero(as_tuple=True)
        #   ind_27 = (ex == 27).nonzero(as_tuple=True)
        #   batch[0][en] = cc[0][en][ind_29[0][1]+1:ind_27[0][2]]
        #   batch[1][en] = cc[1][en][ind_29[0][1]+1:ind_27[0][2]]
        #   labels.append(cc[0][en][ind_29[0][2]+1:])
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval()

            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000),
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train()

        loss.backward()

        optimizer.step()

        scheduler.step()

    # average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        input_ids, masks, output_ids = preprocess_val_dataset(batch)
        #b_input_ids = batch[0].to(device)
        b_input_ids = input_ids.to(device)
        #b_labels = batch[0].to(device)
        b_labels = input_ids.to(device)
        #b_masks = batch[1].to(device)
        b_masks = masks.to(device)
        #print(f"DEBA: {b_labels}")
        
        with torch.no_grad():        

            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
          
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))


Training...

  Average training loss: 0.93
  Training epoch took: 0:01:40

Running Validation...
<|BOS|> <|BOS|> Skylark Three<|SEP|>protagonist planet personal to wear summon Help the Suburb boat a total battleship a spaceship capital Power full check Progress crane<|SEP|>
<|BOS|> <|BOS|> When the Sleeper Wakes<|SEP|>confronts huge wealth institutions Workers Unskilled workers anti-aircraft guns political entities other cities unspecified joys industry Grahams aeroplane crashes wind-mills<|SEP|>
<|BOS|> <|BOS|> A Martian Odyssey<|SEP|>small pyramids healthy tissue diseased The next cart creature creatures every the a large birdlike rubbery plants tentacled dust shoots<|SEP|>
<|BOS|> <|BOS|> The Time Machine<|SEP|>weather faints his machine the easy lichenose vegetation various days to possess a desk template evening survival huge butterflies heavy doors<|SEP|>
<|BOS|> <|BOS|> Youth<|SEP|>space aliens lowercase died identifiable physicist descriptions interstellar trade suitable nut N

In [51]:
model.eval()

prompt = "<|BOS|>"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=5, 
                                max_length = 300,
                                top_p=0.9, 
                                num_return_sequences=3
                                )
for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257]], device='cuda:0')
0: The Sky Is Falling<|SEP|>life forms the sky above the Earth a hollow shell a hollow shell the world a hollow shell a higher energy source the sky above the Earth the sun a hollow shell the sun a higher energy source the sun<|SEP|>The story takes place in Alaska, a vibrant, utopian future where the sun and stars form a spherical, globular, "zone." The sky above the earth is a hollow shell, with no apparent effect on the composition of the sky above. The sky above is a literal dome with no apparent effect on the composition of the sky above. The sky above the Earth is a hollow shell with features of a higher energy source than the sun, but the effect of the effect is temporary.
A series of adventures ensues as Earth and Mercury pass within the sphere of their own destinies, and the sun and stars become a hollow shell. Eventually, the sun and stars die, leaving only a few visible "sphere" within the dome. The sky above the Earth is a hollow shell with