Installing dependencies

In [None]:
!pip install transformers
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
import joblib
from google.colab import drive
drive.mount('/content/gdrive')

Loading the dataset

In [None]:
# Read the CSV from the mounted Drive and keep only its 'Desc' column.
# `stories` and `df` are two names for that same Series; `df` is the one the
# Story dataset class iterates over.
catalogue = pd.read_csv('/content/gdrive/MyDrive/Colab Notebooks/children_books.csv')
df = stories = catalogue['Desc']

Creating Dataset class

In [None]:
class Story(Dataset):
    """Dataset of tokenised texts, each wrapped as ``<|control_code|>text<|endoftext|>``.

    Every item is a 1-D ``torch`` tensor of token ids produced by the GPT-2
    tokenizer named by ``gpt2_type``.
    """

    # Upper bound on the number of examples kept when ``truncate=True``.
    MAX_EXAMPLES = 20000

    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, texts=None):
        """Tokenise the texts and store them as tensors.

        Args:
            control_code: Label interpolated into the ``<|...|>`` prefix of every
                example. It is formatted with ``str()`` semantics, so it should
                be a short string.
            truncate: When true, keep at most ``MAX_EXAMPLES`` examples.
            gpt2_type: Name passed to ``GPT2Tokenizer.from_pretrained``.
            max_length: Maximum number of *characters* kept from each text
                (the slice is applied before tokenisation).
            texts: Iterable of strings to tokenise. Defaults to the
                notebook-level ``df`` so existing calls keep working.
        """
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.story = []

        if texts is None:
            texts = df

        # The prefix is the same for every example, so format it only once.
        prefix = f"<|{control_code}|>"
        limit = self.MAX_EXAMPLES if truncate else None

        for row in texts:
            # Stop before tokenising rows that would be discarded by the cap.
            if limit is not None and len(self.story) >= limit:
                break
            # Missing values (e.g. NaN floats from pandas) cannot be sliced.
            if not isinstance(row, str):
                continue
            token_ids = self.tokenizer.encode(f"{prefix}{row[:max_length]}<|endoftext|>")
            self.story.append(torch.tensor(token_ids))

        self.char_count = len(self.story)

    def __len__(self):
        """Return the number of stored examples."""
        return self.char_count

    def __getitem__(self, item):
        """Return the token-id tensor at position ``item``."""
        return self.story[item]

Creating dataset object

In [None]:
# The first argument is the control-code label that Story interpolates into the
# "<|...|>" prefix of every example, so it must be a short string. The texts
# themselves are read from `df` inside Story; passing `df` here would embed the
# Series' printed representation into each training example.
dataset = Story("story", truncate=True, gpt2_type="gpt2")

Loading pre-trained model and tokenizer

In [None]:
# Load the pretrained GPT-2 language model and its tokenizer from the same
# checkpoint name.
checkpoint = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [None]:
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into ``packed_tensor`` along dimension 1.

    Returns a ``(tensor, merged, leftover)`` triple:

    * nothing packed yet -> ``(new_tensor, True, None)``
    * combined length exceeds ``max_seq_len`` -> ``(packed_tensor, False, new_tensor)``
    * otherwise -> ``new_tensor`` followed by ``packed_tensor`` minus its first
      column, with ``True`` and ``None``.
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        # The first column of the already-packed tensor is dropped on merge.
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    # Too long to merge: hand the new tensor back to the caller untouched.
    return packed_tensor, False, new_tensor

Fine-tuning/training function

In [None]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` using packed sequences and gradient accumulation.

    Entries are drawn one at a time, merged with ``pack_tensor`` until the next
    entry would exceed ``max_seq_len``, and the packed sequence is fed to the
    model with itself as labels. Gradients from ``batch_size`` packed sequences
    are accumulated before each optimizer/scheduler step.

    Args:
        dataset: Map-style dataset yielding 1-D token-id tensors.
        model: Model called as ``model(input_ids, labels=input_ids)`` whose
            first output is the loss.
        tokenizer: Accepted for call compatibility; not used here.
        batch_size: Number of packed sequences accumulated per optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Learning rate for AdamW.
        max_seq_len: Maximum packed length passed to ``pack_tensor``.
        warmup_steps: Warmup steps for the linear schedule.
        gpt2_type: Accepted for call compatibility; not used here.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Accepted for call compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` after each epoch.

    Returns:
        The trained model (on the training device).
    """
    import os  # used for checkpoint paths; not among the notebook's top-level imports

    # Fall back to CPU so the function is usable without a GPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The linear schedule needs a positive step budget: with a non-positive
    # total its multiplier is clamped to zero as soon as warmup ends. Packing
    # only reduces the number of sequences, so this is an upper bound.
    steps_per_epoch = (len(train_dataloader) + batch_size - 1) // batch_size
    total_steps = max(1, epochs * steps_per_epoch)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    def packed_sequences():
        """Yield packed sequences for one epoch, including the final partial one."""
        packed = None
        for entry in tqdm(train_dataloader):
            packed, carry_on, remainder = pack_tensor(entry, packed, max_seq_len)
            if not carry_on:
                yield packed
                # The entry that did not fit starts the next packed sequence
                # instead of being thrown away.
                packed = remainder
        if packed is not None:
            yield packed

    def apply_update():
        """Apply the accumulated gradients and advance the schedule."""
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    loss = 0
    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)

        pending = 0  # backward passes since the last optimizer step
        for input_tensor in packed_sequences():
            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            pending += 1
            if pending == batch_size:
                apply_update()
                pending = 0

        # Do not lose the gradients of a final, smaller accumulation group.
        if pending:
            apply_update()

        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [None]:
# Fine-tune with the default hyper-parameters of train().
model = train(dataset=dataset, model=model, tokenizer=tokenizer)

Saving/loading the model

In [None]:
# torch.save(model, '/content/story_model.pt')
# Rebind `model` to the object stored on Drive, mapped onto the CPU.
# NOTE(review): torch.load unpickles the file's contents - only load
# checkpoints from a source you trust.
checkpoint_path = '/content/gdrive/MyDrive/Colab Notebooks/story_model.pt'
model = torch.load(checkpoint_path, map_location='cpu')

In [None]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, # maximum number of tokens sampled per entry
    top_p=0.8,
    temperature=1.,
):
    """Sample ``entry_count`` continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Language model whose first output, when called without labels,
            is the logits tensor.
        tokenizer: Object providing ``encode(str)`` and ``decode(ids)``.
        prompt: Text every continuation starts from.
        entry_count: Number of independent continuations to sample.
        entry_length: Maximum number of tokens sampled per continuation.
        top_p: Cumulative-probability threshold for the nucleus filter.
        temperature: Logit divisor; non-positive values are treated as 1.0.

    Returns:
        A list of ``entry_count`` decoded strings. Each ends with
        ``<|endoftext|>``: either sampled by the model, or appended when the
        token budget ran out first.
    """
    model.eval()

    # Build inputs on the model's device so generation also works on GPU.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    filter_value = -float("Inf")
    safe_temperature = temperature if temperature > 0 else 1.0

    # Loop-invariant tokenizer work is done once.
    eos_ids = tokenizer.encode("<|endoftext|>")
    prompt_ids = tokenizer.encode(prompt)

    generated_list = []

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(prompt_ids, device=device).unsqueeze(0)

            for i in range(entry_length):
                # No labels: the loss is not needed for sampling.
                logits = model(generated)[0]
                logits = logits[:, -1, :] / safe_temperature

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mark tokens past the top-p mass; shift right by one so the
                # token that crosses the threshold is kept, and always keep
                # the most likely token.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = False

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                # Budget exhausted before the model emitted the end marker.
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

In [None]:
# Sample three continuations of the prompt on the CPU.
generated_text = generate(
    model=model.to('cpu'),
    tokenizer=tokenizer,
    prompt='A purple patch',
    entry_count=3,
    temperature=0.9,
)

100%|██████████| 3/3 [00:13<00:00,  4.41s/it]


In [None]:
# Substrings stripped from every generated entry, applied in this order.
_MARKERS = ('\n', '\n\n', '\nA', '<|endoftext|>')


def _strip_markers(text):
    """Return ``text`` with each marker removed, one after another."""
    for marker in _MARKERS:
        text = text.replace(marker, '')
    return text


clean_text = []
for raw_entry in generated_text:
    stripped = _strip_markers(raw_entry)
    # The length test uses the first sentence of the *raw* entry.
    opening = raw_entry.split('.', 1)[0]
    if len(opening) > 40:
        # Long opening sentence: keep only that sentence, re-adding its period.
        clean_text.append(stripped.split('.', 1)[0] + '.')
    else:
        clean_text.append(stripped)

for entry in clean_text:
    print(entry, '\n')

A purple patch is just one of many. The crescent moon is often the brightest star in the constellation Aquarius. It is also the brightest star in the night 

A purple patch, which turns the surface of a lake green, and three large red lines, and three small circles, which are the circumference of a ball. 

A purple patch on the back of her back gives her the feeling of a huge green hand. 

