In [1]:
!pip install transformers



In [2]:
import torch
import json
import csv
import os
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
import pandas as pd

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device}')

Using device: cuda


# Load GPT-2 Medium 

In [3]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', model_max_length=1024)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model = model.to(device)

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Helper functions

In [5]:
class ModelWrapper:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    def generate(self, prompt, keywords, category):
        input = '<|startoftext|>'
        if category:
            assert category in ['positive', 'negative']
            input += f'~`{category}'
        if keywords:
            keywords = [ k.replace(' ', '-') for k in keywords ]
            input += f"~^{' '.join(keywords)}"
        
        input += f"~@{prompt if prompt else ''}"
        input_encoded = self.tokenizer.encode(input, return_tensors='pt').to(self.device)

        # TODO: make these settings adjustable
        outputs = self.model.generate(
            input_encoded,
            do_sample=True, 
            max_length=600, 
            top_k=30, 
            top_p=0.96, 
            num_return_sequences=3
        )

        # TODO: select outputs with keywords (?)
        outputs_decoded = [ self.tokenizer.decode(out, skip_special_tokens=True) for out in outputs ]
        return outputs_decoded

def lambda_lr(epoch):
    pass


def get_exponential_decay():
    pass


# Reviews dataset

In [6]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [7]:
DATA_PATH = '/content/drive/My Drive/Data Science/Datasets'
!ls '$DATA_PATH'

reviews_nlp_encoded_balanced.txt  reviews_nlp_encoded.txt


In [8]:
from torch.utils.data import Dataset, DataLoader
MAX_SEQ_LEN = 1000

class ReviewsDataset(Dataset):
    def __init__(self, filename):
        super().__init__()

        with open(filename) as data_file:
          self.reviews = data_file.readlines()
        
        self.join_sequences()

    def join_sequences(self):
        joined = []
        temp_reviews_tens = None
        for review in self.reviews:
            # fit as many review sequences into MAX_SEQ_LEN sequence as possible
            review_tens = torch.tensor(tokenizer.encode(review, max_length=MAX_SEQ_LEN, truncation=True)).unsqueeze(0)

            # the first review sequence in the sequence
            if not torch.is_tensor(temp_reviews_tens):
                temp_reviews_tens = review_tens
                continue
            else:
                # the next review does not fit in so we process the sequence and leave the last review 
                # as the start for next sequence 
                if temp_reviews_tens.size()[1] + review_tens.size()[1] < MAX_SEQ_LEN:
                    # add the review to sequence, continue and try to add more
                    temp_reviews_tens = torch.cat([temp_reviews_tens, review_tens[:, 1:]], dim=1)
                    continue
                else:
                    work_reviews_tens, temp_reviews_tens = temp_reviews_tens, review_tens
            joined.append(work_reviews_tens)
          
        self.encoded_joined_seq = joined

    def __len__(self):
        return len(self.encoded_joined_seq)

    def __getitem__(self, idx):
        return self.encoded_joined_seq[idx]

review_dataset = ReviewsDataset(f'{DATA_PATH}/reviews_nlp_encoded_balanced.txt')
review_loader = DataLoader(review_dataset, batch_size=1, shuffle=True)

# Hyperparameters

In [9]:
BATCH_SIZE = 5
EPOCHS = 8
LEARNING_RATE = 1e-5
WARMUP_FRAC = 0.2
from transformers import AdamW, get_linear_schedule_with_warmup

In [10]:
len(review_loader)*EPOCHS//BATCH_SIZE

4744

# Training

In [12]:
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

steps = len(review_loader)*EPOCHS//BATCH_SIZE
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(WARMUP_FRAC*steps), num_training_steps=steps)


loss_history = []
batch_count = 0
reviews_count = 0
loss_every = 100

temp_reviews_tens = None
models_dir = "models"
if not os.path.exists(models_dir):
    os.mkdir(models_dir)

for epoch in range(EPOCHS):
    print(f"EPOCH {epoch+1}")
    epoch_loss_history = []
  
    for review_tens in review_loader:
        review_tens = review_tens.to(device)
                
        # sequence ready, process it trough the model
        optimizer.zero_grad()
        outputs = model(review_tens, labels=review_tens)
        loss, logits = outputs[:2]                        
        loss.backward()
        epoch_loss_history.append(loss.detach().item())
                       
        reviews_count += 1
        if reviews_count == BATCH_SIZE:
            reviews_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step() 

        if batch_count == loss_every:
            batch_count = 0
            avg_loss = np.array(epoch_loss_history)[-loss_every*BATCH_SIZE+1:].mean()
            print(f'Avg. loss: {avg_loss:.4f}')
    
    loss_history.append(torch.tensor(epoch_loss_history))
    # store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_dir, f"review_model_e{epoch+1}.pt"))

EPOCH 1
Avg. loss: 3.6317
Avg. loss: 3.2692
Avg. loss: 2.9938
Avg. loss: 2.8743
Avg. loss: 2.8129
EPOCH 2
Avg. loss: 2.5630
Avg. loss: 2.6838
Avg. loss: 2.6536
Avg. loss: 2.6651
Avg. loss: 2.7031
Avg. loss: 2.6969
EPOCH 3
Avg. loss: 2.6836
Avg. loss: 2.6036
Avg. loss: 2.6049
Avg. loss: 2.6010
Avg. loss: 2.6090
Avg. loss: 2.6174
EPOCH 4
Avg. loss: 2.5218
Avg. loss: 2.5201
Avg. loss: 2.5620
Avg. loss: 2.6118
Avg. loss: 2.5853
Avg. loss: 2.5530
EPOCH 5
Avg. loss: 2.5556
Avg. loss: 2.5799
Avg. loss: 2.5035
Avg. loss: 2.5234
Avg. loss: 2.5576
Avg. loss: 2.5856
EPOCH 6
Avg. loss: 2.5712
Avg. loss: 2.5169
Avg. loss: 2.5371
Avg. loss: 2.5633
Avg. loss: 2.5230
Avg. loss: 2.5354
EPOCH 7
Avg. loss: 2.5047
Avg. loss: 2.4946
Avg. loss: 2.5477
Avg. loss: 2.5100
Avg. loss: 2.5019
Avg. loss: 2.5609
EPOCH 8
Avg. loss: 2.4949
Avg. loss: 2.5348
Avg. loss: 2.5014
Avg. loss: 2.5037
Avg. loss: 2.5319
Avg. loss: 2.5259


# Load model

In [13]:
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.to(device)
model.load_state_dict(torch.load('models/review_model_e8.pt'))
model.eval()

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [14]:
wrapped_model = ModelWrapper(model, tokenizer)

In [19]:
wrapped_model.generate('', ['dog'], 'bad')

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


["<|startoftext|>~`bad~^dog~@This book had such a nice ring to it that if a dog bites you, it will eat you. It was so easy to read, so you could feel the force of the dog's bite. I loved seeing the dog on the porch in the backyard to see what was going on with the dog's coat, and the dog would come up and walk away with no coat, no leash. There were very few people around the house and I didn't notice any unusual behaviors, which made it even more interesting to me. The book is not for the timid dog, but the one with a weak heart and a lack of a strong jaw. It is a great story, but not one you would normally read for someone who is very young.",
 "<|startoftext|>~`bad~^dog~@This book is terrible! I read it in the winter when I was in Italy and I didn't care about being a dog. It really doesn't do it for me. The way they are treated in this book is disgusting. I didn't care that they had a leash around the dog. It's an awful dog that I was terrified of and was hoping she would be taken 