In [4]:
import sys, os, json

In [5]:
root_folder = "."
output_dir = os.path.join(root_folder, "./model_save_gpt_234/")
data_path = os.path.join(root_folder, "yelp_review_training_dataset.jsonl")

In [None]:
sys.path.append(root_folder)

# Load Finetuned GPT-2 Model

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm import tqdm
import pandas as pd

In [7]:
config = GPT2Config.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [28]:
if torch.cuda.is_available():
  device = "cuda"
  model.cuda()
else:
  device = "cpu"
model.to(device)
print(device)

cuda


# Do a sample review generation

In [92]:
model.eval()

prompt = "<|startoftext|> This place was"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = encoded_inputs['input_ids']
mask = encoded_inputs['attention_mask'].to(device)
generated = generated.to(device)
model = model.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.95, 
                                num_return_sequences=2,
                                attention_mask=mask
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[   40,  1101, 50258, 50258, 50258, 50258],
        [ 1870,  1194,  6827, 50258, 50258, 50258],
        [ 1870,   262,   845,   845,   938,   530]], device='cuda:0')
0: I'm pretty used to getting my fair share of service here.  If you're a frequent customer and you're in a pinch, this is a great place for you.  My friend had a very good experience here, I really wanted to give 5 stars, but it turns out they do have some bad food choices at lunch.  It's not all the same, the food is bland, the food is just not the same.  There was one thing that kept us coming back and the other was the manager.  She was very rude but she made up excuses for it and her boss was very nice.  I don't feel comfortable with management that often, I feel like this place was handled poorly by a friend of mine for years.


1: I'm a frequent guest here. It's like your normal Chinese buffet where you sit in one room, get something to eat, and watch a movie (which is why I come here frequently). But it get

In [10]:
from data_parsing import load_dataset

In [11]:
data = load_dataset(data_path)

In [12]:
reviews234 = [i for i in data if i[1] == 2 or i[1] == 3 or i[1] == 4]

In [13]:
len(reviews234)

142543

In [14]:
data_dir = os.path.join(root_folder, "yelp_review_val_dataset_234.csv")

In [15]:
df = pd.read_csv(data_dir, error_bad_lines=False, sep='\t', header='infer', lineterminator='\n')  
print(df)

                                                    text
0      Visited this place for the second time when I ...
1      We went here many times over the past few days...
2      1.5 stars. This location has been a revolving ...
3      After trying many of the dispensaries througho...
4      Hard to figure out how to get a table, but the...
...                                                  ...
28504  I have only tried the italian beef. Outstandin...
28505  I have had these pop ups since 2008 - they do ...
28506  Came here for lunch and ordered fish. Service ...
28507  Forces new patients to give a credit card # or...
28508  First of all, they are supposed to open at 9:0...

[28509 rows x 1 columns]


In [16]:
labels_dir = os.path.join(root_folder, "yelp_review_val_dataset_234_labels.csv")

In [17]:
labels_df = pd.read_csv(labels_dir, error_bad_lines=False, sep='\t', header='infer', lineterminator='\n')  
print(labels_df)

       text
0       4.0
1       3.0
2       2.0
3       3.0
4       4.0
...     ...
28504   4.0
28505   2.0
28506   3.0
28507   2.0
28508   2.0

[28509 rows x 1 columns]


In [18]:
labels234 = list(labels_df.text)

In [19]:
reviews234 = list(df.text)

In [20]:
len(reviews234), len(labels234)

(28509, 28509)

In [21]:
reviews234[0]

'Visited this place for the second time when I was in Montreal in August. I went for the first time in October after hearing marvelous things about the charcuterie (I was on a charcuterie kick at the time). My friend, Jason and I split the charcuterie plate with foie gras as well as a cheese plate (large portions--we had left overs). We both loved it! This time when I came, I got the charcuterie plate and a cronut (caramel). The charcuterie plate had changed a bit-- a couple items were replaced with new things. There was one "deli" style meat that I was not a fan of. Other than that I loved everything! There was a new ginger bread that they served with the foie grad that I was obsessed with. The combination was perfect! I also thoroughly enjoyed my cronut. They were a trend in the states for a bit, but sort of died out, so when I saw them on the menu, I knew I had to get one. It did not disappoint. The one negative thing I have to say is that the service did not seem to be super great/

In [22]:
labels234[0]

4.0

# Create data for N number of samples

In [97]:
new_data = []
num_to_generate = 2
batch_size = 8

In [99]:
for i in tqdm(range(0, 28000 - batch_size, batch_size)):
  text = reviews234[i % len(labels234) : (i + batch_size) % len(labels234)]
  review = labels234[i % len(labels234) : (i + batch_size) % len(labels234)]
#   prompt = "<|startoftext|>" + " " + ' '.join(text.split()[:10])
  encodings = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
#   generated = torch.tensor([tokenizer.encode("<|startoftext|>" + " " + ' '.join(p.split()[:7]), ) for p in text])
#   generated = generated.to(device)
  model = model.to(device)

  sample_outputs = model.generate(
                                  encodings['input_ids'], 
                                  #bos_token_id=random.randint(1,30000),
                                  do_sample=True,   
                                  top_k=50, 
                                  max_length = 300,
                                  top_p=0.95, 
                                  num_return_sequences=num_to_generate,
                                  attention_mask=encodings['attention_mask']
                                  )
    
  assert len(sample_outputs) == batch_size * num_to_generate
  for i, sample_output in enumerate(sample_outputs):
#     print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
    gpt_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()
    new_data.append((gpt_output, review[i % num_to_generate]))
  
#   gpt_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()
#   new_data.append((gpt_output, review))


  0%|          | 0/3499 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 621, but ``max_length`` is set to 300.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
  0%|          | 1/3499 [00:00<09:55,  5.88it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 885, but ``max_length`` is set to 300.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
  0%|          | 2/3499 [00:00<09:24,  6.20it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/3499 [00:00<10:18,  5.66it/s]


RuntimeError: CUDA out of memory. Tried to allocate 56.00 MiB (GPU 0; 7.76 GiB total capacity; 2.14 GiB already allocated; 26.94 MiB free; 2.45 GiB reserved in total by PyTorch)

In [None]:
new_data[10], data[10]

In [None]:
rs = json.dumps(dict(new_data))

In [None]:
with open(os.path.join(root_folder, 'new_data_234_56000_samples.json'), 'w') as f:
  json.dump(rs, f)