# Install HuggingFace dependencies

In [1]:
!pip install transformers
!pip install tqdm

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 9.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 37.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 41.8MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


# Mount Drive and Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys, os, json

In [4]:
root_folder = "/content/drive/My Drive/cs182_project/"
output_dir = os.path.join(root_folder, "./model_save/")
data_path = os.path.join(root_folder, "yelp_review_training_dataset.jsonl")

In [5]:
sys.path.append(root_folder)

# Load Finetuned GPT-2 Model

In [6]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm import tqdm

In [7]:
config = GPT2Config.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [8]:
if torch.cuda.is_available():
  device = "cuda"
  model.cuda()
else:
  device = "cpu"
model.to(device)
print(device)

cuda


# Do a sample review generation

In [9]:
model.eval()

prompt = "<|startoftext|> OMG I absolutely loved"

generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)
model = model.to(device)

print(generated)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=50, 
                                max_length = 300,
                                top_p=0.95, 
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[50257, 43821,   314,  5543,  6151]], device='cuda:0')
0:  OMG I absolutely loved this place. Amazing little cafe and I didn't realize there was a lot to choose from since that is what really got to mind during the week. Food was good and we had dessert. We were both impressed and the service was very helpful and pleasant. The place is well equipped with great music on the TVs. I had the red velvet pancakes and chocolates for the dessert for dessert and it was so good. The icing dessert. The cupcakes were perfectly proportioned. I could make it to my girlfriend. The service too many people in line.


1:  OMG I absolutely loved this place! They have good deals and they really take great care of their customers! They have so many different dresses for men. If I could give this place zero stars they would.


2:  OMG I absolutely loved this place. The menu is quite limited, but the food is so good, that is not my main preference. I found the churros here to be delicious, and had a 

In [10]:
from data_parsing import load_dataset

In [11]:
data = load_dataset(data_path)

In [12]:
reviews234 = [i for i in data if i[1] == 2 or i[1] == 3 or i[1] == 4]

In [13]:
len(reviews234)

142543

In [14]:
tokenizer.decode(sample_outputs[0], skip_special_tokens=True)

" OMG I absolutely loved this place. Amazing little cafe and I didn't realize there was a lot to choose from since that is what really got to mind during the week. Food was good and we had dessert. We were both impressed and the service was very helpful and pleasant. The place is well equipped with great music on the TVs. I had the red velvet pancakes and chocolates for the dessert for dessert and it was so good. The icing dessert. The cupcakes were perfectly proportioned. I could make it to my girlfriend. The service too many people in line."

In [18]:
new_data = []

In [None]:
for i in tqdm(range(0, 5000)):
  text, review = reviews234[i]
  prompt = "<|startoftext|>" + " " + ' '.join(text.split()[:15])
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)
  model = model.to(device)

  sample_outputs = model.generate(
                                  generated, 
                                  #bos_token_id=random.randint(1,30000),
                                  do_sample=True,   
                                  top_k=50, 
                                  max_length = 300,
                                  top_p=0.95, 
                                  num_return_sequences=1
                                  )
  
  gpt_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
  new_data.append((gpt_output, review))


  0%|          | 0/5000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/5000 [00:04<6:06:07,  4.39s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 2/5000 [00:05<4:37:05,  3.33s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 3/5000 [00:06<3:52:18,  2.79s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 4/5000 [00:08<3:16:38,  2.36s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 5/5000 [00:12<4:01:42,  2.90s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 6/5000 [00:13<3:27:55,  2.50s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 7/5000 [00:18<4:32:20,  3.27s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 8/5000 [00:19<3:2

In [None]:
new_data[10], data[10]

((' Like walking back in time, every Saturday morning my sister and I have tried the menu and always loved it.We will go again when we need something quick.',
  4.0),
 ('Like walking back in time, every Saturday morning my sister and I was in a bowling league and after we were done, we\'d spend a few quarters playing the pin ball machines until our mother came to pick us up.\n\nMy sister was daring and play the machines hard, she was afraid of that "tilt" showing up and freezing the game.  I, on the other hand was a bit more gentler and wanted to make sure I got my quarter\'s worth.\n\nThis place has rows and rows of machines, some are really old and some are more of a mid 80\'s theme.  There is even a Ms pac man!  It was fun to spend an afternoon playing the machines and remembering all the fun of my early teen years.',
  4.0))

In [None]:
str(new_data)

'[(\' Total bill for this horrible service? Over $8Gs. These crooks were a little rude and they said $7 per person would be out. The manager was on meth meth and no hello. I have a business complaint and he needs to be fired and screwed up in the front of the country.\', 1.0), (" I *adore* Travis at the Hard Rock\'s new Kelly Cardenas location and I had a fantastic experience. He\'s on time and friendly. They have complimentary wi-fi available to pick up beer, snacks, drink and even other items you can\'t find at their prices. I\'ve also been there more times for lunch and a drink than what it was at the time or cheaper. The St. O.J. (L*T. K*K) is a local brewery in the great to my god. I love the great place and the music. Best Dive Tea, I love the drinks!", 5.0), (" I have to say that this office really has it all. The doctors are wonderful and attentive. The front desk staff was so kind and sweet. They even gave the front desk to me. They answered all my questions before i got them 

In [None]:
rs = json.dumps(dict(new_data))

In [None]:
with open(os.path.join(root_folder, 'new_data3.json'), 'w') as f:
  json.dump(rs, f)