# Install HuggingFace dependencies

In [None]:
!pip install transformers
!pip install tqdm

# Mount Drive and Imports

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): root_folder is set to "." further down, so no path in the
# visible cells points into the mount -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys, os, json

In [None]:
# Base directory that the other paths in this notebook are joined onto.
root_folder = "."
# Directory the GPT-2 config, weights and tokenizer are loaded from.
# (The joined value is "././model_save_gpt_234/"; the extra "./" is harmless.)
output_dir = os.path.join(root_folder, "./model_save_gpt_234/")
# JSONL file that is handed to load_dataset.
data_path = os.path.join(root_folder, "yelp_review_training_dataset.jsonl")

In [None]:
# Put root_folder on the import search path
# (presumably so the local data_parsing module imported later can be found).
sys.path.append(root_folder)

# Load Finetuned GPT-2 Model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm import tqdm
import pandas as pd

In [None]:
# Restore the configuration, language-model weights and tokenizer from output_dir.
# NOTE(review): `config` is not referenced again in the visible cells.
config = GPT2Config.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# A single .to() call places the model on the chosen device.
model.to(device)
print(device)

# Do a sample review generation

In [None]:
# Put the model in evaluation mode before sampling.
model.eval()

prompt = "<|startoftext|> This place was"

# Encode the prompt and wrap it in a batch of size one on the target device.
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)
model = model.to(device)

print(generated)

# Sampling settings: top-k / nucleus sampling, three returned sequences,
# max_length of 300.
generation_kwargs = dict(
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=3,
)
sample_outputs = model.generate(generated, **generation_kwargs)

# Print every sampled sequence as text, numbered from 0.
for idx, sequence in enumerate(sample_outputs):
  print(f"{idx}: {tokenizer.decode(sequence, skip_special_tokens=True)}\n\n")

In [None]:
from data_parsing import load_dataset

In [None]:
# Parse the file at data_path with the project's load_dataset helper.
# NOTE(review): the return format is not visible here; a later cell indexes
# each entry with [1] -- confirm the entry layout against data_parsing.
data = load_dataset(data_path)

In [None]:
# Keep only the entries whose field at index 1 (presumably the star rating)
# is 2, 3 or 4.
reviews234 = [entry for entry in data if entry[1] in (2, 3, 4)]

In [None]:
# Display how many entries reviews234 currently holds.
len(reviews234)

In [None]:
# Path of the validation-review CSV (despite the name, a file, not a directory).
data_dir = os.path.join(root_folder, "yelp_review_val_dataset_234.csv")

In [None]:
# Read the tab-separated review file; malformed rows are dropped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. `on_bad_lines='warn'` reproduces what
# `error_bad_lines=False` did (drop the row, emit a warning).
df = pd.read_csv(data_dir, on_bad_lines='warn', sep='\t', header='infer', lineterminator='\n')
print(df)

In [None]:
# Path of the CSV holding the labels (despite the name, a file, not a directory).
labels_dir = os.path.join(root_folder, "yelp_review_val_dataset_234_labels.csv")

In [None]:
# Read the tab-separated label file; malformed rows are dropped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. `on_bad_lines='warn'` reproduces what
# `error_bad_lines=False` did (drop the row, emit a warning).
labels_df = pd.read_csv(labels_dir, on_bad_lines='warn', sep='\t', header='infer', lineterminator='\n')
print(labels_df)

In [None]:
# Labels as a plain Python list, taken from the "text" column of labels_df.
labels234 = list(labels_df.text)

In [None]:
# Review texts as a plain Python list, taken from the "text" column of df.
# This rebinds reviews234, replacing the list filtered from `data` earlier.
reviews234 = list(df.text)

In [None]:
# Display the first review text.
reviews234[0]

In [None]:
# Display the first label.
labels234[0]

In [None]:
# Decode the first sequence in sample_outputs, dropping special tokens and
# surrounding whitespace.
tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()

# Create data for N number of samples

In [None]:
# Accumulates the (generated_text, label) tuples appended by the generation loop.
new_data = []

In [None]:
# Number of (generated text, label) pairs to create. Capped at the amount of
# data actually loaded so a shorter CSV cannot raise IndexError mid-run.
num_samples = min(28000, len(reviews234), len(labels234))

# Device placement does not change between iterations, so do it once up front.
model = model.to(device)

for text, review in tqdm(zip(reviews234[:num_samples], labels234[:num_samples]), total=num_samples):
  # Prompt = start token + the first 15 whitespace-separated words of the review.
  prompt = "<|startoftext|>" + " " + ' '.join(text.split()[:15])
  # Encode the prompt and add a leading batch dimension.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)

  # Sample a single continuation with top-k / nucleus sampling.
  sample_outputs = model.generate(
                                  generated,
                                  do_sample=True,
                                  top_k=50,
                                  max_length = 300,
                                  top_p=0.95,
                                  num_return_sequences=1
                                  )

  gpt_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()
  # Pair the generated text with the label stored at the same position.
  new_data.append((gpt_output, review))


In [None]:
# Display one generated pair next to one entry of `data`.
# NOTE(review): new_data is built from reviews234/labels234 (the CSV files),
# while `data` comes from the JSONL passed to load_dataset -- index 10 may not
# refer to the same review in both; confirm this comparison is intended.
new_data[10], data[10]

In [None]:
# Turn the (generated_text, label) pairs into a dict and encode it as a JSON string.
# NOTE(review): dict() keeps only the last label for any generated text that
# occurs more than once, so the result can hold fewer entries than new_data.
rs = json.dumps(dict(new_data))

In [None]:
# `rs` is already a JSON-encoded string (produced by json.dumps), so write it
# verbatim. Passing it through json.dump again wrapped the whole document in
# one quoted, escaped JSON string that readers had to decode twice.
# A single json.load() on this file now returns the dict directly.
with open(os.path.join(root_folder, 'new_data_234_28000_samples.json'), 'w') as f:
  f.write(rs)

# Install HuggingFace dependencies

In [None]:
!pip install transformers
!pip install tqdm

# Mount Drive and Imports

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# NOTE(review): root_folder is set to "." further down, so no path in the
# visible cells points into the mount -- confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import sys, os, json

In [None]:
# Base directory that the other paths in this notebook are joined onto.
root_folder = "."
# Directory the GPT-2 config, weights and tokenizer are loaded from.
# (The joined value is "././model_save_gpt_234/"; the extra "./" is harmless.)
output_dir = os.path.join(root_folder, "./model_save_gpt_234/")
# JSONL file that is handed to load_dataset.
data_path = os.path.join(root_folder, "yelp_review_training_dataset.jsonl")

In [None]:
# Put root_folder on the import search path
# (presumably so the local data_parsing module imported later can be found).
sys.path.append(root_folder)

# Load Finetuned GPT-2 Model

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm import tqdm
import pandas as pd

In [None]:
# Restore the configuration, language-model weights and tokenizer from output_dir.
# NOTE(review): `config` is not referenced again in the visible cells.
config = GPT2Config.from_pretrained(output_dir)
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [None]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# A single .to() call places the model on the chosen device.
model.to(device)
print(device)

# Do a sample review generation

In [None]:
# Put the model in evaluation mode before sampling.
model.eval()

prompt = "<|startoftext|> This place was"

# Encode the prompt and wrap it in a batch of size one on the target device.
generated = torch.tensor([tokenizer.encode(prompt)]).to(device)
model = model.to(device)

print(generated)

# Sampling settings: top-k / nucleus sampling, three returned sequences,
# max_length of 300.
generation_kwargs = dict(
    do_sample=True,
    top_k=50,
    max_length=300,
    top_p=0.95,
    num_return_sequences=3,
)
sample_outputs = model.generate(generated, **generation_kwargs)

# Print every sampled sequence as text, numbered from 0.
for idx, sequence in enumerate(sample_outputs):
  print(f"{idx}: {tokenizer.decode(sequence, skip_special_tokens=True)}\n\n")

In [None]:
from data_parsing import load_dataset

In [None]:
# Parse the file at data_path with the project's load_dataset helper.
# NOTE(review): the return format is not visible here; a later cell indexes
# each entry with [1] -- confirm the entry layout against data_parsing.
data = load_dataset(data_path)

In [None]:
# Keep only the entries whose field at index 1 (presumably the star rating)
# is 2, 3 or 4.
reviews234 = [entry for entry in data if entry[1] in (2, 3, 4)]

In [None]:
# Display how many entries reviews234 currently holds.
len(reviews234)

In [None]:
# Path of the validation-review CSV (despite the name, a file, not a directory).
data_dir = os.path.join(root_folder, "yelp_review_val_dataset_234.csv")

In [None]:
# Read the tab-separated review file; malformed rows are dropped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. `on_bad_lines='warn'` reproduces what
# `error_bad_lines=False` did (drop the row, emit a warning).
df = pd.read_csv(data_dir, on_bad_lines='warn', sep='\t', header='infer', lineterminator='\n')
print(df)

In [None]:
# Path of the CSV holding the labels (despite the name, a file, not a directory).
labels_dir = os.path.join(root_folder, "yelp_review_val_dataset_234_labels.csv")

In [None]:
# Read the tab-separated label file; malformed rows are dropped with a warning.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0, where it
# raises TypeError. `on_bad_lines='warn'` reproduces what
# `error_bad_lines=False` did (drop the row, emit a warning).
labels_df = pd.read_csv(labels_dir, on_bad_lines='warn', sep='\t', header='infer', lineterminator='\n')
print(labels_df)

In [None]:
# Labels as a plain Python list, taken from the "text" column of labels_df.
labels234 = list(labels_df.text)

In [None]:
# Review texts as a plain Python list, taken from the "text" column of df.
# This rebinds reviews234, replacing the list filtered from `data` earlier.
reviews234 = list(df.text)

In [None]:
# Display the first review text.
reviews234[0]

In [None]:
# Display the first label.
labels234[0]

In [None]:
# Decode the first sequence in sample_outputs, dropping special tokens and
# surrounding whitespace.
tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()

# Create data for N number of samples

In [None]:
# Accumulates the (generated_text, label) tuples appended by the generation loop.
new_data = []

In [None]:
# Number of (generated text, label) pairs to create. Capped at the amount of
# data actually loaded so a shorter CSV cannot raise IndexError mid-run.
num_samples = min(28000, len(reviews234), len(labels234))

# Device placement does not change between iterations, so do it once up front.
model = model.to(device)

for text, review in tqdm(zip(reviews234[:num_samples], labels234[:num_samples]), total=num_samples):
  # Prompt = start token + the first 15 whitespace-separated words of the review.
  prompt = "<|startoftext|>" + " " + ' '.join(text.split()[:15])
  # Encode the prompt and add a leading batch dimension.
  generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
  generated = generated.to(device)

  # Sample a single continuation with top-k / nucleus sampling.
  sample_outputs = model.generate(
                                  generated,
                                  do_sample=True,
                                  top_k=50,
                                  max_length = 300,
                                  top_p=0.95,
                                  num_return_sequences=1
                                  )

  gpt_output = tokenizer.decode(sample_outputs[0], skip_special_tokens=True).strip()
  # Pair the generated text with the label stored at the same position.
  new_data.append((gpt_output, review))


In [None]:
# Display one generated pair next to one entry of `data`.
# NOTE(review): new_data is built from reviews234/labels234 (the CSV files),
# while `data` comes from the JSONL passed to load_dataset -- index 10 may not
# refer to the same review in both; confirm this comparison is intended.
new_data[10], data[10]

In [None]:
# Turn the (generated_text, label) pairs into a dict and encode it as a JSON string.
# NOTE(review): dict() keeps only the last label for any generated text that
# occurs more than once, so the result can hold fewer entries than new_data.
rs = json.dumps(dict(new_data))

In [None]:
# `rs` is already a JSON-encoded string (produced by json.dumps), so write it
# verbatim. Passing it through json.dump again wrapped the whole document in
# one quoted, escaped JSON string that readers had to decode twice.
# A single json.load() on this file now returns the dict directly.
with open(os.path.join(root_folder, 'new_data_234_28000_samples.json'), 'w') as f:
  f.write(rs)