<a href="https://colab.research.google.com/github/RK900/cs182_project/blob/main/nlp_discriminator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab Stuff

# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Project directory on the mounted drive.
# NOTE(review): root_folder is reassigned to "." at the end of the import cell below — confirm which value is intended.
root_folder = "/content/drive/My Drive/cs182_project/"

import sys

# Make modules stored in the project folder importable.
sys.path.append(root_folder)

!pip install transformers

# Code

In [None]:
import os
import time
import datetime
# from google.colab import drive

import pandas as pd
import seaborn as sns
import numpy as np
import random
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
torch.manual_seed(42)

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup
from data_parsing import load_dataset, load_gen_dataset
from training_utils import run_training_loop
from model import DiscriminatorModel

root_folder = "."

## Load Data and Models

In [None]:
# Location of the Yelp review training file (JSON Lines) inside the project folder.
data_path = os.path.join(root_folder, "yelp_review_training_dataset.jsonl")
# Samples loaded with load_gen_dataset; they are labelled 0 ("fake") when the train/test sets are built.
# The additional files in the trailing comment are currently disabled.
aug_data = load_gen_dataset(os.path.join(root_folder, "new_data_234_28000_samples.json")) #+ load_gen_dataset(os.path.join(root_folder, "new_data2.json")) + load_gen_dataset(os.path.join(root_folder, "new_data3.json"))

In [None]:
# Load the review records; each record is indexed below as record[0] and record[1].
# NOTE(review): presumably (text, star rating) — confirm against data_parsing.load_dataset.
data = load_dataset(data_path)

In [None]:
# Keep only the reviews whose second field (presumably the star rating) is 2, 3 or 4.
# The previous condition compared the list literal [1] with 4, which is always
# False, so only rating 2 was actually retained.
reviews234 = [review for review in data if review[1] in (2, 3, 4)]

In [None]:
# Compare the number of loaded generated samples with the number of retained real reviews.
len(aug_data), len(reviews234)

In [None]:
# Directory from which the GPT-2 config and model are loaded in the following cells.
output_dir = os.path.join(root_folder, "./model_save_gpt_234/")

In [None]:
# Load the GPT-2 configuration stored in output_dir.
# NOTE(review): `config` is not referenced again in the visible part of the notebook.
config = GPT2Config.from_pretrained(output_dir)

In [None]:
# Load the GPT-2 language model stored in output_dir.
model = GPT2LMHeadModel.from_pretrained(output_dir)
# NOTE(review): the tokenizer comes from the stock 'gpt2' checkpoint, not from output_dir.
# If the saved model was trained with extra special tokens (the prompt below uses
# "<|startoftext|>"), the vocabularies may not match — confirm.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Prefer the GPU when one is available; otherwise fall back to the CPU so the
# notebook can still run (slowly) on a machine without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Generate Sample Predictions

# Put the generator in inference mode and place it on the selected device.
model.eval()
model = model.to(device)

prompt = "<|startoftext|> The place was alright"

# Encode the prompt and add a leading batch dimension before moving it to the device.
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(device)

print(generated)

# Sample three continuations with top-k / nucleus sampling, capped at 300 tokens.
sample_outputs = model.generate(
    generated,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_length=300,
    num_return_sequences=3,
)

for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}\n\n")

## Build train and test sets (label 0 = generated/fake review, label 1 = real review)

In [None]:
# Peek at the first two real reviews to check the record layout.
reviews234[0:2]

In [None]:
# Peek at the first two generated samples to check they share the same layout.
aug_data[0:2]

In [None]:
# Inputs: first field of every real review followed by the first field of every generated sample.
X = [review[0] for review in reviews234] + [sample[0] for sample in aug_data]
# Labels in the same order: 1 for real reviews, 0 for generated ones.
y = [1] * len(reviews234) + [0] * len(aug_data)

In [None]:
# Sanity check: inputs and labels must have the same length.
len(X), len(y)

In [None]:
# Use 80% of the combined examples for training and the rest for testing; random_state fixes the shuffle.
# NOTE(review): no `stratify` argument is passed, so the real/fake ratio may differ between the two splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=50)

## Build Transformer Discriminator

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
# Pretrained 'bert-base-uncased' with a two-label classification head (num_labels=2).
clf = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# Tokenizer matching the same checkpoint.
tok = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
from transformers import AdamW
# Optimise the BERT classifier. At this point `model` still refers to the GPT-2
# generator, so building the optimizer from model.parameters() would leave the
# discriminator's weights untouched during training.
# NOTE(review): if DiscriminatorModel adds layers of its own, build the optimizer
# from the wrapper's parameters instead — confirm against model.py.
optimizer = AdamW(clf.parameters(), lr=1e-5)

In [None]:
# Parameter-name fragments that are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']
# Group the BERT classifier's parameters. At this point `model` still refers to
# the GPT-2 generator, so model.named_parameters() would hand the optimizer the
# wrong network.
# NOTE(review): if DiscriminatorModel adds layers of its own, group the wrapper's
# parameters instead — confirm against model.py.
optimizer_grouped_parameters = [
    # Everything except the no_decay names gets weight decay 0.01.
    {'params': [p for n, p in clf.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    # Parameters whose name contains a no_decay fragment are not decayed.
    {'params': [p for n, p in clf.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)

In [None]:
# Tokenize the training texts, padding the batch and truncating at 64 tokens.
train_X = tok(
    X_train,
    max_length=64,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
# Convert the token ids and attention mask to NumPy arrays.
train_input_ids, train_attention_mask = (
    train_X[field].detach().numpy() for field in ('input_ids', 'attention_mask')
)

In [None]:
# Training labels as a NumPy array, in the same order as X_train.
train_y = np.array(y_train)

In [None]:
# Tokenize at most the first 10000 test texts, padding the batch and truncating at 64 tokens.
test_X = tok(
    X_test[:10000],
    max_length=64,
    truncation=True,
    padding=True,
    return_tensors='pt',
)
# Convert the token ids and attention mask to NumPy arrays.
test_input_ids, test_attention_mask = (
    test_X[field].detach().numpy() for field in ('input_ids', 'attention_mask')
)

In [None]:
# Test labels as a NumPy array. The test inputs are tokenized from X_test[:10000],
# so the labels are truncated the same way to keep inputs and labels aligned.
test_y = np.array(y_test[:10000])

In [None]:
# Shape of the training attention mask; it should match the shape of train_input_ids.
train_attention_mask.shape

In [None]:
# Shape of the training token-id array.
train_input_ids.shape

In [None]:
# Identifiers for this run.
experiment_id = "discrim-1"
tag = "model-discrim-1"

In [None]:
# Create the output directory for this experiment. exist_ok=True keeps the cell
# re-runnable: without it, a second run raises FileExistsError.
os.makedirs('completed-experiments/%s/' % experiment_id, exist_ok=True)

In [None]:
# Training settings and an (initially empty) list for losses.
# NOTE(review): the run_training_loop call below passes the literals 16 and 3
# rather than these variables — confirm which values are intended.
epochs, batch_size = 5, 8
losses = []

In [None]:
# Wrap the BERT classifier in the project's DiscriminatorModel.
# NOTE(review): the second argument (2) is presumably the number of classes — confirm against model.py.
# This rebinds `model`, which until now referred to the GPT-2 generator.
model = DiscriminatorModel(clf, 2)

In [None]:
# Move the BERT classifier to the selected device.
# NOTE(review): only `clf` is moved here, not the DiscriminatorModel wrapper — confirm the wrapper has no parameters of its own.
clf.to(device)

In [None]:
# Train the discriminator on the tokenized training set and evaluate on the test set.
# NOTE(review): 16 and 3 are passed positionally (presumably batch size and epoch
# count); the `batch_size` and `epochs` variables defined earlier are not used — confirm.
run_training_loop(
    model,
    optimizer,
    device,
    16,
    3,
    train_input_ids,
    train_attention_mask,
    train_y,
    test_input_ids,
    test_attention_mask,
    test_y,
    model_id=experiment_id,
    tag=tag,
)

In [None]:
# Path of the checkpoint file for this experiment (despite the name, this is a file path, not a directory).
# NOTE(review): presumably written by run_training_loop — confirm against training_utils.
model_dir = 'completed-experiments/%s/%s.pt' % (experiment_id, tag)

In [None]:
# Build a new wrapper to receive the saved weights.
# NOTE(review): it wraps the same `clf` instance used for training, not a freshly loaded BERT.
model = DiscriminatorModel(clf, 2)

In [None]:
# Restore the trained weights. map_location remaps the stored tensors onto the
# active device, so a checkpoint saved on a GPU can also be loaded on a
# CPU-only machine.
model.load_state_dict(torch.load(model_dir, map_location=device))

In [None]:
# Inspect the class of the network exposed by the wrapper as `model.transformer`.
type(model.transformer)

In [None]:
# First field of every record in each corpus, used for membership checks on individual samples.
searchable_real = [review[0] for review in reviews234]
searchable_fake = [sample[0] for sample in aug_data]

In [None]:
# Pick one held-out example; the slice keeps it as a one-element list.
n = 3
test_sample = X_test[n:n+1]

In [None]:
# Tokenize the single sample with the same padding/truncation settings used for the training set.
sample_test = tok(test_sample, return_tensors='pt', padding=True, truncation=True, max_length=64)

In [None]:
# Pull the token ids and attention mask tensors out of the tokenizer output.
sample_test_input = sample_test['input_ids']
sample_test_mask = sample_test['attention_mask']

In [None]:
# Run the discriminator on the sample. The tokenizer returns CPU tensors while
# the classifier was moved to `device`, so both the model and the inputs are
# placed on `device` first. Eval mode and no_grad give an inference-only pass.
model.to(device)
model.eval()
with torch.no_grad():
    sample_prediction = model(sample_test_input.to(device), sample_test_mask.to(device))
sample_prediction

In [None]:
# Check whether the sampled text is one of the real reviews. test_sample is a
# one-element list, so its single string is what must be looked up; testing the
# list itself against a list of strings is always False.
test_sample[0] in searchable_real

In [None]:
# Show the sampled text (printed as a one-element list).
print(test_sample)