# Global Setup

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install segtok
!pip install vaderSentiment
!pip install nltk
!pip install huggingface_hub

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 8.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 46.5MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 47.5MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting sentencepiece
[?25l  Downloading https://files.p

In [None]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer

import tqdm

from multiprocessing import Pool

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Mount Google Drive at /content/drive (Colab only); the project paths
# configured below all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Project root on the mounted Drive; the dataset folder and the checkpoint
# file are both derived from it.
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp")
DATA_FOLDER = ROOT_FOLDER / "dataset"
# Checkpoint file that the training cells write to and the checkpoint cells read from.
TORCH_CHECKPOINT_MODEL = ROOT_FOLDER / "models" / "training_checkpoint_oscar_tallercnn_vadar_5-class_5-11-2021.pt"

# Manual guard: execution pauses here until the user confirms the checkpoint
# path, so a "Run all" cannot silently overwrite somebody else's file.
input("Please check to make sure the above checkpoint directory is yours (Hit any key)")

In [None]:
# sys.path entries must be plain strings -- the import system ignores any
# other type -- so the pathlib.Path is converted before being added.
# The membership test keeps re-running this cell from adding duplicates.
if str(ROOT_FOLDER) not in sys.path:
  sys.path.append(str(ROOT_FOLDER))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
def list_to_device(th_obj):
  """Return a list holding every tensor of `th_obj`, each moved to the global `device`."""
  moved = []
  for tensor in th_obj:
    moved.append(tensor.to(device))
  return moved

# Model Params

In [None]:
# Number of token ids kept per review: used as the tokenizer's max_length and
# as the padded length of every encoded review.
MAX_LEN = 128
# Number of per-sentence VADER compound scores kept (padded) per review.
MAX_LEN_VADER = 40
# Reviews per training / evaluation batch.
BATCH_SIZE = 32
# Upper bound of the epoch loop in the training cell.
EPOCHS = 3
# Gates the VADER score cache and is the default for LanguageModel's use_vader.
USE_VADER = True

# Higher bound settings: MAX_LEN = 256 and BATCH_SIZE = 16

# Data Preprocessing Functions

## load data

In [None]:
def load_json(file_path, filter_function=lambda x: True):
  """
  Read a JSON-lines file (one JSON object per line) into a DataFrame.

  file_path - full path of the file to read from
  filter_function - a data selection function, returns True to ADD a data point

  Returns a pandas DataFrame with one row per kept record, or None (after
  printing a message) when the file cannot be opened or read.
  """
  result = []

  try:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
      for line in f:
        line = line.strip()
        if not line:
          # Blank lines (e.g. a trailing newline at end of file) carry no
          # record and would make json.loads raise.
          continue
        json_line = json.loads(line)
        if filter_function(json_line):
          result.append(json_line) # each line is one data point dictionary
  except IOError:
    print(f"cannot open {file_path}")
    return None

  return pd.DataFrame.from_records(result)

## data formatting

### tokenize

In [None]:
def tokenize(data):
  """
  Collect the set of distinct lower-cased NLTK word tokens found in `data`.

  data - an iterable of sentences
  """
  vocabulary = set()
  for count, text in enumerate(data):
    # Progress trace: print every 1000th position, with a line break on
    # multiples of 15000 and a comma separator otherwise.
    if count % 1000 == 0:
      separator = "\n" if count % 15000 == 0 else ", "
      print(count, end=separator)
    vocabulary.update(nltk.word_tokenize(text.lower()))
  return vocabulary

In [None]:
def tokenize_review(tokenizer, review_text):
  """
  Encode one review into a list of token ids.

  tokenizer - object exposing `encode_plus` (a Hugging Face tokenizer in this notebook)
  review_text - the raw review string

  Special tokens are added, the sequence is truncated to MAX_LEN ids and no
  padding is applied here. Returns the "input_ids" entry of the encoding, or
  an empty list when the encoding has no such entry.
  """
  # `padding=False` replaces the deprecated `pad_to_max_length=False`;
  # both mean "do not pad".
  encodings = tokenizer.encode_plus(review_text, add_special_tokens=True,
                                    max_length=MAX_LEN,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    truncation=True,
                                    padding=False)
  return encodings.get("input_ids", [])

### padding

In [None]:
def pad_sequence(numerized, pad_index, to_length, beginning=True):
    """
    Truncate or pad `numerized` to `to_length` items.

    numerized - sequence of values (token ids or scores)
    pad_index - filler value used for the padding positions
    to_length - length of the returned list
    beginning - True puts the filler in front of the data, False appends it

    Returns (padded, mask): `padded` is the resulting list and `mask` is a
    list of booleans of the same length, True where the item came from
    `numerized` and False where it is filler.
    """
    kept = list(numerized[:to_length])
    filler_count = to_length - len(kept)
    filler = [pad_index] * filler_count
    # The mask is built from positions rather than by comparing values with
    # pad_index, so a real item that happens to equal the filler (for example
    # token id 0 or a 0.0 score) is still marked as data.
    if beginning:
        padded = filler + kept
        mask = [False] * filler_count + [True] * len(kept)
    else:
        padded = kept + filler
        mask = [True] * len(kept) + [False] * filler_count
    return padded, mask

### batching

In [None]:
def batch_to_torch_long(*batches):
  """
  Convert every batch to a torch.LongTensor.

  A single batch is returned as a bare tensor; any other number of batches
  is returned as a list of tensors in argument order.
  """
  tensors = list(map(torch.LongTensor, batches))
  return tensors[0] if len(tensors) == 1 else tensors

def batch_to_torch_float(*batches):
  """
  Convert every batch to a torch.FloatTensor.

  A single batch is returned as a bare tensor; any other number of batches
  is returned as a list of tensors in argument order.
  """
  tensors = list(map(torch.FloatTensor, batches))
  return tensors[0] if len(tensors) == 1 else tensors

### full data format

In [None]:
analyzer = SentimentIntensityAnalyzer()

def format_reviews(tokenizer, datatable, indices=None, task_bar=False, review_sentiment_dict=None):
  """
  Turn rows of `datatable` into the tensors consumed by the model.

  tokenizer - tokenizer handed to tokenize_review
  datatable - DataFrame with "review_id", "text" and "stars" columns
  indices - optional positional row indices (applied with .iloc); None keeps every row
  task_bar - True shows a notebook progress bar while iterating
  review_sentiment_dict - optional {review_id: padded VADER scores} cache.
      None: the scores are computed here for every review.
      Otherwise: only ids present in the cache contribute a sentiment row;
      ids missing from it are skipped, so the sentiment tensor can have
      fewer rows than the token tensor (it is empty for an empty cache).

  Returns (token ids as LongTensor, stars as LongTensor,
           sentiment scores as FloatTensor, token mask as FloatTensor).
  """
  encoded_reviews = []
  encoded_reviews_mask = []
  review_sentiment = []
  reviews_to_process = datatable[["review_id", "text", "stars"]]
  if indices is not None:
    reviews_to_process = reviews_to_process.iloc[indices]

  review_iterator = reviews_to_process.iterrows()
  if task_bar:
    # Import the submodule explicitly: a bare `import tqdm` is not guaranteed
    # to load `tqdm.notebook`, so `tqdm.notebook.tqdm` would only resolve if
    # some other package had imported it first.
    from tqdm.notebook import tqdm as notebook_tqdm
    review_iterator = notebook_tqdm(review_iterator, total=reviews_to_process.shape[0])

  for _, review in review_iterator:
    # Token ids, left-padded to MAX_LEN.
    # NOTE(review): 0 is used as the filler id -- confirm it is the intended
    # padding id for the tokenizer in use.
    review_text = review["text"]
    numerized = tokenize_review(tokenizer, review_text)
    padded, mask = pad_sequence(numerized, 0, MAX_LEN)
    encoded_reviews.append(padded)
    encoded_reviews_mask.append(mask)

    # VADER: one compound score per sentence, left-padded to MAX_LEN_VADER.
    if review_sentiment_dict is None:
      sentence_scores = [analyzer.polarity_scores(sentence)["compound"]
                         for sentence in nltk.tokenize.sent_tokenize(review_text)]
      padded_scores, _ = pad_sequence(sentence_scores, 0, MAX_LEN_VADER)
      review_sentiment.append(padded_scores)
    elif review["review_id"] in review_sentiment_dict:
      review_sentiment.append(review_sentiment_dict[review["review_id"]])

  torch_encoded_reviews, torch_encoded_reviews_target = \
                    batch_to_torch_long(encoded_reviews, reviews_to_process["stars"].values)
  torch_encoded_reviews_mask, torch_review_sentiment = batch_to_torch_float(encoded_reviews_mask, review_sentiment)
  return torch_encoded_reviews, torch_encoded_reviews_target, torch_review_sentiment, torch_encoded_reviews_mask

### split dataset

In [None]:
# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=0):
    """
    Shuffle the rows of `df` and cut them into train / validation / test frames.

    df - DataFrame to split
    train_percent - fraction of rows for the training frame
    validate_percent - fraction of rows for the validation frame
    seed - value passed to np.random.seed; note that this reseeds NumPy's
           global random state as a side effect

    Returns (train, validate, test); the test frame receives every row left
    over after the first two cuts.
    """
    np.random.seed(seed)
    m = len(df.index)
    # Permute row POSITIONS rather than index labels: the slices below are
    # applied with .iloc, which is positional, so permuting labels only works
    # when the index happens to be 0..m-1.
    perm = np.random.permutation(m)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]

    # Every row must land in exactly one of the three frames.
    assert len(train) + len(validate) + len(test) == m

    return train, validate, test

# Data Preprocessing Code

## load data

In [None]:
# load yelp data
# load_json returns None when the file cannot be opened, in which case the
# print below fails on `.index`.
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", len(yelp_reviews.index), "data points")

In [None]:
# Rich preview of the loaded reviews table.
display(yelp_reviews)

## format + split data into train, val, and test sets

In [None]:
# Fetch the 'xlnet-base-cased' tokenizer through torch.hub from the
# huggingface/pytorch-transformers repository.
xlnet_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'xlnet-base-cased')
# Quick manual check:
# tokenize_review(xlnet_tokenizer, "I love this grub!")

In [None]:
# train 50% | validation 40% | test 10%
train_ratio = .50
validate_ratio = .40
test_ratio = .10
# Compare with a tolerance: sums of binary floating point fractions are not
# always exactly 1 (for example .7 + .2 + .1 evaluates to 0.9999999999999999).
assert abs(train_ratio + validate_ratio + test_ratio - 1) < 1e-9

In [None]:
# Split the raw DataFrame; the test frame receives whatever remains after the
# train and validation fractions (test_ratio itself is not passed in).
train_reviews, validate_reviews, test_reviews = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)
# train_reviews_df, val_reviews_df, test_reviews_df = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)

In [None]:
# train_reviews, train_reviews_target, train_reviews_mask = format_reviews(xlnet_tokenizer, train_reviews_df)
# validate_reviews, test_reviews_target, validate_reviews_mask = format_reviews(xlnet_tokenizer, validate_reviews_df)
# test_reviews, test_reviews_target, _ = format_reviews(xlnet_tokenizer, test_reviews_df)

In [None]:
# Row count of the training split, followed by a rich preview of the frame.
print(len(train_reviews.index), "yelp reviews for training")
train_reviews

In [None]:
# Row count of the validation split, followed by a rich preview of the frame.
print(len(validate_reviews.index), "yelp reviews for validation")
validate_reviews

In [None]:
# Row count of the test split, followed by a rich preview of the frame.
print(len(test_reviews.index), "yelp reviews for testing")
test_reviews

In [None]:
# review_id -> padded list of per-sentence VADER compound scores.
# Filled by the following cell when USE_VADER is set; otherwise it stays empty.
review_sentiment_dict = {}

In [None]:
# Precompute the VADER scores of every review once, keyed by review_id, so
# that batches built later can look them up instead of re-running the analyzer.

if USE_VADER:
  review_iterator = tqdm.notebook.tqdm(yelp_reviews.iterrows(), total=yelp_reviews.shape[0])

  for i, review in review_iterator:
    review_text = review["text"]
    # VADER: one compound score per sentence of the review
    sentence_list = nltk.tokenize.sent_tokenize(review_text)
    review_sentiment_sentence = []
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        review_sentiment_sentence.append(vs["compound"])
    # Left-pad (or truncate) to MAX_LEN_VADER scores with 0 as filler.
    padded, _ = pad_sequence(review_sentiment_sentence, 0, MAX_LEN_VADER)
    review_sentiment_dict[review["review_id"]] = padded
    # Debug trace: show the first few cached entries only.
    if len(review_sentiment_dict) < 20:
      print(len(review_sentiment_dict), review_sentiment_dict[review["review_id"]])


# Model

## model construction

In [None]:
class LanguageModel(nn.Module):
    """
    Review-rating classifier with two input branches.

    Branch 1: token ids -> XLNet hidden states -> Conv2d spanning the whole
    embedding width -> max-pool over the token axis.
    Branch 2 (optional): per-sentence VADER scores -> LSTM with hidden size 1.
    The branch outputs are concatenated and fed through dropout, a dense
    ReLU layer and a final linear layer producing 5 logits.
    """

    def __init__(self, vocab_size, rnn_size, vader_size, num_layers=1, dropout=0, use_vader=USE_VADER):
        """
        vocab_size - number of token positions per review (the padded sequence
                     length, despite the name); sizes the max-pool window
        rnn_size - accepted for signature compatibility; not used
        vader_size - number of VADER scores per review (ignored when use_vader is false)
        num_layers - number of LSTM layers in the VADER branch
        dropout - dropout probability for the LSTM and for the layer before the dense head
        use_vader - whether to build the VADER/LSTM branch
        """
        super().__init__()

        #################
        #    INPUT 1    #
        #################
        # Pretrained XLNet fetched through torch.hub.
        self.xlnet = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
        # Freeze everything under `xlnet.layer`; parameters of the XLNet model
        # that live outside of `layer` are not touched by this loop.
        for param in self.xlnet.layer.parameters():
            param.requires_grad = False

        # The convolution sees XLNet's output as a one-channel image of shape
        # (token positions, embedding width).
        # NOTE(review): 768 is assumed to be the hidden size of
        # xlnet-base-cased (https://huggingface.co/transformers/pretrained_models.html).
        embedding_len = 768
        conv2d_c_in = 1
        conv2d_c_out = 100
        conv2d_kernel_H = embedding_len  # along Embedding Length
        conv2d_kernel_W = 5              # along Word Length

        # Each of the conv2d_c_out filters covers 5 consecutive tokens over the full embedding width.
        self.conv2D_layer = nn.Conv2d(conv2d_c_in, conv2d_c_out, (conv2d_kernel_W, conv2d_kernel_H))

        # Token-axis length after the convolution (stride 1, no padding);
        # e.g. 124 for vocab_size = 128.
        conv2d_out_Wout = 1 + (vocab_size - conv2d_kernel_W)

        # Pool over the complete token axis so each filter yields one value.
        self.max_pool_2d = nn.MaxPool2d((conv2d_out_Wout, 1))
        #################
        #  INPUT 1 END  #
        #################

        #################
        #    INPUT 2    #
        #################
        self.lstm = None
        if use_vader:
            self.lstm = nn.LSTM(input_size=1, hidden_size=1, num_layers=num_layers, batch_first=True, dropout=dropout)
        else:
            # Without the VADER branch nothing is concatenated to the conv features.
            vader_size = 0
        #################
        #  INPUT 2 END  #
        #################

        self.dropout = nn.Dropout(dropout)

        hidden_layer_dense = 100

        # Input width: one value per conv filter plus one LSTM output per VADER position.
        self.dense = nn.Sequential(
                nn.Linear(conv2d_c_out + vader_size, hidden_layer_dense),
                nn.ReLU()
            )
        self.output = nn.Linear(hidden_layer_dense, 5) # classify yelp_reviews into 5 ratings

    def forward_input_vectorized(self, x):
        """Run token ids `x` through XLNet, the convolution and the max-pool."""
        xlnet_out = self.xlnet(x)
        xlnet_out_hidden = xlnet_out.last_hidden_state
        batches_len, word_len, embedding_len = xlnet_out_hidden.shape
        # Insert the single input channel expected by Conv2d.
        xlnet_out_hidden = xlnet_out_hidden.reshape(batches_len, 1, word_len, embedding_len)
        conv2d_out = self.conv2D_layer(xlnet_out_hidden)
        result = self.max_pool_2d(conv2d_out)
        # Drop the two trailing dimensions when they have size 1, leaving (batch, filters).
        result = result.squeeze(2).squeeze(2)
        return result

    def forward_input_vader(self, x):
        """Run the (batch, vader_len) score matrix `x` through the LSTM, one score per time step."""
        batch_size, vader_len = x.shape
        output, _ = self.lstm(x.reshape(batch_size, vader_len, 1))
        # hidden_size is 1, so the last dimension can be squeezed away: (batch, vader_len).
        output = output.squeeze(2)
        return output

    def forward(self, vectorized_words, vader):
        """
        vectorized_words - token ids handed to XLNet
        vader - VADER scores; ignored when the model was built without the LSTM branch

        Returns the 5 rating logits per review.
        """
        features = [self.forward_input_vectorized(vectorized_words)]

        # Explicit None test instead of relying on the truthiness of an nn.Module.
        if self.lstm is not None:
            features.append(self.forward_input_vader(vader))

        combined_input = torch.cat(features, dim=1)

        dropped = self.dropout(combined_input)
        hidden = self.dense(dropped)
        logits = self.output(hidden)
        return logits

    def loss_fn(self, prediction, target):
        """Mean cross-entropy between the logits and `target`, where targets are 1-based and shifted to 0-based classes."""
        loss_criterion = nn.CrossEntropyLoss(reduction='none')
        return torch.mean(loss_criterion(prediction, target - 1))

In [None]:
# vocab_size receives the padded sequence length (MAX_LEN); rnn_size is
# accepted by the constructor but not used by it.
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256, vader_size=MAX_LEN_VADER)

## train the model

In [None]:
# num_of_validaion_set = 20 #len(validate_reviews)

# batch_val = format_reviews(xlnet_tokenizer, validate_reviews, range(num_of_validaion_set), review_sentiment_dict=review_sentiment_dict) # This cell may take a while

# (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val

In [None]:
def run_validation(model, use_all=False, mode="val"):
  """
  Evaluate `model` on the validation or test split and print loss and accuracy.

  model - the model to evaluate; must provide `loss_fn`
  use_all - True evaluates the whole split, False a random sample of at most 1000 reviews
  mode - "val" for validate_reviews, "test" for test_reviews

  The model is put into eval mode and gradients are disabled while the
  batches run; the previous train/eval mode is restored afterwards.
  Raises AssertionError for any other `mode`. Returns nothing.
  """
  if mode == "val":
    print("Running Validation")
    mode = "Validation"
    reviews_dataset = validate_reviews
  elif mode == "test":
    print("Running Testing")
    mode = "Test"
    reviews_dataset = test_reviews
  else:
    raise AssertionError("Invalid mode")

  dataset_size = len(reviews_dataset)
  # Never ask for more reviews than the split holds; otherwise the later
  # slices would be empty batches.
  num_of_review_set = dataset_size if use_all else min(1000, dataset_size)
  if num_of_review_set == 0:
    print(mode, "Evaluation skipped: no reviews available")
    return

  indices = np.random.permutation(dataset_size)
  num_batches = (num_of_review_set + BATCH_SIZE - 1) // BATCH_SIZE

  # Import the submodule explicitly: a bare `import tqdm` is not guaranteed
  # to make `tqdm.notebook` available.
  from tqdm.notebook import tqdm as notebook_tqdm
  t = notebook_tqdm(range(num_batches))

  # Totals are accumulated per review, not per batch, so a short final batch
  # does not get the same weight as a full one.
  loss_sum = 0.0
  correct_sum = 0.0
  seen = 0

  was_training = model.training
  model.eval()  # evaluation must not run with dropout active
  try:
    with torch.no_grad():  # no autograd graph is needed for evaluation
      for i in t:
        val_start_i = i * BATCH_SIZE
        val_end_i = min((i + 1) * BATCH_SIZE, num_of_review_set)
        batch_val = format_reviews(xlnet_tokenizer, reviews_dataset, indices[val_start_i:val_end_i], review_sentiment_dict=review_sentiment_dict)
        (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val
        (batch_input_val, batch_target_val, batch_review_sentiment_val) = list_to_device((batch_input_val, batch_target_val, batch_review_sentiment_val))

        prediction_val = model(batch_input_val, batch_review_sentiment_val)
        batch_len = batch_target_val.shape[0]
        loss_sum += model.loss_fn(prediction_val, batch_target_val).item() * batch_len
        # Predicted class k corresponds to rating k + 1.
        correct_sum += torch.eq(prediction_val.argmax(dim=1, keepdim=False) + 1, batch_target_val).sum().item()
        seen += batch_len

        if i % round(8000 / BATCH_SIZE) == 0 and i != 0 and use_all:
          print(mode, "Prelim Evaluation set loss:", loss_sum / seen, mode, "Prelim Accuracy:", correct_sum / seen)
  finally:
    # Hand the model back in the mode the caller had it in.
    model.train(was_training)

  loss_val = loss_sum / seen
  accuracy_val = correct_sum / seen
  print(mode, "Evaluation set loss:", loss_val, mode, "Accuracy set %:", accuracy_val)

In [None]:
# Per-iteration training metrics; replaced by the stored lists when a
# checkpoint is restored.
losses = []
accuracies = []

# Resume position: first epoch to run and first batch index within it.
epoch_start = 0
t_start = 0

In [None]:
# ONLY RUN THIS CELL (and next cell) if want to load checkpoint
# If you accidentally run this cell, no harm done (be careful with next cell!!!)

checkpoint = None
try:
  # map_location lets a checkpoint that was written on a GPU machine load on
  # a CPU-only one (and the other way round).
  checkpoint = torch.load(str(TORCH_CHECKPOINT_MODEL), map_location=device)
  print("Checkpoint loaded")
except Exception as err:
  # Still best-effort, but report why loading failed instead of hiding it;
  # unlike a bare `except:`, this lets KeyboardInterrupt through.
  print("No Checkpoint loaded")
  print(f"  reason: {type(err).__name__}: {err}")

In [None]:
# Adam over all model parameters; the optimizer class is kept in its own
# variable so the checkpoint cell can rebuild the optimizer the same way.
lr = 1e-4
optimizer_method = optim.Adam
optimizer = optimizer_method(model.parameters(), lr=lr)

In [None]:
# ONLY RUN THIS CELL if want to load checkpoint

if checkpoint:
  epoch_start = checkpoint['epoch']
  t_start = checkpoint['t']
  model.load_state_dict(checkpoint['model_state_dict'])
  # Move the model BEFORE building the optimizer and restoring its state:
  # the restored optimizer state is placed on the device the parameters are
  # on at that moment, so moving the model afterwards would leave the two on
  # different devices.
  model.to(device)
  optimizer = optimizer_method(model.parameters(), lr=lr)
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  losses = checkpoint['losses']
  accuracies = checkpoint['accuracies']

  print("Checkpoint")
  run_validation(model)

  # Report the restored batch index (t_start), not a loop variable left over
  # from an earlier cell.
  print(f"Checkpoint Epoch: {epoch_start} Iteration: {t_start} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}")

In [None]:
# Set the model to training mode.
# Must run after the checkpoint cells, which may replace the model weights.
model.train()

In [None]:
# Training loop: shuffles the training split every epoch, trains batch by
# batch, and checkpoints + validates every 1000 iterations.
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

DATASET = train_reviews

# Constants of interest: BATCH_SIZE, EPOCHS

# Start time of the run (recorded here; not read anywhere in this cell).
since = time.time()

# Sanity check saving: write a throw-away checkpoint next to the real one
# (suffix "FAKE") before any training work is done.
torch.save({'epoch': 0,
            't': 0,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            'accuracies': accuracies
            }, str(TORCH_CHECKPOINT_MODEL) + "FAKE")

# start training
for epoch in range(epoch_start, EPOCHS):
  # A fresh random order is drawn per epoch; when resuming, the first
  # t_start batches of this new order are simply skipped.
  indices = np.random.permutation(DATASET.shape[0])

  # Number of batches per epoch, counting a trailing partial batch.
  dataset_batch_cap = ( DATASET.shape[0] // BATCH_SIZE ) + (1 if DATASET.shape[0] % BATCH_SIZE > 0 else 0)

  t = tqdm.notebook.tqdm(range(t_start, dataset_batch_cap), initial = t_start, total = dataset_batch_cap)

  for i in t:
    # batch: tokenize / pad the selected reviews and look up their cached VADER scores
    batch = format_reviews(xlnet_tokenizer, DATASET, indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE], review_sentiment_dict=review_sentiment_dict)
    (batch_input, batch_target, batch_review_sentiment, batch_target_mask) = batch
    # for item in (batch_input, batch_target, batch_review_sentiment, batch_target_mask):
    #   print(item.size())
    (batch_input, batch_target, batch_target_mask, batch_review_sentiment) = list_to_device((batch_input, batch_target, batch_target_mask, batch_review_sentiment))
    # Repeated on every iteration.
    model.to(device)

    # forward pass
    prediction = model.forward(batch_input, batch_review_sentiment)
    # print(prediction.size(), batch_target.size())
    loss = model.loss_fn(prediction, batch_target)
    # print(loss)
    losses.append(loss.item())
    # Predicted class k corresponds to rating k + 1.
    accuracy = torch.mean(torch.eq(prediction.argmax(dim=1,keepdim=False) + 1,batch_target).float())
    accuracies.append(accuracy.item())

    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Every 1000 iterations (except the iteration the run started from):
    # save a checkpoint, run validation and print the recent training metrics.
    if i % 1000 == 0 and i != t_start:
      torch.save({'epoch': epoch,
                  't': i,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'losses': losses,
                  'accuracies': accuracies
                  }, str(TORCH_CHECKPOINT_MODEL))
      run_validation(model)
      print(f"Epoch: {epoch} Iteration: {i} Train Loss: {np.mean(losses[-10:])} Train Accuracy: {np.mean(accuracies[-10:])}")

  # Only the first (possibly resumed) epoch starts part-way through.
  t_start = 0


In [None]:
# Save the latest model
# 'epoch' is stored as EPOCHS, so the epoch loop of the training cell has
# nothing left to run when this checkpoint is restored.
print("Saving latest model to", str(TORCH_CHECKPOINT_MODEL))
torch.save({'epoch': EPOCHS,
            't': (DATASET.shape[0] // BATCH_SIZE)+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            'accuracies': accuracies
            }, str(TORCH_CHECKPOINT_MODEL))

## evaluate model

In [None]:
# Set the model to evaluation mode.
model.eval()

In [None]:
# Evaluate on the complete validation split (use_all=True).
run_validation(model, True, "val")

In [None]:
# Evaluate on the complete test split (use_all=True).
run_validation(model, True, "test")

# Playground

In [None]:
# Cheap guard that stops a "Run all" from continuing into the Playground
# cells: submitting an empty answer raises, any non-empty answer lets
# execution continue.
hard_stop = input("Hard Stop here. Enter any key to allow passage.")

if len(hard_stop) == 0:
  raise Exception("Hard Stop")

In [None]:
# Show three tokens of the vocabulary built from the "text" column.
# NOTE(review): STARTER is not defined in the visible part of this notebook --
# confirm it is created elsewhere before running this cell.
print(list(tokenize(STARTER["text"]))[:3])

In [None]:
import urllib.request
import io
import sentencepiece as spm

# https://github.com/google/sentencepiece/tree/master/python

# Train a SentencePiece model on the review texts and keep it in memory.
# The buffer has its own name so that it does not overwrite `model`, which
# holds the trained LanguageModel used by the cells above.
spm_model_buffer = io.BytesIO()
# iter() hands the trainer a real iterator; it is a no-op when the argument
# already is one.
# NOTE(review): STARTER is not defined in the visible part of this notebook --
# confirm it is created elsewhere before running this cell.
spm.SentencePieceTrainer.train(
      sentence_iterator=iter(STARTER["text"]), model_writer=spm_model_buffer, vocab_size=1000)

# Serialize the model as file.
# with open('out.model', 'wb') as f:
#   f.write(spm_model_buffer.getvalue())

# Directly load the model from serialized model.
sp = spm.SentencePieceProcessor(model_proto=spm_model_buffer.getvalue())
print(sp.encode('this is test'))