# Global Setup

In [1]:
!pip install transformers
!pip install sentencepiece
!pip install segtok
!pip install vaderSentiment
!pip install nltk



In [2]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer
from keras.preprocessing.sequence import pad_sequences
import tqdm

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp (master)")
DATA_FOLDER = ROOT_FOLDER / "dataset"

In [5]:
sys.path.append(ROOT_FOLDER)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [7]:
list_to_device = lambda th_obj: [tensor.to(device) for tensor in th_obj]

# Model Params

In [8]:
MAX_LEN = 128
MAX_LEN_VADER = 40
BATCH_SIZE = 8
EPOCHS = 20

# Higher bound settings: MAX_LEN = 256 and BATCH_SIZE = 16

#Data Preprocessing Functions

## load data

In [9]:
def load_json(file_path, filter_function=lambda x: True):
  """
  file_path - full path of the file to read from
  filter_function - a data selection function, returns True to ADD a data point
  """
  result = []

  try:
    with open(file_path, "r") as f:
      for line in f:
        json_line = json.loads(line)
        if not filter_function(json_line):
          # Disallow via opposite of allow
          continue
        result.append(json_line) # each line is one data point dictionary
    return pd.DataFrame.from_records(result)
    # return result

  except IOError:
    print(f"cannot open {file_path}")
    return None

## data formatting

### tokenize

In [10]:
def tokenize(data):
  """
  data - an iterable of sentences
  """
  token_set = set()
  i = 0
  for sentences in data:
    if i % 1000 == 0:
      print(i, end=", " if i % 15000 != 0 else "\n")
    tokenized = nltk.word_tokenize(sentences.lower())
    for token in tokenized:
      token_set.add(token)
    i += 1
  return token_set

In [11]:
def tokenize_review(tokenizer, review_text):
  encodings = tokenizer.encode_plus(review_text, add_special_tokens=True,
                                    max_length=MAX_LEN,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    truncation=True,
                                    pad_to_max_length=False)
  return encodings.get("input_ids", [])

### padding

In [12]:
def pad_sequence(numerized, pad_index, to_length, beginning=True):
    pad = numerized[:to_length]
    if beginning:
      padded = [pad_index] * (to_length - len(pad)) + pad
    else:
      padded = pad + [pad_index] * (to_length - len(pad))
    mask = [w != pad_index for w in padded]
    return padded, mask

### batching

In [13]:
def batch_to_torch_long(*batches):
  if len(batches) == 1:
    return torch.LongTensor(batches[0])
  return [torch.LongTensor(batch) for batch in batches]

def batch_to_torch_float(*batches):
  if len(batches) == 1:
    return torch.FloatTensor(batches[0])
  return [torch.FloatTensor(batch) for batch in batches]

### full data format

In [31]:
analyzer = SentimentIntensityAnalyzer()

def format_reviews(tokenizer, datatable, indices=None):
  encoded_reviews = []
  encoded_reviews_mask = []
  review_sentiment = []
  reviews_to_process = datatable[["text", "stars"]]
  display(reviews_to_process)
  if indices is not None:
    reviews_to_process = reviews_to_process.iloc[indices]

  for review_text in reviews_to_process["text"]:
    # Tokenize by TOKENIZER
    numerized = tokenize_review(tokenizer, review_text)
    padded, mask = pad_sequence(numerized, 0, MAX_LEN)
    encoded_reviews.append(padded)
    encoded_reviews_mask.append(mask)
    # VADER
    sentence_list = nltk.tokenize.sent_tokenize(review_text)
    review_sentiment_sentence = []
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        review_sentiment_sentence.append(vs["compound"])
    padded, _ = pad_sequence(review_sentiment_sentence, 0, MAX_LEN_VADER)
    review_sentiment.append(padded)
    
  torch_encoded_reviews, torch_encoded_reviews_target = \
                    batch_to_torch_long(encoded_reviews, reviews_to_process["stars"].values)
  torch_encoded_reviews_mask, torch_review_sentiment = batch_to_torch_float(encoded_reviews_mask, review_sentiment)
  return torch_encoded_reviews, torch_encoded_reviews_target, torch_review_sentiment, torch_encoded_reviews_mask

### split dataset

In [15]:
# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=0):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    # m = df.size
    m = len(df.index)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]

    assert train.size + validate.size + test.size == df.size

    return train, validate, test

# Data Preprocessing Code

## load data

In [16]:
# load yelp data
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", len(yelp_reviews.index), "data points")

loaded 533581 data points


In [17]:
display(yelp_reviews)

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1.0
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5.0
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5.0
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5.0
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1.0
...,...,...,...
533576,2vQO_kmSr6YPBrR8GH_FPA,Dr Young and her assistants take obvious pride...,5.0
533577,DUdLTGVpgsi0sv_g4A5ITQ,We started our 20 month of daughter here on an...,5.0
533578,AKGELpRNTTXajuZHbPxdJg,"First of all, they are supposed to open at 9:0...",2.0
533579,ghYZM7lqzjej05I_T3vYyA,It's not often that you visit a company and th...,5.0


## format + split data into train, val, and test sets

In [18]:
xlnet_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'xlnet-base-cased')
# tokenize_review(xlnet_tokenizer, "I love this grub!")

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


In [19]:
# train 75% | validation 15% | test 10%
train_ratio = .75
validate_ratio = .15
test_ratio = .10
assert train_ratio + validate_ratio + test_ratio == 1

In [20]:
train_reviews, validate_reviews, test_reviews = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)
# train_reviews_df, val_reviews_df, test_reviews_df = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)

In [21]:
# train_reviews, train_reviews_target, train_reviews_mask = format_reviews(xlnet_tokenizer, train_reviews_df)
# validate_reviews, test_reviews_target, validate_reviews_mask = format_reviews(xlnet_tokenizer, validate_reviews_df)
# test_reviews, test_reviews_target, _ = format_reviews(xlnet_tokenizer, test_reviews_df)

In [22]:
print(len(train_reviews.index), "yelp reviews for training")
train_reviews

400185 yelp reviews for training


Unnamed: 0,review_id,text,stars
134046,uozx1JGyRjHzs_aASyPcXg,I had a bad experience with shitty ATTITUDE. B...,1.0
41553,Ahq1v_fyEy00giS3JC7aLg,I thought the food there was good for the pric...,3.0
225115,g4Uj59yJl118-rjlz22s0g,I don't know what happened to the other review...,5.0
17163,WgTjvFVihdfENQCym1ZMsg,Been a patient for 2 1/2 years. Be prepared to...,2.0
11902,aF79-cK0Gj-jEmmTkBmN4Q,I don't often write reviews but I was so impre...,5.0
...,...,...,...
385507,lmr-n6jfHYwEYOGYUY_8AQ,I went to a different office prior to EyesWest...,5.0
426778,w02Yb3hs-6zFCcbVLMD4GQ,Fab Restaurant Concepts Inc. recently launched...,4.0
410697,daRP_YY8SPhpd9pvPNPh8A,A Yeti Nails on college? Yes!!\n\nI always go ...,5.0
382385,ftzEU-izq2fAf6cysqqlqA,Well I've been with the PMG for years and I've...,1.0


In [23]:
print(len(validate_reviews.index), "yelp reviews for validation")
validate_reviews

80037 yelp reviews for validation


Unnamed: 0,review_id,text,stars
293393,g4tyji9AbGA8s8d8wR1Ghw,Dr. Brown is an exceptional Doctor and an even...,5.0
91236,yv-QVVz8r9L0KzPE7dC9Kw,Very satisfied with them and they do deserve 5...,5.0
519480,u7ob6oGBXoxNmoNdO6Hi7A,Discovered this place when checking out The Do...,5.0
85922,D6_cubDDmUHK9etIo4qj5g,"good deals, okay pizza, good wings mango great...",5.0
272029,UT-44vSkncrfr7pTNV7Z2Q,We had to stay here for 2 weeks as our house w...,3.0
...,...,...,...
395020,PcpbnlwESsc2INBtySjNoQ,Wow!!!! New opening and the Staff is AWESOME!!...,5.0
368659,5suyUogE1SECWmNO4Od_jQ,Nice higher end steak and seafood. Have had m...,5.0
304554,VcZhY420gOcRvwC5Z6RyFw,I like mucho. But the portions here were ridic...,1.0
314880,qoeh99DeddHIX2cnFC5C5Q,not very helpful on the phone. changed their p...,3.0


In [24]:
print(len(test_reviews.index), "yelp reviews for testing")
test_reviews

53359 yelp reviews for testing


Unnamed: 0,review_id,text,stars
138379,pfwXHRWq2jzT00n2BCbUnQ,I would avoid doing business with Nationwide a...,1.0
382311,7rMAnoL5AQbU2PCjH_4hvg,Why is it so hard to find a jamba juice in veg...,4.0
93399,W9WwTpqqOmHaEkDjFGY3kg,Definetly a fine ice cream chain. The massive ...,4.0
122224,PNYbsSoEvoeVHhrlaBYxkw,One of a very few places in gta where one can ...,5.0
224622,TDl1VLEcC49LMfhucNRDZw,Was told I'm a pain in the ass customer. Dana ...,1.0
...,...,...,...
359783,V89YHw66stC6m2q_Q8f1QA,First Yelp review I've ever written. They were...,1.0
152315,PkW-fQcMLCYK1EXXlbghxA,Excellent! I always seem to get the huevos ran...,4.0
117952,P1-rGkBlw_PHDAz2pNd4pA,"Sorry Mitchell's, I'm a Graeter's girl now. I ...",5.0
435829,rE4xHmpnFp6xaWJRWsGSvA,Jay's was referred to me by my neighbor who is...,5.0


#Model

## model construction

In [38]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, rnn_size, num_layers=1, dropout=0):
        super().__init__()
        
        #################
        #    INPUT 1    #
        #################
        # Create an embedding layer, with 768 hidden layers
        self.xlnet = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
        # Output: (vocab_size x 768), where 768 hidden layers of XLNet

        # Coming in: torch.Size([BATCH_SIZE, vocab_size, 768])
        #   (XLNet has 768 hidden layers, https://huggingface.co/transformers/pretrained_models.html)
        conv1d_c_in = vocab_size
        conv1d_c_out = vocab_size
        conv1d_kernel = 5

        self.conv1D_layer = nn.Conv1d(conv1d_c_in, conv1d_c_out, conv1d_kernel)
        # Filter of 5, Cin = vocab_size, Cout = vocab_size

        # Output:
        conv1d_out_vocab_size = conv1d_c_out
        conv1d_out_length = 768 - ((conv1d_kernel - 1) // 2) * 2

        self.max_pool_1d = nn.MaxPool1d(conv1d_out_vocab_size)
        max_pool_1d_out_height = conv1d_out_vocab_size // conv1d_out_vocab_size
        max_pool_1d_out_length = conv1d_out_length // 1
        #################
        #  INPUT 1 END  #
        #################
        
        #################
        #    INPUT 2    #
        #################
        self.lstm = nn.LSTM(input_size=1, hidden_size=rnn_size, num_layers=num_layers, batch_first=True, dropout=dropout)
        #################
        #  INPUT 2 END  #
        #################

        self.dropout = nn.Dropout(dropout)

        self.dense = nn.Sequential(
                nn.Linear(rnn_size, vocab_size),
                nn.ReLU()
            )
        self.output = nn.Linear(rnn_size, 10) # classify yelp_reviews into 10 rating levels
    
    def forward_input_vectorized(self, x):
      xlnet_out = self.xlnet(x)
      xlnet_out_hidden = xlnet_out.last_hidden_state
      conv1d_out = self.conv1D_layer(xlnet_out_hidden)
      return self.max_pool_1d(conv1d_out.permute(0, 2, 1)).permute(0, 2, 1)

    def forward_input_vader(self, x):
      batch_size, vader_len = x.shape
      output, _ = self.lstm(x.reshape(batch_size, vader_len, 1))
      return output

    def forward(self, vectorized_words, vader):
        input1 = self.forward_input_vectorized(vectorized_words)
        input2 = self.forward_input_vader(vader)

        print(input1.size(), input2.size())

        

        lstm_drop = self.dropout(lstm_out)
        logits = self.dense(lstm_drop)
        logits = self.output(logits)
        return logits
    
    def loss_fn(self, prediction, target, mask):
      if classes is None:
        raise NotImplementedError
      else:
        # Regression
        pass

In [39]:
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


## train the model

In [40]:
# set model to training mode
model.train()

LanguageModel(
  (xlnet): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [41]:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

DATASET = train_reviews

# Constants of interest: BATCH_SIZE, EPOCHS

since = time.time()

# start training
for epoch in range(EPOCHS):
  indices = np.random.permutation(DATASET.shape[0])
  t = tqdm.notebook.tqdm(range(0, ( DATASET.shape[0] // BATCH_SIZE)+1))
  
  for i in t:
    # batch
    batch = format_reviews(xlnet_tokenizer, DATASET, indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE])
    (batch_input, batch_target, torch_review_sentiment, batch_target_mask) = batch
    for item in (batch_input, batch_target, torch_review_sentiment, batch_target_mask):
      print(item.size())
    (batch_input, batch_target, batch_target_mask) = list_to_device((batch_input, batch_target, batch_target_mask))
    
    # forward pass
    prediction = model.forward(batch_input, torch_review_sentiment)
    loss = loss_fn(prediction, batch_target, batch_target_mask)
    losses.append(loss.item())
    accuracy = (th.eq(prediction.argmax(dim=2,keepdim=False),batch_target).float()*batch_target_mask).sum()/batch_target_mask.sum()
    accuracies.append(accuracy.item())
    
    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # visuallize data
    if i % 100 == 0:
        batch_input_val, batch_target_val, batch_target_mask_val = build_batch(d_valid, range(len(d_valid)))
        (batch_input_val, batch_target_val) = list_to_device(batch_to_torch_long(batch_input_val, batch_target_val))
        batch_target_mask_val = list_to_device(batch_to_torch_long(batch_target_mask_val))
        prediction_val = model.forward(batch_input_val)
        loss_val = loss_fn(prediction_val, batch_target_val, batch_target_mask_val)
        print("Evaluation set loss:", loss_val.item())
        print(f"Epoch: {epoch} Iteration: {i} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}")



HBox(children=(FloatProgress(value=0.0, max=50024.0), HTML(value='')))

Unnamed: 0,text,stars
134046,I had a bad experience with shitty ATTITUDE. B...,1.0
41553,I thought the food there was good for the pric...,3.0
225115,I don't know what happened to the other review...,5.0
17163,Been a patient for 2 1/2 years. Be prepared to...,2.0
11902,I don't often write reviews but I was so impre...,5.0
...,...,...
385507,I went to a different office prior to EyesWest...,5.0
426778,Fab Restaurant Concepts Inc. recently launched...,4.0
410697,A Yeti Nails on college? Yes!!\n\nI always go ...,5.0
382385,Well I've been with the PMG for years and I've...,1.0


torch.Size([8, 128])
torch.Size([8])
torch.Size([8, 40])
torch.Size([8, 128])
torch.Size([8, 1, 764]) torch.Size([8, 40, 256])


NameError: ignored

## evaluate model

In [None]:
# set model to evaluation model
model.eval()

In [None]:
# model(yelp_ratings["text"])

#Playground

In [None]:
# This is a cheap solution to stops any run all that reaches my Playground
hard_stop = input("Hard Stop here. Enter any key to allow passage.")

if len(hard_stop) == 0:
  raise Exception("Hard Stop")

In [None]:
print(list(tokenize(STARTER["text"]))[:3])

In [None]:
import urllib.request
import io
import sentencepiece as spm

# https://github.com/google/sentencepiece/tree/master/python

# Loads model from URL as iterator and stores the model to BytesIO.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
      sentence_iterator=STARTER["text"], model_writer=model, vocab_size=1000)

# Serialize the model as file.
# with open('out.model', 'wb') as f:
#   f.write(model.getvalue())

# Directly load the model from serialized model.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))