In [None]:
# Install the third-party dependencies in a single resolver pass.
# `%pip` (unlike `!pip`) always installs into the environment of the running
# kernel, and `-q` keeps the download log out of the saved notebook.
%pip install -q transformers sentencepiece segtok vaderSentiment nltk huggingface_hub pytorch-lightning

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 4.2MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 43.4MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting sentencepiece
[?25l  Downloading https://files.pythonh

In [None]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer

import tqdm

from multiprocessing import Pool

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
import matplotlib.pyplot as plt
import time
import os
import copy
import nltk
# Sentence-tokenizer data used by nltk.tokenize.sent_tokenize further down.
# Older NLTK releases read the 'punkt' resource, newer ones look up
# 'punkt_tab'; nltk is installed unpinned, so request both. Asking for a
# resource the installed release does not know only logs an error.
for _nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(_nltk_resource)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so files kept
# on Drive can be read through ordinary paths. `google.colab` is only
# importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from pathlib import Path

# Project checkout on the mounted Drive; everything else is derived from it.
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp")

# Dataset directory and the checkpoint file inside the project checkout.
DATA_FOLDER = ROOT_FOLDER.joinpath("dataset")
TORCH_CHECKPOINT_MODEL = ROOT_FOLDER.joinpath(
    "models", "training_checkpoint_oscar_vader_sixthukn.pt"
)

# Put the checkout on the import path so its modules can be imported by name.
sys.path.append(str(ROOT_FOLDER))

In [None]:
# Make the project checkout importable. Guarded so that re-running this cell
# (or having already appended the same folder earlier) does not pile up
# duplicate entries on sys.path.
if str(ROOT_FOLDER) not in sys.path:
    sys.path.append(str(ROOT_FOLDER))

# Project-local modules that live in ROOT_FOLDER.
import data
import models

In [None]:
from argparse import Namespace

# Run configuration bundled as an argparse-style namespace; it is handed to
# data.format_reviews() by the evaluation cell.
args = Namespace(
    **{
        # batching / schedule
        "batch_size": 32,
        "epochs": 10,
        # length limits for the token input and the VADER input
        "max_len": 128,
        "max_len_vader": 40,
        # feature switches
        "use_bert": False,
        "use_cnn": True,
        "use_vader": True,
    }
)

In [None]:
class LanguageModel(nn.Module):
    """Review classifier: XLNet embeddings -> 2D convolution -> dense head.

    Data flow (see ``forward``):
      1. Token ids are embedded by a pretrained ``xlnet-base-cased`` model.
      2. The (tokens x hidden) grid of each review is treated as a one-channel
         image, convolved with a single 5x5 filter and max-pooled over the
         whole token axis, leaving one feature vector per review.
      3. ``vader_size`` sentiment features are concatenated to that vector,
         followed by dropout, a 100-unit ReLU layer and a 6-way linear layer.

    Args:
        vocab_size: despite the name, the padded number of tokens per review;
            it determines the height of the max-pool window.
        rnn_size: unused; kept so existing call sites keep working.
        vader_size: number of sentiment values supplied per review.
        num_layers: number of stacked LSTM layers in the sentiment branch.
        dropout: dropout probability for the LSTM and for the layer in front
            of the dense head.
        use_vader: when False no LSTM is created and the sentiment input
            contributes zero features.
    """

    # Width of one token embedding produced by xlnet-base-cased.
    XLNET_HIDDEN_SIZE = 768
    # Number of logits; ``loss_fn`` maps a target t to class index t - 1.
    NUM_CLASSES = 6

    # The LSTM sentiment branch was hard-wired off (`if False and ...`) in the
    # original forward pass; this flag keeps that default while making the
    # switch explicit.
    # NOTE(review): presumably the stored checkpoint was trained with the
    # branch off - confirm before enabling.
    vader_branch_enabled = False

    # Cumulative wall-clock seconds spent inside XLNet / inside torch.cat.
    # Class-level defaults; the first `+=` creates a per-instance value.
    xlnet_timing = 0
    total_time_concat = 0

    def __init__(self, vocab_size, rnn_size, vader_size, num_layers=1, dropout=0, use_vader=True):
        super().__init__()

        #################
        #    INPUT 1    #
        #################
        self.xlnet = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
        # Freeze the transformer layers; only the parameters under
        # `xlnet.layer` are frozen here.
        for param in self.xlnet.layer.parameters():
            param.requires_grad = False

        # XLNet output: (batch, vocab_size tokens, 768); it is reshaped to a
        # single-channel image before the convolution.
        conv2d_c_in = 1
        conv2d_c_out = 1
        conv2d_kernel_W = 5  # along the embedding axis
        conv2d_kernel_H = 5  # along the token axis

        self.conv2D_layer = nn.Conv2d(conv2d_c_in, conv2d_c_out, (conv2d_kernel_H, conv2d_kernel_W))

        # Un-padded convolution: each spatial axis shrinks by kernel - 1.
        conv2d_out_Hout = vocab_size - ((conv2d_kernel_H - 1) // 2) * 2
        conv2d_out_Wout = self.XLNET_HIDDEN_SIZE - ((conv2d_kernel_W - 1) // 2) * 2

        # Pool over the entire token axis, leaving one row per review.
        self.max_pool_2d = nn.MaxPool2d((conv2d_out_Hout, 1))
        max_pool_2d_out_length = conv2d_out_Wout // 1
        #################
        #  INPUT 1 END  #
        #################

        #################
        #    INPUT 2    #
        #################
        self.lstm = None
        if use_vader:
            self.lstm = nn.LSTM(input_size=1, hidden_size=1, num_layers=num_layers, batch_first=True, dropout=dropout)
        else:
            vader_size = 0
        # Remembered so forward() can build a placeholder of the right width.
        self.vader_size = vader_size
        #################
        #  INPUT 2 END  #
        #################

        self.dropout = nn.Dropout(dropout)

        hidden_layer_dense = 100

        self.dense = nn.Sequential(
            nn.Linear(max_pool_2d_out_length + vader_size, hidden_layer_dense),
            nn.ReLU()
        )
        self.output = nn.Linear(hidden_layer_dense, self.NUM_CLASSES)

    def forward_input_vectorized(self, x):
        """Embed token ids with XLNet and reduce them to one vector per review.

        Returns a tensor of shape (batch, conv output width) provided the
        pooled height is 1, i.e. the input has ``vocab_size`` tokens.
        """
        start_time = time.time()
        xlnet_out = self.xlnet(x)
        end_time = time.time()

        self.xlnet_timing += end_time - start_time

        xlnet_out_hidden = xlnet_out.last_hidden_state
        batches_len, word_len, embedding_len = xlnet_out_hidden.shape
        # Insert the channel axis that Conv2d expects.
        xlnet_out_hidden = xlnet_out_hidden.reshape(batches_len, 1, word_len, embedding_len)
        conv2d_out = self.conv2D_layer(xlnet_out_hidden)
        result = self.max_pool_2d(conv2d_out)
        # Drop the channel axis and the pooled token axis.
        result = result.squeeze(1).squeeze(1)
        return result

    def forward_input_vader(self, x):
        """Run (batch, vader_len) sentiment scores through the LSTM.

        Each score is fed as a one-feature time step; the hidden size is 1, so
        the output has the same (batch, vader_len) shape as the input.
        """
        batch_size, vader_len = x.shape
        output, _ = self.lstm(x.reshape(batch_size, vader_len, 1))
        output = output.squeeze(2)
        return output

    def predict(self, vectorized_words, vadar_sentiments):
        """Return the arg-max class index per review (not shifted back to stars).

        Runs without gradient tracking: arg-max is not differentiable, so the
        autograd graph would only cost memory.
        """
        with torch.no_grad():
            logits = self.forward(vectorized_words, vadar_sentiments)
        prediction = logits.argmax(dim=1, keepdim=False)
        return prediction

    def forward(self, vectorized_words, vader):
        """Compute the (batch, 6) logits for a batch of reviews."""
        input1 = self.forward_input_vectorized(vectorized_words)

        if self.vader_branch_enabled and self.lstm is not None:
            input2 = self.forward_input_vader(vader)
        else:
            # Zero placeholder for the sentiment features. new_zeros() copies
            # device and dtype from input1 (a bare torch.zeros lives on the
            # CPU and breaks torch.cat for a CUDA model), and the width follows
            # the configured vader_size instead of a hard-coded 40.
            input2 = input1.new_zeros((input1.size(0), self.vader_size))

        start_time = time.time()
        combined_input = torch.cat((input1, input2), dim=1)
        end_time = time.time()

        self.total_time_concat += end_time - start_time

        lstm_drop = self.dropout(combined_input)
        logits = self.dense(lstm_drop)
        logits = self.output(logits)
        return logits

    def loss_fn(self, prediction, target):
        """Mean cross-entropy between logits and targets shifted down by one."""
        return nn.functional.cross_entropy(prediction, target - 1)

In [None]:
import json
import pickle
import sys

import nltk
import torch
import tqdm

import data
import models

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Inference limits: tokens per review, VADER scores per review, rows per batch.
MAX_LEN, MAX_LEN_VADER, BATCH_SIZE = 128, 40, 64

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): torch.load unpickles the file - only load checkpoints that
# come from a trusted source.
model_params = torch.load(TORCH_CHECKPOINT_MODEL, map_location=device)


def list_to_device(th_obj):
    """Return a list holding every tensor of ``th_obj`` moved onto ``device``."""
    moved = []
    for tensor in th_obj:
        moved.append(tensor.to(device))
    return moved


# Rebuild the network, restore the trained weights and switch to eval mode.
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256, vader_size=MAX_LEN_VADER)
model.load_state_dict(model_params["model_state_dict"])
model = model.to(device)
model.eval()

# Helpers used by predict_stars(): sentence-level sentiment scorer and the
# tokenizer matching the xlnet-base-cased weights.
analyzer = SentimentIntensityAnalyzer()
xlnet_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'xlnet-base-cased')

def predict_stars(texts):
    """
    Predict a star rating for every review text.

    texts - a single review string, or an iterable of review strings

    Returns a list with one predicted star rating per text. The model emits
    class indices and its loss trains index = stars - 1, so the indices are
    shifted back up by one here.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    vectorized_list = []
    vadar_sentiments_list = []
    for text in texts:
        # Token ids, truncated to MAX_LEN; padding is done by data.pad_sequence.
        encodings = xlnet_tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=MAX_LEN,
            return_token_type_ids=False,
            return_attention_mask=False,
            truncation=True,
            padding=False,
        )
        text_encoding = encodings.get("input_ids", [])
        vectorized, _ = data.pad_sequence(text_encoding, 0, MAX_LEN)
        vectorized_list.append(vectorized)

        # One VADER compound score per sentence of the review.
        review_sentiment_sentence = [
            analyzer.polarity_scores(sentence)["compound"]
            for sentence in nltk.tokenize.sent_tokenize(text)
        ]
        vadar_sentiments, _ = data.pad_sequence(review_sentiment_sentence, 0, MAX_LEN_VADER)
        vadar_sentiments_list.append(vadar_sentiments)

    # Nothing to score.
    if not vectorized_list:
        return []

    # Place the data as a batch, even if there is only 1, on the same device
    # as the model (the model was moved to `device` when it was loaded).
    vectorized = data.batch_to_torch_long(vectorized_list).to(device)
    vadar_sentiments = data.batch_to_torch_float(vadar_sentiments_list).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        p = model.predict(vectorized, vadar_sentiments)

    # Class index -> star rating.
    return (p + 1).tolist()

In [None]:
# `import tqdm` alone does not load the notebook/auto submodules; import the
# one that is used so the progress bar does not depend on another package
# having imported it first.
import tqdm.auto

count = 0               # number of predictions written
total_time = 0          # seconds spent inside model.predict
transfer_time = 0       # seconds spent moving batches onto the device

model.eval()

# Reset the model's cumulative timers so the figures cover this run only.
model.total_time_concat = 0
model.xlnet_timing = 0

validation_file = "/content/yelp_review_training_dataset.jsonl"

# Inside a notebook kernel sys.argv never carries a user-supplied file name,
# so gate the run on the input file actually being present.
if os.path.isfile(validation_file):
    # Load before opening the output so a failed load does not leave a
    # truncated output.jsonl behind.
    pandas_dataset = data.load_json(validation_file)
    num_rows = pandas_dataset.shape[0]
    dataset_batch_cap = (num_rows + BATCH_SIZE - 1) // BATCH_SIZE  # ceil division

    with open("output.jsonl", "w") as fw:
        t = tqdm.auto.tqdm(range(dataset_batch_cap), total=dataset_batch_cap)

        for batch_idx in t:
            val_start_i = batch_idx * BATCH_SIZE
            val_end_i = (batch_idx + 1) * BATCH_SIZE
            data_subset = pandas_dataset.iloc[val_start_i:val_end_i]

            # data.format_reviews returns, in order:
            #   torch.LongTensor(encoded_reviews)                      - text
            #   torch.FloatTensor(review_sentiments)                   - sentiments
            #   torch.LongTensor(reviews_to_process["stars"].values)   - target
            #   torch.FloatTensor(encoded_reviews_mask)                - mask
            batch_val = data.format_reviews(args, datatable=data_subset)
            batch_input_val, batch_review_sentiment_val, _batch_target_val, _batch_mask_val = batch_val

            # Only the model inputs are needed on the device for prediction.
            start_time = time.perf_counter()
            batch_input_val, batch_review_sentiment_val = list_to_device(
                (batch_input_val, batch_review_sentiment_val)
            )
            transfer_time += time.perf_counter() - start_time

            # Forward pass without autograd bookkeeping.
            start_time = time.perf_counter()
            with torch.no_grad():
                prediction = model.predict(batch_input_val, batch_review_sentiment_val)
            total_time += time.perf_counter() - start_time

            # The model predicts class indices and its loss trains
            # index = stars - 1, so shift back up by one to report stars.
            review_ids = data_subset["review_id"].tolist()
            for review_id, class_index in zip(review_ids, prediction.tolist()):
                count += 1
                fw.write(
                    json.dumps(
                        {
                            "review_id": review_id,
                            "predicted_stars": float(class_index + 1),
                        }
                    )
                    + "\n"
                )

            # Live timing on the progress bar instead of one print per batch.
            t.set_postfix(count=count, predict_s=total_time, xlnet_s=model.xlnet_timing)

    print("process time", transfer_time)
    print(count, total_time, model.total_time_concat, total_time - model.total_time_concat)
    print("xlnet_timing", model.xlnet_timing)
    print("Output prediction file written")
else:
    print("No validation file given")