# Global Setup

In [37]:
!pip install transformers
!pip install sentencepiece
!pip install segtok
!pip install vaderSentiment
!pip install nltk
!pip install huggingface_hub



In [38]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer

import tqdm

from multiprocessing import Pool

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [73]:
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp")
DATA_FOLDER = ROOT_FOLDER / "dataset"
TORCH_CHECKPOINT_MODEL = ROOT_FOLDER / "models" / "training_checkpoint_oscar_tallercnn_vadar.pt"

input("Please check to make sure the above checkpoint directory is yours (Hit any key)")

Please check to make sure the above checkpoint directory is yours (Hit any key)


''

In [41]:
sys.path.append(ROOT_FOLDER)

In [42]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [43]:
list_to_device = lambda th_obj: [tensor.to(device) for tensor in th_obj]

# Model Params

In [44]:
MAX_LEN = 128
MAX_LEN_VADER = 40
BATCH_SIZE = 32
EPOCHS = 5
USE_VADER = True

# Higher bound settings: MAX_LEN = 256 and BATCH_SIZE = 16

#Data Preprocessing Functions

## load data

In [45]:
def load_json(file_path, filter_function=lambda x: True):
  """
  file_path - full path of the file to read from
  filter_function - a data selection function, returns True to ADD a data point
  """
  result = []

  try:
    with open(file_path, "r") as f:
      for line in f:
        json_line = json.loads(line)
        if not filter_function(json_line):
          # Disallow via opposite of allow
          continue
        result.append(json_line) # each line is one data point dictionary
    return pd.DataFrame.from_records(result)
    # return result

  except IOError:
    print(f"cannot open {file_path}")
    return None

## data formatting

### tokenize

In [46]:
def tokenize(data):
  """
  data - an iterable of sentences
  """
  token_set = set()
  i = 0
  for sentences in data:
    if i % 1000 == 0:
      print(i, end=", " if i % 15000 != 0 else "\n")
    tokenized = nltk.word_tokenize(sentences.lower())
    for token in tokenized:
      token_set.add(token)
    i += 1
  return token_set

In [47]:
def tokenize_review(tokenizer, review_text):
  encodings = tokenizer.encode_plus(review_text, add_special_tokens=True,
                                    max_length=MAX_LEN,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    truncation=True,
                                    pad_to_max_length=False)
  return encodings.get("input_ids", [])

### padding

In [48]:
def pad_sequence(numerized, pad_index, to_length, beginning=True):
    pad = numerized[:to_length]
    if beginning:
      padded = [pad_index] * (to_length - len(pad)) + pad
    else:
      padded = pad + [pad_index] * (to_length - len(pad))
    mask = [w != pad_index for w in padded]
    return padded, mask

### batching

In [49]:
def batch_to_torch_long(*batches):
  if len(batches) == 1:
    return torch.LongTensor(batches[0])
  return [torch.LongTensor(batch) for batch in batches]

def batch_to_torch_float(*batches):
  if len(batches) == 1:
    return torch.FloatTensor(batches[0])
  return [torch.FloatTensor(batch) for batch in batches]

### full data format

In [50]:
analyzer = SentimentIntensityAnalyzer()

def format_reviews(tokenizer, datatable, indices=None, task_bar=False, review_sentiment_dict=None):
  encoded_reviews = []
  encoded_reviews_mask = []
  review_sentiment = []
  reviews_to_process = datatable[["review_id", "text", "stars"]]
  # display(reviews_to_process)
  if indices is not None:
    reviews_to_process = reviews_to_process.iloc[indices]
  
  review_iterator = reviews_to_process.iterrows()
  if task_bar:
    review_iterator = tqdm.notebook.tqdm(reviews_to_process.iterrows(), total=reviews_to_process.shape[0])

  for i, review in review_iterator:
    # Tokenize by TOKENIZER
    review_text = review["text"]
    numerized = tokenize_review(tokenizer, review_text)
    padded, mask = pad_sequence(numerized, 0, MAX_LEN)
    encoded_reviews.append(padded)
    encoded_reviews_mask.append(mask)
    # VADER
    if review_sentiment_dict is None:
      sentence_list = nltk.tokenize.sent_tokenize(review_text)
      review_sentiment_sentence = []
      for sentence in sentence_list:
          vs = analyzer.polarity_scores(sentence)
          review_sentiment_sentence.append(vs["compound"])
      padded, _ = pad_sequence(review_sentiment_sentence, 0, MAX_LEN_VADER)
      review_sentiment.append(padded)
    else:
      if review["review_id"] in review_sentiment_dict:
        review_sentiment.append(review_sentiment_dict[review["review_id"]])
    
  torch_encoded_reviews, torch_encoded_reviews_target = \
                    batch_to_torch_long(encoded_reviews, reviews_to_process["stars"].values)
  torch_encoded_reviews_mask, torch_review_sentiment = batch_to_torch_float(encoded_reviews_mask, review_sentiment)
  return torch_encoded_reviews, torch_encoded_reviews_target, torch_review_sentiment, torch_encoded_reviews_mask

### split dataset

In [51]:
# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=0):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    # m = df.size
    m = len(df.index)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]

    assert train.size + validate.size + test.size == df.size

    return train, validate, test

# Data Preprocessing Code

## load data

In [52]:
# load yelp data
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", len(yelp_reviews.index), "data points")

loaded 533581 data points


In [53]:
display(yelp_reviews)

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1.0
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5.0
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5.0
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5.0
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1.0
...,...,...,...
533576,2vQO_kmSr6YPBrR8GH_FPA,Dr Young and her assistants take obvious pride...,5.0
533577,DUdLTGVpgsi0sv_g4A5ITQ,We started our 20 month of daughter here on an...,5.0
533578,AKGELpRNTTXajuZHbPxdJg,"First of all, they are supposed to open at 9:0...",2.0
533579,ghYZM7lqzjej05I_T3vYyA,It's not often that you visit a company and th...,5.0


## format + split data into train, val, and test sets

In [54]:
xlnet_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'xlnet-base-cased')
# tokenize_review(xlnet_tokenizer, "I love this grub!")

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


In [55]:
# train 75% | validation 15% | test 10%
train_ratio = .50
validate_ratio = .40
test_ratio = .10
assert train_ratio + validate_ratio + test_ratio == 1

In [56]:
train_reviews, validate_reviews, test_reviews = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)
# train_reviews_df, val_reviews_df, test_reviews_df = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)

In [57]:
# train_reviews, train_reviews_target, train_reviews_mask = format_reviews(xlnet_tokenizer, train_reviews_df)
# validate_reviews, test_reviews_target, validate_reviews_mask = format_reviews(xlnet_tokenizer, validate_reviews_df)
# test_reviews, test_reviews_target, _ = format_reviews(xlnet_tokenizer, test_reviews_df)

In [58]:
print(len(train_reviews.index), "yelp reviews for training")
train_reviews

266790 yelp reviews for training


Unnamed: 0,review_id,text,stars
134046,uozx1JGyRjHzs_aASyPcXg,I had a bad experience with shitty ATTITUDE. B...,1.0
41553,Ahq1v_fyEy00giS3JC7aLg,I thought the food there was good for the pric...,3.0
225115,g4Uj59yJl118-rjlz22s0g,I don't know what happened to the other review...,5.0
17163,WgTjvFVihdfENQCym1ZMsg,Been a patient for 2 1/2 years. Be prepared to...,2.0
11902,aF79-cK0Gj-jEmmTkBmN4Q,I don't often write reviews but I was so impre...,5.0
...,...,...,...
351069,tmQ8yx2eTohANkWh7GnnuQ,Tried Metro Pizza on a family trip to Vegas. ...,5.0
441432,o9P5i583-5zWs6CTjTFayQ,$35 order with 2 pizzas and some Sprite. Get t...,1.0
426231,gur3ktaXvb04sZduAFgbNw,I went to Vision Work in Gilbert and was assis...,5.0
149924,Pi1K9OLjNVDNY91rMqWbxQ,My experiences at UNLV Office of Veterans Serv...,5.0


In [59]:
print(len(validate_reviews.index), "yelp reviews for validation")
validate_reviews

213432 yelp reviews for validation


Unnamed: 0,review_id,text,stars
333161,PjYyIQTX0DwBH-Fva4smQg,When visiting Las Vegas you have the choice of...,4.0
112739,RL6rD5_W3c1KhfTbH1U0hA,My designer took me and my husband to ProSourc...,2.0
524056,XIlXT_mBIy7Moj3SeziY9g,Nue Brows is my favorite waxing place ever. I ...,5.0
378799,Hu0S2l4kSJUOq2qOGyVpzA,This review is mostly based on the service. \n...,2.0
257386,ekUHsCKrXzTs4yB4kBA6bA,"Married November 1st, 2014 in halls B&C. \n\n...",5.0
...,...,...,...
395020,PcpbnlwESsc2INBtySjNoQ,Wow!!!! New opening and the Staff is AWESOME!!...,5.0
368659,5suyUogE1SECWmNO4Od_jQ,Nice higher end steak and seafood. Have had m...,5.0
304554,VcZhY420gOcRvwC5Z6RyFw,I like mucho. But the portions here were ridic...,1.0
314880,qoeh99DeddHIX2cnFC5C5Q,not very helpful on the phone. changed their p...,3.0


In [60]:
print(len(test_reviews.index), "yelp reviews for testing")
test_reviews

53359 yelp reviews for testing


Unnamed: 0,review_id,text,stars
138379,pfwXHRWq2jzT00n2BCbUnQ,I would avoid doing business with Nationwide a...,1.0
382311,7rMAnoL5AQbU2PCjH_4hvg,Why is it so hard to find a jamba juice in veg...,4.0
93399,W9WwTpqqOmHaEkDjFGY3kg,Definetly a fine ice cream chain. The massive ...,4.0
122224,PNYbsSoEvoeVHhrlaBYxkw,One of a very few places in gta where one can ...,5.0
224622,TDl1VLEcC49LMfhucNRDZw,Was told I'm a pain in the ass customer. Dana ...,1.0
...,...,...,...
359783,V89YHw66stC6m2q_Q8f1QA,First Yelp review I've ever written. They were...,1.0
152315,PkW-fQcMLCYK1EXXlbghxA,Excellent! I always seem to get the huevos ran...,4.0
117952,P1-rGkBlw_PHDAz2pNd4pA,"Sorry Mitchell's, I'm a Graeter's girl now. I ...",5.0
435829,rE4xHmpnFp6xaWJRWsGSvA,Jay's was referred to me by my neighbor who is...,5.0


In [61]:
review_sentiment_dict = {}

In [62]:
# Create dictionary of all the reviews' Vader temporarily

if USE_VADER:
  review_iterator = tqdm.notebook.tqdm(yelp_reviews.iterrows(), total=yelp_reviews.shape[0])

  for i, review in review_iterator:
    # Tokenize by TOKENIZER
    review_text = review["text"]
    # VADER
    sentence_list = nltk.tokenize.sent_tokenize(review_text)
    review_sentiment_sentence = []
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        review_sentiment_sentence.append(vs["compound"])
    padded, _ = pad_sequence(review_sentiment_sentence, 0, MAX_LEN_VADER)
    review_sentiment_dict[review["review_id"]] = padded
    if len(review_sentiment_dict) < 20:
      print(len(review_sentiment_dict), review_sentiment_dict[review["review_id"]])


HBox(children=(FloatProgress(value=0.0, max=533581.0), HTML(value='')))

1 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.628, 0.0, 0.0, 0.0, -0.296]
2 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5411, 0.8313, 0.875, -0.2263, 0.7845, 0.6588, 0.8555, 0.7778, 0.5954, 0.0, 0.7269, 0.2263, 0.8074, 0.4404, 0.6588, 0.0, 0.0]
3 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.5797, 0.8221, 0.8718, 0.8398, 0.7082]
4 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.9485, 0.0, 0.7227, 0.7269, 0.4939, 0.6115, 0.6249, 0.0]
5 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0.6908, 0.5719, 0.0, 0.2732, 0.0, 0.0258, 0.0258, 0.296, 0.0, -0.5563, 0.0, 0.5413, -0.296, -0.5106, -0.5542, 0.296, -0.8825, -0.5423, 0.2732, 0.6514, 0.1027, 0.6191, 0.0459, -0.2732, -0.743, -0.3818, -0.5095, -0.4854]
6 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -0.0608, -0.2865, 0.0, -0.2732, 0.6996, 0.4019,

#Model

## model construction

In [63]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, rnn_size, vader_size, num_layers=1, dropout=0, use_vader=USE_VADER):
        super().__init__()
        
        #################
        #    INPUT 1    #
        #################
        # Create an embedding layer, with 768 hidden layers
        self.xlnet = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
        for param in self.xlnet.layer.parameters():
          param.requires_grad = False
        # Output: (vocab_size x 768), where 768 hidden layers of XLNet

        # Coming in: torch.Size([BATCH_SIZE, vocab_size, 768])
        #   (XLNet has 768 hidden layers, https://huggingface.co/transformers/pretrained_models.html)
        conv2d_c_in = 1
        conv2d_c_out = 100
        conv2d_kernel_H = 768 # along Embedding Length
        conv2d_kernel_W = 5 # along Word Length

        self.conv2D_layer = nn.Conv2d(conv2d_c_in, conv2d_c_out, (conv2d_kernel_W, conv2d_kernel_H))
        # Filter of (conv2d_kernel_W, conv2d_kernel_H), Cin = 1, Cout = 1

        # conv2d_out torch.Size([32, 100, 124, 1])

        # Output:
        conv2d_out_Wout = 1 + (vocab_size - conv2d_kernel_W) # Vocab Size
        conv2d_out_Hout = 1 + (768 - conv2d_kernel_H)       # length

        self.max_pool_2d = nn.MaxPool2d((conv2d_out_Wout, 1))
        max_pool_2d_out_length = conv2d_out_Wout // conv2d_out_Wout
        max_pool_2d_out_height = conv2d_out_Hout // 1
        #################
        #  INPUT 1 END  #
        #################
        
        #################
        #    INPUT 2    #
        #################
        self.lstm = None
        if use_vader:
          self.lstm = nn.LSTM(input_size=1, hidden_size=1, num_layers=num_layers, batch_first=True, dropout=dropout)
        else:
          vader_size = 0
        #################
        #  INPUT 2 END  #
        #################

        self.dropout = nn.Dropout(dropout)
        # print(max_pool_2d_out_height, max_pool_2d_out_length, vader_size)

        hidden_layer_dense = 100

        self.dense = nn.Sequential(
                nn.Linear(100 + vader_size, hidden_layer_dense),
                nn.ReLU()
            )
        self.output = nn.Linear(hidden_layer_dense, 5) # classify yelp_reviews into 5 ratings
    
    def forward_input_vectorized(self, x):
      xlnet_out = self.xlnet(x)
      xlnet_out_hidden = xlnet_out.last_hidden_state
      batches_len, word_len, embedding_len = xlnet_out_hidden.shape
      xlnet_out_hidden = xlnet_out_hidden.reshape(batches_len, 1, word_len, embedding_len)
      conv2d_out = self.conv2D_layer(xlnet_out_hidden)
      result = self.max_pool_2d(conv2d_out)
      result = result.squeeze(2).squeeze(2)
      return result

    def forward_input_vader(self, x):
      batch_size, vader_len = x.shape
      # print(x.reshape(batch_size, vader_len, 1).shape)
      output, _ = self.lstm(x.reshape(batch_size, vader_len, 1))
      # print(output.shape)
      output = output.squeeze(2)
      return output

    def forward(self, vectorized_words, vader):
        input1 = self.forward_input_vectorized(vectorized_words)

        if self.lstm:
          input2 = self.forward_input_vader(vader)
          combined_input = (input1, input2)
        else:
          combined_input = (input1,) # Tuples need the stray comma

        # print(input1.size(), input2.size())

        combined_input = torch.cat(combined_input, dim=1)

        lstm_drop = self.dropout(combined_input)
        logits = self.dense(lstm_drop)
        logits = self.output(logits)
        return logits
    
    def loss_fn(self, prediction, target):
      loss_criterion = nn.CrossEntropyLoss(reduction='none')
      return torch.mean(loss_criterion(prediction, target - 1))

In [74]:
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256, vader_size=MAX_LEN_VADER)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


## train the model

In [75]:
# num_of_validaion_set = 20 #len(validate_reviews)

# batch_val = format_reviews(xlnet_tokenizer, validate_reviews, range(num_of_validaion_set), review_sentiment_dict=review_sentiment_dict) # This cell may take a while

# (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val

In [76]:
def run_validation(model, use_all=False, mode="val"):
  reviews_dataset = None
  if mode == "val":
    print("Running Validation")
    mode = "Validation"
    reviews_dataset = validate_reviews
  elif mode == "test":
    print("Running Testing")
    mode = "Test"
    reviews_dataset = test_reviews
  else:
    assert False, "Invalid mode"
  num_of_review_set = len(reviews_dataset) if use_all else 1000
  indices = np.random.permutation(len(reviews_dataset))
  t = tqdm.notebook.tqdm(range(0, ( num_of_review_set // BATCH_SIZE) + ( 1 if num_of_review_set % BATCH_SIZE > 0 else 0 )))
  loss_val_total = 0
  accuracy_val_total = 0
  temp_count = 0
  for i in t:
    val_start_i = i*BATCH_SIZE
    val_end_i = (i+1)*BATCH_SIZE
    # print(val_start_i, val_end_i, indices.shape)
    batch_val = format_reviews(xlnet_tokenizer, reviews_dataset, indices[val_start_i:val_end_i], review_sentiment_dict=review_sentiment_dict)
    (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val
    # print(batch_input_val.shape, batch_review_sentiment_val.shape)
    (batch_input_val, batch_target_val) = list_to_device((batch_input_val, batch_target_val))
    batch_target_mask_val, batch_review_sentiment_val = list_to_device((batch_target_mask_val, batch_review_sentiment_val))
    # print(batch_input_val.shape, batch_review_sentiment_val.shape)
    prediction_val = model.forward(batch_input_val, batch_review_sentiment_val)
    # print(prediction_val.size(), batch_target_val.size())
    # print(prediction_val, batch_target_val)
    loss_val_total += model.loss_fn(prediction_val, batch_target_val).item()
    # print(loss_val)
    accuracy_val_total += torch.mean(torch.eq(prediction_val.argmax(dim=1,keepdim=False),batch_target_val).float()).item()
    temp_count += 1
    if i % round(8000 / BATCH_SIZE) == 0 and i != 0 and use_all:
      print(mode, "Prelim Evaluation set loss:", loss_val_total / temp_count, mode, "Prelim Accuracy:", accuracy_val_total / temp_count)
  loss_val = loss_val_total / temp_count
  accuracy_val = accuracy_val_total / temp_count
  print(mode, "Evaluation set loss:", loss_val, mode, "Accuracy set %:", accuracy_val)

In [77]:
losses = []
accuracies = []

epoch_start = 0
t_start = 0

In [78]:
# ONLY RUN THIS CELL (and next cell) if want to load checkpoint
# If you accidentally run this cell, no harm done (be careful with next cell!!!)

checkpoint = None
try:
  checkpoint = torch.load(str(TORCH_CHECKPOINT_MODEL))
  print("Checkpoint loaded")
except:
  print("No Checkpoint loaded")

No Checkpoint loaded


In [79]:
lr = 1e-4
optimizer_method = optim.Adam
optimizer = optimizer_method(model.parameters(), lr=lr)

In [80]:
# ONLY RUN THIS CELL if want to load checkpoint

if checkpoint:
  epoch_start = checkpoint['epoch']
  t_start = checkpoint['t']
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer = optimizer_method(model.parameters(), lr=lr)
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  losses = checkpoint['losses']
  accuracies = checkpoint['accuracies']
  model.to(device)

  print("Checkpoint")
  run_validation(model)

  print(f"Checkpoint Epoch: {epoch_start} Iteration: {i} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}")

In [81]:
# set model to training mode
# Needs to be placed after the Checkpoint file loading
model.train()

LanguageModel(
  (xlnet): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

DATASET = train_reviews

# Constants of interest: BATCH_SIZE, EPOCHS

since = time.time()

# Sanity check saving
torch.save({'epoch': 0,
            't': 0,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            'accuracies': accuracies
            }, str(TORCH_CHECKPOINT_MODEL) + "FAKE")

# start training
for epoch in range(epoch_start, EPOCHS):
  indices = np.random.permutation(DATASET.shape[0])

  dataset_batch_cap = ( DATASET.shape[0] // BATCH_SIZE ) + (1 if DATASET.shape[0] % BATCH_SIZE > 0 else 0)

  t = tqdm.notebook.tqdm(range(t_start, dataset_batch_cap), initial = t_start, total = dataset_batch_cap)
  
  for i in t:
    # batch
    batch = format_reviews(xlnet_tokenizer, DATASET, indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE], review_sentiment_dict=review_sentiment_dict)
    (batch_input, batch_target, batch_review_sentiment, batch_target_mask) = batch
    # for item in (batch_input, batch_target, batch_review_sentiment, batch_target_mask):
    #   print(item.size())
    (batch_input, batch_target, batch_target_mask, batch_review_sentiment) = list_to_device((batch_input, batch_target, batch_target_mask, batch_review_sentiment))
    model.to(device)
    
    # forward pass
    prediction = model.forward(batch_input, batch_review_sentiment)
    # print(prediction.size(), batch_target.size())
    loss = model.loss_fn(prediction, batch_target)
    # print(loss)
    losses.append(loss.item())
    accuracy = torch.mean(torch.eq(prediction.argmax(dim=1,keepdim=False),batch_target).float())
    accuracies.append(accuracy.item())
    
    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # visuallize data
    if i % 1000 == 0 and i != t_start:
      torch.save({'epoch': epoch,
                  't': i,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'losses': losses,
                  'accuracies': accuracies
                  }, str(TORCH_CHECKPOINT_MODEL))
      run_validation(model)
      print(f"Epoch: {epoch} Iteration: {i} Train Loss: {np.mean(losses[-10:])} Train Accuracy: {np.mean(accuracies[-10:])}")

  t_start = 0


HBox(children=(FloatProgress(value=0.0, max=8338.0), HTML(value='')))

Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6322529898025095 Validation Accuracy set %: 0.11328125
Epoch: 0 Iteration: 1000 Train Loss: 0.6033372819423676 Train Accuracy: 0.115625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6128142150118947 Validation Accuracy set %: 0.103515625
Epoch: 0 Iteration: 2000 Train Loss: 0.6286964416503906 Train Accuracy: 0.0875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5367765333503485 Validation Accuracy set %: 0.0947265625
Epoch: 0 Iteration: 3000 Train Loss: 0.5639455020427704 Train Accuracy: 0.121875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.563629874959588 Validation Accuracy set %: 0.1064453125
Epoch: 0 Iteration: 4000 Train Loss: 0.5916164755821228 Train Accuracy: 0.103125
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5964832110330462 Validation Accuracy set %: 0.109375
Epoch: 0 Iteration: 5000 Train Loss: 0.5343139916658401 Train Accuracy: 0.090625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5513808084651828 Validation Accuracy set %: 0.0947265625
Epoch: 0 Iteration: 6000 Train Loss: 0.640533858537674 Train Accuracy: 0.15
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.526653602719307 Validation Accuracy set %: 0.09765625
Epoch: 0 Iteration: 7000 Train Loss: 0.48872745633125303 Train Accuracy: 0.06875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5752646895125508 Validation Accuracy set %: 0.09765625
Epoch: 0 Iteration: 8000 Train Loss: 0.5396734565496445 Train Accuracy: 0.084375



HBox(children=(FloatProgress(value=0.0, max=8338.0), HTML(value='')))

Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5345576959662139 Validation Accuracy set %: 0.076171875
Epoch: 1 Iteration: 1000 Train Loss: 0.5130729138851166 Train Accuracy: 0.11875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5323717929422855 Validation Accuracy set %: 0.0947265625
Epoch: 1 Iteration: 2000 Train Loss: 0.4978444010019302 Train Accuracy: 0.121875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5305659435689449 Validation Accuracy set %: 0.09765625
Epoch: 1 Iteration: 3000 Train Loss: 0.5344327926635742 Train Accuracy: 0.096875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5229110214859247 Validation Accuracy set %: 0.09765625
Epoch: 1 Iteration: 4000 Train Loss: 0.45046032071113584 Train Accuracy: 0.084375
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5794236110523343 Validation Accuracy set %: 0.111328125
Epoch: 1 Iteration: 5000 Train Loss: 0.43845262229442594 Train Accuracy: 0.075
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5163069413974881 Validation Accuracy set %: 0.0859375
Epoch: 1 Iteration: 6000 Train Loss: 0.5941014736890793 Train Accuracy: 0.13125
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5528400344774127 Validation Accuracy set %: 0.08984375
Epoch: 1 Iteration: 7000 Train Loss: 0.47787667214870455 Train Accuracy: 0.1125
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5646775746718049 Validation Accuracy set %: 0.091796875
Epoch: 1 Iteration: 8000 Train Loss: 0.4652991622686386 Train Accuracy: 0.0875



HBox(children=(FloatProgress(value=0.0, max=8338.0), HTML(value='')))

Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6065077763050795 Validation Accuracy set %: 0.125
Epoch: 2 Iteration: 1000 Train Loss: 0.45449023842811587 Train Accuracy: 0.096875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5319720199331641 Validation Accuracy set %: 0.0908203125
Epoch: 2 Iteration: 2000 Train Loss: 0.3918859034776688 Train Accuracy: 0.0625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5352317001670599 Validation Accuracy set %: 0.0947265625
Epoch: 2 Iteration: 3000 Train Loss: 0.4466226786375046 Train Accuracy: 0.090625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5326072489842772 Validation Accuracy set %: 0.1103515625
Epoch: 2 Iteration: 4000 Train Loss: 0.45448345839977267 Train Accuracy: 0.078125
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5515474081039429 Validation Accuracy set %: 0.103515625
Epoch: 2 Iteration: 5000 Train Loss: 0.43154471218585966 Train Accuracy: 0.09375
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5588687919080257 Validation Accuracy set %: 0.09765625
Epoch: 2 Iteration: 6000 Train Loss: 0.5433090955018998 Train Accuracy: 0.115625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6012680158019066 Validation Accuracy set %: 0.107421875
Epoch: 2 Iteration: 7000 Train Loss: 0.43954105377197267 Train Accuracy: 0.084375
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5780292637646198 Validation Accuracy set %: 0.0947265625
Epoch: 2 Iteration: 8000 Train Loss: 0.46844940185546874 Train Accuracy: 0.05625



HBox(children=(FloatProgress(value=0.0, max=8338.0), HTML(value='')))

Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5673925383016467 Validation Accuracy set %: 0.10546875
Epoch: 3 Iteration: 1000 Train Loss: 0.43584659695625305 Train Accuracy: 0.1
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6521177971735597 Validation Accuracy set %: 0.1015625
Epoch: 3 Iteration: 2000 Train Loss: 0.47784543335437774 Train Accuracy: 0.10625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6332937804982066 Validation Accuracy set %: 0.099609375
Epoch: 3 Iteration: 3000 Train Loss: 0.40025038421154024 Train Accuracy: 0.084375
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5792655609548092 Validation Accuracy set %: 0.091796875
Epoch: 3 Iteration: 4000 Train Loss: 0.4629738450050354 Train Accuracy: 0.084375
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5165133671835065 Validation Accuracy set %: 0.0810546875
Epoch: 3 Iteration: 5000 Train Loss: 0.38445235788822174 Train Accuracy: 0.071875
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.5714722881093621 Validation Accuracy set %: 0.103515625
Epoch: 3 Iteration: 6000 Train Loss: 0.4210585355758667 Train Accuracy: 0.05625
Running Validation


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))


Validation Evaluation set loss: 0.6199276065453887 Validation Accuracy set %: 0.119140625
Epoch: 3 Iteration: 7000 Train Loss: 0.32291741818189623 Train Accuracy: 0.059375


In [None]:
# Save the latest model
print("Saving latest model to", str(TORCH_CHECKPOINT_MODEL))
torch.save({'epoch': EPOCHS,
            't': (DATASET.shape[0] // BATCH_SIZE)+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            'accuracies': accuracies
            }, str(TORCH_CHECKPOINT_MODEL))

## evaluate model

In [None]:
# set model to evaluation model
model.eval()

In [None]:
run_validation(model, True, "val")

Running Validation


HBox(children=(FloatProgress(value=0.0, max=13340.0), HTML(value='')))

Validation Prelim Evaluation set loss: 0.7003109046442066 Validation Prelim Accuracy: 0.7769461077844312
Validation Prelim Evaluation set loss: 0.7019601297210325 Validation Prelim Accuracy: 0.7729145854145855
Validation Prelim Evaluation set loss: 0.7012036135364361 Validation Prelim Accuracy: 0.7724017321785477
Validation Prelim Evaluation set loss: 0.7019049818899812 Validation Prelim Accuracy: 0.7715829585207397
Validation Prelim Evaluation set loss: 0.6987897005894396 Validation Prelim Accuracy: 0.7732157137145141
Validation Prelim Evaluation set loss: 0.7016105142784185 Validation Prelim Accuracy: 0.7726174608463845
Validation Prelim Evaluation set loss: 0.7032026262417589 Validation Prelim Accuracy: 0.7724935732647815
Validation Prelim Evaluation set loss: 0.706181633513364 Validation Prelim Accuracy: 0.7721819545113722
Validation Prelim Evaluation set loss: 0.7064535731794856 Validation Prelim Accuracy: 0.7718701399688958
Validation Prelim Evaluation set loss: 0.705922719980412

In [None]:
run_validation(model, True, "test")

Running Testing


HBox(children=(FloatProgress(value=0.0, max=3335.0), HTML(value='')))

Test Prelim Evaluation set loss: 0.7104197494507193 Test Prelim Accuracy: 0.7723303393213573
Test Prelim Evaluation set loss: 0.7166598277030648 Test Prelim Accuracy: 0.7687937062937062
Test Prelim Evaluation set loss: 0.7128958379717011 Test Prelim Accuracy: 0.7719020652898068
Test Prelim Evaluation set loss: 0.714400890948384 Test Prelim Accuracy: 0.7722701149425287
Test Prelim Evaluation set loss: 0.7153426486878145 Test Prelim Accuracy: 0.7720661735305878
Test Prelim Evaluation set loss: 0.7138283398115349 Test Prelim Accuracy: 0.7721176274575141

Test Evaluation set loss: 0.04473292892023305 Test Accuracy set %: 0.0482227303123038


#Playground

In [None]:
# This is a cheap solution to stops any run all that reaches my Playground
hard_stop = input("Hard Stop here. Enter any key to allow passage.")

if len(hard_stop) == 0:
  raise Exception("Hard Stop")

In [None]:
print(list(tokenize(STARTER["text"]))[:3])

In [None]:
import urllib.request
import io
import sentencepiece as spm

# https://github.com/google/sentencepiece/tree/master/python

# Loads model from URL as iterator and stores the model to BytesIO.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
      sentence_iterator=STARTER["text"], model_writer=model, vocab_size=1000)

# Serialize the model as file.
# with open('out.model', 'wb') as f:
#   f.write(model.getvalue())

# Directly load the model from serialized model.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))