# Global Setup

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install segtok
!pip install vaderSentiment



In [20]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer
from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp (master)")
DATA_FOLDER = ROOT_FOLDER / "dataset"

In [None]:
sys.path.append(ROOT_FOLDER)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


#Data Preprocessing

## load data

In [None]:
def load_json(file_path, filter_function=lambda x: True):
  """
  file_path - full path of the file to read from
  filter_function - a data selection function, returns True to ADD a data point
  """
  result = []

  try:
    with open(file_path, "r") as f:
      for line in f:
        json_line = json.loads(line)
        if not filter_function(json_line):
          # Disallow via opposite of allow
          continue
        result.append(json_line) # each line is one data point dictionary
    return pd.DataFrame.from_records(result)
    # return result

  except IOError:
    print(f"cannot open {file_path}")
    return None

In [21]:
# load yelp data
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", yelp_reviews.size, "data points")

loaded 1600743 data points


In [8]:
display(yelp_reviews)

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1.0
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5.0
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5.0
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5.0
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1.0
...,...,...,...
533576,2vQO_kmSr6YPBrR8GH_FPA,Dr Young and her assistants take obvious pride...,5.0
533577,DUdLTGVpgsi0sv_g4A5ITQ,We started our 20 month of daughter here on an...,5.0
533578,AKGELpRNTTXajuZHbPxdJg,"First of all, they are supposed to open at 9:0...",2.0
533579,ghYZM7lqzjej05I_T3vYyA,It's not often that you visit a company and th...,5.0


## format data

In [17]:
def tokenize(data):
  """
  data - an iterable of sentences
  """
  token_set = set()
  i = 0
  for sentences in data:
    if i % 1000 == 0:
      print(i, end=", " if i % 15000 != 0 else "\n")
    tokenized = tokenizer.word_tokenizer(sentences.lower())
    for token in tokenized:
      token_set.add(token)
    i += 1
  return token_set

def tokenize_review(review):
  encodings = tokenizer.encode_plus(review, add_special_tokens=True,
                                    max_length=256,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    truncation=True,
                                    pad_to_max_length=False)
  return encodings.get("input_ids", [])

def pad_sequence(numerized, pad_index, to_length):
    pad = numerized[:to_length]
    padded = pad + [pad_index] * (to_length - len(pad))
    mask = [w != pad_index for w in padded]
    return padded, mask

In [18]:
tokenize_review("I love this grub!")

AttributeError: ignored

#Model

## load pretrained model

In [25]:
# load XLNet pretrained model
model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
model.to(device)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): XLNetFeedForward(
        (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (layer_1): Linear(in_features=768, out_features=3072, b

## train the model

In [None]:
# set model to training mode
model.train()

In [None]:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

# TODO: fix this function
## STARTER -> yelp_reviews['text']?

def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
  since = time.time()

  for epoch in range(num_epochs):
      
      indices = np.random.permutation(range(len(STARTER)))
      t = tqdm.notebook.tqdm(range(0,(len(STARTER)//batch_size)+1))
      
      for i in t:
        # batch
        batch = process_reviews(STARTER, indices[i*batch_size:(i+1)*batch_size])
        (batch_input, batch_target, batch_target_mask) = batch_to_torch(*batch)
        for item in (batch_input, batch_target, batch_target_mask):
          print(item.size())
        (batch_input, batch_target, batch_target_mask) = list_to_device((batch_input, batch_target, batch_target_mask))
        
        # forward pass
        prediction = model.forward(batch_input)
        loss = loss_fn(prediction, batch_target, batch_target_mask)
        losses.append(loss.item())
        accuracy = (th.eq(prediction.argmax(dim=2,keepdim=False),batch_target).float()*batch_target_mask).sum()/batch_target_mask.sum()
        accuracies.append(accuracy.item())
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # visuallize data
        if i % 100 == 0:
            batch_val = build_batch(d_valid, range(len(d_valid)))
            (batch_input_val, batch_target_val, batch_target_mask_val) = list_to_device(batch_to_torch(*batch_val))
            prediction_val = model.forward(batch_input_val)
            loss_val = loss_fn(prediction_val, batch_target_val, batch_target_mask_val)
            print("Evaluation set loss:", loss_val.item())
            print(f"Epoch: {epoch} Iteration: {i} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}")

In [None]:
train_model(model)

## evaluate model

In [None]:
# set model to evaluation model
model.eval()

In [None]:
# model(yelp_ratings["text"])

#Playground

In [None]:
# This is a cheap solution to stops any run all that reaches my Playground
hard_stop = input("Hard Stop here. Enter any key to allow passage.")

if len(hard_stop) == 0:
  raise Exception("Hard Stop")

Hard Stop here. Enter any key to allow passage.


Exception: ignored

In [None]:
print(list(tokenize(STARTER["text"]))[:3])

In [None]:
import urllib.request
import io
import sentencepiece as spm

# https://github.com/google/sentencepiece/tree/master/python

# Loads model from URL as iterator and stores the model to BytesIO.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
      sentence_iterator=STARTER["text"], model_writer=model, vocab_size=1000)

# Serialize the model as file.
# with open('out.model', 'wb') as f:
#   f.write(model.getvalue())

# Directly load the model from serialized model.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))

NameError: ignored