# Global Setup

In [1]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer
from keras.preprocessing.sequence import pad_sequences
import tqdm

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy

import huggingface_hub

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp")
# DATA_FOLDER = ROOT_FOLDER / "starter"
# sys.path.insert(0, ROOT_FOLDER)

In [3]:
DATA_FOLDER = Path("starter")

In [4]:
from data import *
from models import LanguageModel

[nltk_data] Downloading package punkt to /Users/aprilsin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [6]:
list_to_device = lambda th_obj: [tensor.to(device) for tensor in th_obj]

# Model Params

In [7]:
MAX_LEN = 128
MAX_LEN_VADER = 40
BATCH_SIZE = 32
EPOCHS = 5

# Higher bound settings: MAX_LEN = 256 and BATCH_SIZE = 16

#  Data Preprocessing Functions

## load data

## data formatting

### tokenize

### padding

### batching

### full data format

### split dataset

# Data Preprocessing Code

## load data

In [35]:
# load yelp data
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", len(yelp_reviews), "data points")

loaded 533581 data points


In [36]:
display(yelp_reviews)

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1.0
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5.0
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5.0
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5.0
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1.0
...,...,...,...
533576,2vQO_kmSr6YPBrR8GH_FPA,Dr Young and her assistants take obvious pride...,5.0
533577,DUdLTGVpgsi0sv_g4A5ITQ,We started our 20 month of daughter here on an...,5.0
533578,AKGELpRNTTXajuZHbPxdJg,"First of all, they are supposed to open at 9:0...",2.0
533579,ghYZM7lqzjej05I_T3vYyA,It's not often that you visit a company and th...,5.0


## format + split data into train, val, and test sets

In [37]:
from transfomers import BertTokenizer
from transfomers import BertForSequenceClassification

base_model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", num_labels=200
)
model_tokenizer = torch.hub.load(
    "huggingface/pytorch-transformers", "tokenizer", "xlnet-base-cased"
)

base_model.classifier.add_module('bert_activation', nn.Tanh())
base_model.classifier.add_module('prediction', nn.Linear(200, 5))
# tokenize_review(model_tokenizer, "I love this grub!")

Using cache found in /Users/aprilsin/.cache/torch/hub/huggingface_pytorch-transformers_master


RuntimeError: Missing dependencies: importlib_metadata, huggingface_hub

In [None]:
# train 75% | validation 15% | test 10%
train_ratio = .75
validate_ratio = .15
test_ratio = .10
assert train_ratio + validate_ratio + test_ratio == 1

In [None]:
train_reviews, validate_reviews, test_reviews = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)
# train_reviews_df, val_reviews_df, test_reviews_df = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)

In [None]:
# train_reviews, train_reviews_target, train_reviews_mask = format_reviews(xlnet_tokenizer, train_reviews_df)
# validate_reviews, test_reviews_target, validate_reviews_mask = format_reviews(xlnet_tokenizer, validate_reviews_df)
# test_reviews, test_reviews_target, _ = format_reviews(xlnet_tokenizer, test_reviews_df)

In [None]:
print(len(train_reviews.index), "yelp reviews for training")
train_reviews

In [None]:
print(len(validate_reviews.index), "yelp reviews for validation")
validate_reviews

In [None]:
print(len(test_reviews.index), "yelp reviews for testing")
test_reviews

# Model

## model construction

In [18]:
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256, vader_size=MAX_LEN_VADER)

NameError: name 'MAX_LEN_VADER' is not defined

## train the model

In [None]:
# set model to training mode
model.train()

In [None]:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

# TODO: fix this block
DATASET = train_reviews


since = time.time()

# start training
for epoch in range(num_epochs):
    indices = np.random.permutation(range(DATASET.size))
    t = tqdm.notebook.tqdm(range(0, (DATASET.size // batch_size) + 1))

    for i in t:
        # batch
        batch = format_reviews(
            model_tokenizer, DATASET, indices[i * batch_size : (i + 1) * batch_size]
        )
        (batch_input, batch_target, batch_target_mask) = batch_to_torch(*batch)
        for item in (batch_input, batch_target, batch_target_mask):
            print(item.size())
        (batch_input, batch_target, batch_target_mask) = list_to_device(
            (batch_input, batch_target, batch_target_mask)
        )

        # forward pass
        prediction = model.forward(batch_input)
        loss = loss_fn(prediction, batch_target, batch_target_mask)
        losses.append(loss.item())
        accuracy = (
            th.eq(prediction.argmax(dim=2, keepdim=False), batch_target).float()
            * batch_target_mask
        ).sum() / batch_target_mask.sum()
        accuracies.append(accuracy.item())

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # visuallize data
        if i % 100 == 0:
            batch_val = build_batch(d_valid, range(len(d_valid)))
            (batch_input_val, batch_target_val, batch_target_mask_val) = list_to_device(
                batch_to_torch(*batch_val)
            )
            prediction_val = model.forward(batch_input_val)
            loss_val = loss_fn(prediction_val, batch_target_val, batch_target_mask_val)
            print("Evaluation set loss:", loss_val.item())
            print(
                f"Epoch: {epoch} Iteration: {i} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}"
            )

## evaluate model

In [None]:
# set model to evaluation model
model.eval()

In [None]:
# model(yelp_ratings["text"])