# Global Setup

In [4]:
!pip install transformers==4.5.0
!pip install sentencepiece
!pip install segtok
!pip install vaderSentiment
!pip install nltk
!pip install huggingface_hub

Collecting transformers==4.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/81/91/61d69d58a1af1bd81d9ca9d62c90a6de3ab80d77f27c5df65d9a2c1f5626/transformers-4.5.0-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.2MB 10.0MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 55.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.8MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.0
Collecting sentencepiece
[?25l  Downloading https:/

In [5]:
import os
import sys
from pathlib import Path

import json
import pandas as pd
import random

import torch
from segtok import tokenizer
from keras.preprocessing.sequence import pad_sequences
import tqdm

from multiprocessing import Pool

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import tokenize

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
ROOT_FOLDER = Path("/content/drive/My Drive/cs182_final_project/cs182-nlp")
DATA_FOLDER = ROOT_FOLDER / "dataset"
TORCH_CHECKPOINT_MODEL = ROOT_FOLDER / "output" / "training_checkpoint_chandana.pt"

input("Please check to make sure the above checkpoint directory is yours (Hit any key)")

Please check to make sure the above checkpoint directory is yours (Hit any key)


''

In [8]:
sys.path.append(ROOT_FOLDER)

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [10]:
list_to_device = lambda th_obj: [tensor.to(device) for tensor in th_obj]

# Model Params

In [11]:
MAX_LEN = 128
#MAX_LEN_VADER = 40
BATCH_SIZE = 32
EPOCHS = 5

# Higher bound settings: MAX_LEN = 256 and BATCH_SIZE = 16

#Data Preprocessing Functions

## load data

In [12]:
def load_json(file_path, filter_function=lambda x: True):
  """
  file_path - full path of the file to read from
  filter_function - a data selection function, returns True to ADD a data point
  """
  result = []

  try:
    with open(file_path, "r") as f:
      for line in f:
        json_line = json.loads(line)
        if not filter_function(json_line):
          # Disallow via opposite of allow
          continue
        result.append(json_line) # each line is one data point dictionary
    return pd.DataFrame.from_records(result)
    # return result

  except IOError:
    print(f"cannot open {file_path}")
    return None

## data formatting

### tokenize

In [13]:
def tokenize(data):
  """
  data - an iterable of sentences
  """
  token_set = set()
  i = 0
  for sentences in data:
    if i % 1000 == 0:
      print(i, end=", " if i % 15000 != 0 else "\n")
    tokenized = nltk.word_tokenize(sentences.lower())
    for token in tokenized:
      token_set.add(token)
    i += 1
  return token_set

In [14]:
def tokenize_review(tokenizer, review_text):
  encodings = tokenizer.encode_plus(review_text, add_special_tokens=True,
                                    max_length=MAX_LEN,
                                    return_token_type_ids=False,
                                    return_attention_mask=False,
                                    truncation=True,
                                    pad_to_max_length=False)
  return encodings.get("input_ids", [])


### padding

In [15]:
def pad_sequence(numerized, pad_index, to_length, beginning=True):
    pad = numerized[:to_length]
    if beginning:
      padded = [pad_index] * (to_length - len(pad)) + pad
    else:
      padded = pad + [pad_index] * (to_length - len(pad))
    mask = [w != pad_index for w in padded]
    return padded, mask

### batching

In [16]:
def batch_to_torch_long(*batches):
  if len(batches) == 1:
    return torch.LongTensor(batches[0])
  return [torch.LongTensor(batch) for batch in batches]

def batch_to_torch_float(*batches):
  if len(batches) == 1:
    return torch.FloatTensor(batches[0])
  return [torch.FloatTensor(batch) for batch in batches]

### full data format

### split dataset

In [17]:
# https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=0):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    # m = df.size
    m = len(df.index)

    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end

    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]

    assert train.size + validate.size + test.size == df.size

    return train, validate, test

# Data Preprocessing Code

## load data

In [18]:
# load yelp data
yelp_reviews = load_json(DATA_FOLDER / "yelp_review_training_dataset.jsonl")
print("loaded", len(yelp_reviews.index), "data points")

loaded 533581 data points


In [19]:
yelp_reviews = yelp_reviews[0:10000]
display(yelp_reviews)

Unnamed: 0,review_id,text,stars
0,Q1sbwvVQXV2734tPgoKj4Q,Total bill for this horrible service? Over $8G...,1.0
1,GJXCdrto3ASJOqKeVWPi6Q,I *adore* Travis at the Hard Rock's new Kelly ...,5.0
2,2TzJjDVDEuAW6MR5Vuc1ug,I have to say that this office really has it t...,5.0
3,yi0R0Ugj_xUx_Nek0-_Qig,Went in for a lunch. Steak sandwich was delici...,5.0
4,11a8sVPMUFtaC7_ABRkmtw,Today was my second out of three sessions I ha...,1.0
...,...,...,...
9995,TDkRJ7b3yznA4YmqzNrNhQ,Creative Paradise just did my neighbor's back ...,1.0
9996,yzsca4ik4ousawRfsLLYJQ,Rugged warehouse is an interesting little stor...,3.0
9997,MH0ckhQKQu-eIqbCgjsrhw,I can't pinpoint exactly what it is that separ...,4.0
9998,tuDIfqFjtj5zTLjtY9W1Hg,Police Station Pizza has always been my favori...,5.0


## format + split data into train, val, and test sets

In [39]:
#SPLIT TRAIN INTO A DATASET INTO EQUAL NUMBER OF REVIEWS FOR EACH RATING. 
def train_validate_test_split_equal(df, train_percent=.6, validate_percent=.2, seed=0):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)

    rating_one = df[df['stars'] == 1.0]
    perm_one = np.random.permutation(rating_one.index)
    print(len(rating_one))
    print(int(train_percent*m/5))
    print(perm_one[:1000])
    train_one = rating_one.iloc[perm_one[:1000]]

    rating_two = df[df['stars'] == 2.0]
    perm_two = np.random.permutation(rating_two.index)
    train_two = rating_two.iloc[perm_two[:(int(train_percent*m)/5)]]

    rating_three = df[df['stars'] == 3.0]
    perm_three = np.random.permutation(rating_three.index)
    train_three = rating_three.iloc[perm_three[:(int(train_percent*m)/5)]]

    rating_four = df[df['stars'] == 4.0]
    perm_four = np.random.permutation(rating_four.index)
    train_four = rating_four.iloc[perm_four[:(int(train_percent*m)/5)]]

    rating_five = df[df['stars'] == 5.0]
    perm_five = np.random.permutation(rating_five.index)
    train_five = rating_five.iloc[perm_five[:(int(train_percent*m)/5)]]

    train = pd.concat(train_one, train_two, train_three, train_four, train_five) 
    
    leftover = pd.concat(perm_one[(int(train_percent*m)/5):], perm_two[(int(train_percent*m)/5):], perm_three[(int(train_percent*m)/5):], perm_four[(int(train_percent*m)/5):], perm_five[(int(train_percent*m)/5):]) 
  
    validate_end = int(validate_percent * m)
    validate = leftover.iloc[perm[:validate_end]]
    test = leftover.iloc[perm[validate_end:]]

    assert train.size + validate.size + test.size == df.size

    return train, validate, test

In [40]:
# train 75% | validation 15% | test 10%
train_ratio = .50
validate_ratio = .40
test_ratio = .10
assert train_ratio + validate_ratio + test_ratio == 1

In [41]:
train_reviews, validate_reviews, test_reviews = train_validate_test_split_equal(yelp_reviews, train_ratio, validate_ratio)
# train_reviews_df, val_reviews_df, test_reviews_df = train_validate_test_split(yelp_reviews, train_ratio, validate_ratio)

2017
1000
[7711 7246 1373 2761 3658 5612 9679  379 5644 2335 4717 4050 3192 6163
 4066 4226 7997 6466 8803 5610 3062 5844 1749 6907 4975 1978 1756 3651
  799  861 1789 9951 6404 1940  466 1415 3100 1699 3385  522 8835 1591
 3403 8521 5013 3170 4477 1802 9561 4611 9332 2506 8844 2892  664 2745
 2594 7112 1014 3197 5959  968 6384 7378 6248  935 6181 2948  265  794
 8994 6079 3044 2917 6525 1370 6171 2463 2484 4351 4131 6577 9602 6542
 8474 3609 4022 3146 5068 8636 3638 3185 5763 4175  814 5821 5585 1602
 8802 3183 9285 1051 3586 4944 5119   66 5312  866 1694  887 3466 6916
 8367 4009 4646 4415 9307 8867 4986 6493 4996 6545 6032 8790 7751 2106
 6051 4247 7300 1132 8173 7888 6712 4452 6438 9942 9518  901 7575 4603
 9497 1782 6655 9345 6580 7220 8014 8005 3641 5481 9452 6033 6637 1277
 1664 3144 1766 9688 5682 7071 6094 4641 5650 5987 4655  145 9770 7609
 2767  848  911  374 2531 6720 5271 5715 7556 6273 2737 5829 7381 5979
 2363  847 8557 1687 8066 3509 9326 4980 1199  250 4846 7968 9012 6

IndexError: ignored

In [20]:
# train_reviews, train_reviews_target, train_reviews_mask = format_reviews(xlnet_tokenizer, train_reviews_df)
# validate_reviews, test_reviews_target, validate_reviews_mask = format_reviews(xlnet_tokenizer, validate_reviews_df)
# test_reviews, test_reviews_target, _ = format_reviews(xlnet_tokenizer, test_reviews_df)

In [21]:
print(len(train_reviews.index), "yelp reviews for training")
train_reviews

5000 yelp reviews for training


Unnamed: 0,review_id,text,stars
9394,sA7BAih3Dx0ExgAWWfqb9g,I go to this Frys because my prescriptions are...,1.0
898,tTE48wqGP0LyEsnwCdiJWw,the ice cream is great but the owner is very r...,2.0
2398,mSmQkm0yKsgJJkbHeDBDcw,I've been coming to Family Doctors of Green Va...,5.0
5906,rlni5y-c32ustqTxl0hMHg,I have used Creative Event Rentals for a few d...,5.0
2343,z7DXbsKWvEd6EjJreidSpA,I came by with some friends to give Boteco a t...,3.0
...,...,...,...
3996,Qjm-Vmv72-FIawENZ8L3fA,My little girl started at SpringStone Montesso...,5.0
5889,KmGkeweiG2ex-TFRNtwZag,I have always been a fan of this local gem. Ja...,4.0
4577,dIlxFWqAZ2wwd2BBDC_9hg,This is our favorite local Mexican restaurant....,5.0
8600,KO2uv_tomqwJGnc_3nFFAg,I've lived in the Parkdale area for a while no...,4.0


In [102]:
train_reviews[train_reviews['stars'] == 5.0] ###TOO MANY 5 STAR REVIEWS 

Unnamed: 0,review_id,text,stars
2398,mSmQkm0yKsgJJkbHeDBDcw,I've been coming to Family Doctors of Green Va...,5.0
5906,rlni5y-c32ustqTxl0hMHg,I have used Creative Event Rentals for a few d...,5.0
8225,J1wEAjjMJtM66I2E8EpHOg,I went to this nail salon for the first time t...,5.0
5506,e_VCWhdExZoi0MR79rQBOw,"After reviewing paint contractors on Yelp, I c...",5.0
6451,m8tIOCCcEOaKrm95ii5C6A,Each time I visit I'm surprised by the variety...,5.0
...,...,...,...
5832,fI42sJ11eikF4yCuJzzXcA,I'm on my 3rd mortgage...all thanks to Taum an...,5.0
5408,Rrje_Iyea73thdF_aXptAQ,Today was my first adjustment but I can alread...,5.0
3851,NLcdH_VllLlyB9wMcfwFDA,Guess we were lucky we got such great service!...,5.0
3996,Qjm-Vmv72-FIawENZ8L3fA,My little girl started at SpringStone Montesso...,5.0


In [22]:
print(len(validate_reviews.index), "yelp reviews for validation")
validate_reviews

4000 yelp reviews for validation


Unnamed: 0,review_id,text,stars
333,kKYoJIofGkSmtuCDgq0Lmg,I should have listened at the other reviews fo...,2.0
6391,6dgPuFtYQn9tj8kePNP4ag,I've been waiting to try this place for months...,5.0
4786,dvbvomiSNrcA-FWBBeA7KQ,We stayed at the MGM grand to take a tour of t...,5.0
357,XN_6dESVdZAcIMs92y5raQ,We were here on a Saturday afternoon for lunch...,3.0
9854,zR14PDbLWeJQD9lWIE8P-A,Better burrito than the other Mexican restaura...,4.0
...,...,...,...
7454,Zh8JuFw3EKqwNgA6l8ceCQ,"All about the greek salad here, add some chick...",3.0
5156,0Vj1pK44WkRgcNxSd5k0CQ,"Super good, I love their mangoniada with a sco...",5.0
9583,EBoKEzvl_ER5is8Nkwk70g,My husband took his 2006 VW Jetta to ACE Car S...,5.0
8177,Pn5dXIOFgys_2Hoyq8eDQQ,This was hands down the worst mani/pedi I have...,1.0


In [23]:
print(len(test_reviews.index), "yelp reviews for testing")
test_reviews

1000 yelp reviews for testing


Unnamed: 0,review_id,text,stars
4361,4xIean1TgQyrLZnkaN0wFg,Mon restaurant portugais préféré à Montréal de...,5.0
3257,B3qM3oqzS2wu6yNrYxHOlQ,I love this class! My son's development and in...,5.0
5681,hMM9tgSrp3I9eUye8SSvoA,Just took the Sunday 10:00am class with Jen. ...,4.0
6064,G2d1RaHJK-bFmpQYXzvE2w,The Twins are fabulous! I purchased my Valley ...,5.0
7459,Ly6sOTfBvRap-Vu5eHBAqg,I usually don't complain or take time to write...,1.0
...,...,...,...
9225,ICLUM9BRPBzF6qwTE6eGWQ,Our new go to place for breakfast. Nearly ever...,5.0
4859,4Spf13dGUIUJ8XQckHMwdw,Definitely the go to place for stuff you need ...,5.0
3264,762bdV36VTkmwK1Gz47RWQ,Thank you for your timely response to my reque...,4.0
9845,SWH8YiyAD7YIt5WgkgYGfg,"Our car Lurker died, and we were looking for a...",4.0


In [24]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification

In [25]:
#experimentation 

In [26]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=200)
model.classifier.add_module('bert_activation', nn.Tanh())
model.classifier.add_module('prediction', nn.Linear(200, 5))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [27]:
model.classifier

Linear(
  in_features=768, out_features=200, bias=True
  (bert_activation): Tanh()
  (prediction): Linear(in_features=200, out_features=5, bias=True)
)

In [28]:
from transformers.data.processors.utils import InputExample


In [29]:
from transformers.data.processors.glue import glue_convert_examples_to_features

In [65]:
def get_features(df, text_col, label_col):
    l = [InputExample(guid=idx, text_a=df.loc[idx, text_col], label=df.loc[idx, label_col]) for 
       idx, row in tqdm.tqdm(df.iterrows(), total=df.shape[0])]
    features = glue_convert_examples_to_features(examples=l, 
                                    tokenizer=tokenizer,
                                    max_length=128,
                                    label_list = df[label_col].values,
                                    output_mode='regression')

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    print(all_labels)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels)
    return dataset


In [66]:
test_reviews['stars'][0:10]

4361    5.0
3257    5.0
5681    4.0
6064    5.0
7459    1.0
373     5.0
8926    3.0
9162    5.0
2596    2.0
4080    5.0
Name: stars, dtype: float64

In [68]:
from torch.utils.data import TensorDataset, DataLoader

In [69]:
train_dataset = get_features(train_reviews, 'text', 'stars')
test_dataset = get_features(test_reviews, 'text', 'stars')

100%|██████████| 5000/5000 [00:00<00:00, 11000.20it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11412.67it/s]

tensor([1, 2, 5,  ..., 5, 4, 1])





tensor([5, 5, 4, 5, 1, 5, 3, 5, 2, 5, 4, 5, 5, 5, 5, 1, 5, 5, 5, 5, 1, 4, 5, 2,
        5, 1, 1, 4, 4, 2, 4, 5, 1, 5, 1, 3, 5, 4, 1, 5, 5, 1, 4, 1, 5, 1, 1, 1,
        5, 5, 5, 5, 5, 5, 2, 5, 1, 1, 1, 5, 5, 3, 4, 5, 5, 5, 5, 5, 3, 5, 5, 1,
        4, 5, 1, 2, 5, 1, 1, 5, 1, 3, 1, 1, 5, 5, 5, 4, 5, 5, 5, 5, 5, 1, 5, 1,
        2, 4, 5, 4, 5, 5, 5, 1, 4, 2, 3, 4, 5, 4, 1, 5, 5, 5, 5, 5, 2, 1, 5, 1,
        2, 5, 5, 5, 2, 5, 2, 2, 4, 4, 5, 2, 3, 5, 5, 4, 5, 1, 4, 3, 4, 5, 1, 5,
        5, 4, 2, 5, 5, 4, 1, 4, 5, 5, 5, 5, 5, 1, 1, 4, 5, 1, 4, 5, 5, 4, 1, 4,
        5, 1, 5, 1, 5, 4, 1, 5, 5, 5, 1, 1, 1, 4, 4, 5, 4, 1, 5, 5, 3, 5, 5, 5,
        4, 4, 5, 1, 1, 1, 5, 2, 5, 5, 2, 4, 4, 5, 3, 4, 1, 1, 5, 1, 3, 5, 1, 5,
        4, 1, 5, 5, 1, 5, 1, 3, 3, 4, 1, 4, 2, 5, 4, 1, 5, 1, 3, 3, 1, 1, 5, 4,
        1, 1, 5, 5, 4, 4, 5, 1, 5, 5, 2, 1, 1, 1, 4, 5, 2, 3, 5, 5, 4, 1, 5, 4,
        3, 5, 5, 5, 2, 4, 5, 5, 4, 5, 1, 5, 5, 3, 2, 5, 5, 5, 3, 5, 4, 3, 5, 5,
        3, 1, 5, 1, 1, 3, 5, 5, 1, 5, 5,

In [70]:
len(train_dataset[0][1])

128

In [71]:
train_dataset[0:9]
#the way this is structured, 3 matrices, each row of the matrix is one sentence. matrix 1) sentence tokenized 2) attention mask 3) target review. 

(tensor([[  101,   146,  1301,  ...,     0,     0,     0],
         [  101,  1103,  2854,  ...,     0,     0,     0],
         [  101,   146,   112,  ...,   146,  6759,   102],
         ...,
         [  101,  1258, 19730,  ...,  2261,  1110,   102],
         [  101,  2994,  1159,  ...,     0,     0,     0],
         [  101,  2066,  2802,  ...,  1198,  1458,   102]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 1, 1, 1]]),
 tensor([1, 2, 5, 5, 3, 5, 5, 5, 4]))

In [72]:
validation_dataset = get_features(validate_reviews, 'text', 'stars')

100%|██████████| 4000/4000 [00:00<00:00, 11148.01it/s]


tensor([2, 5, 5,  ..., 5, 1, 5])


In [73]:
#add requires_grad for each parameter 
for param in model.classifier.parameters():
    param.requires_grad = True
model.cuda();

In [74]:
output = model(input_ids=train_dataset[:2][0].cuda(), 
      attention_mask=train_dataset[:2][1].cuda(), 
      labels=train_dataset[:2][2].cuda());

print(output[:2])

(tensor(2.3787, device='cuda:0', grad_fn=<NllLossBackward>), tensor([[  7.9288,   6.9673,   6.3678,   6.2745,   8.5522, -11.4989, -12.1174,
         -11.4633, -11.2982, -11.4607, -10.8837, -10.8444, -11.2592, -10.9864,
         -10.5308, -11.4406, -10.1246, -10.4332, -10.2586, -11.0173, -10.7564,
         -10.7302, -10.1538,  -9.9445, -10.5819, -11.1568, -11.0880, -10.8768,
         -10.2505, -11.0153, -10.6978, -10.1884, -10.7739, -11.0971, -10.9958,
         -10.4222, -11.4245, -10.9608, -11.0285, -10.9922, -11.2044, -10.5780,
         -10.6639, -11.5561, -11.0720, -10.6187, -11.4375, -11.2885, -10.9179,
         -11.6151, -11.7864, -10.8343, -10.5322, -10.9621, -10.6175, -11.4052,
         -11.0633, -10.7927, -10.9963, -11.4808, -10.5378, -11.3742, -11.4733,
         -10.7270, -11.8483, -10.8836, -11.3428, -10.0429, -11.1309, -10.7940,
         -11.3557, -11.0276, -11.2759, -10.3529, -10.9117, -10.4857, -11.4879,
          -9.8851, -12.0496, -10.8175, -10.3905, -10.9613, -10.4126, -

In [75]:
from torch.optim import Adam

In [76]:
batch_size = 32
gradient_every = 64

accumulation_steps = gradient_every//batch_size

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(validation_dataset, batch_size=batch_size*2, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size*2, shuffle=False)

epochs = 15

lr = 0.002
optimizer = Adam(model.classifier.parameters(), lr=lr)  

In [77]:
from tqdm import tnrange

In [78]:
tr_losses = []
v_losses = []

for epoch in tqdm.notebook.tnrange(epochs, desc='epoch'):
    """ Training stage """
    model.train()
    epoch_tr_losses = []
    print(f'epoch {epoch+1}')
    for k, (input_ids, attention_mask, labels) in enumerate(tqdm.tqdm(train_dataloader, total=len(train_dataloader), desc='batch')):
        feed_dict = {'input_ids': input_ids.cuda(),
                     'attention_mask': attention_mask.cuda(),
                     'labels': labels.cuda()}
        
        output = model(**feed_dict)
        # gradient accumulation
        epoch_tr_losses.append(output.loss.item())
        output.loss = output.loss/accumulation_steps
        output.loss.backward()
        if (k + 1) % accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()

    tr_losses.append(np.mean(epoch_tr_losses))
    print(f'train NLL loss: {np.mean(epoch_tr_losses)}')
  
    """ Validation stage """
    epoch_v_losses = [] 
    with torch.no_grad():
        for k, (input_ids, attention_mask, labels) in enumerate(tqdm.tqdm(val_dataloader, total=len(val_dataloader), desc='val batch')):
            feed_dict = {'input_ids': input_ids.cuda(),
                         'attention_mask': attention_mask.cuda(),
                         'labels': labels.cuda()} 

            output = model(**feed_dict)
            epoch_v_losses.append(output.loss.item())
        v_losses.append(np.mean(epoch_v_losses))
    print(f'validation BCE loss: {np.mean(epoch_v_losses)}')
    torch.save(model.classifier.state_dict(), str(ROOT_FOLDER / "state_dict_model.pt"))

HBox(children=(FloatProgress(value=0.0, description='epoch', max=15.0, style=ProgressStyle(description_width='…

batch:   0%|          | 0/157 [00:00<?, ?it/s]

epoch 1


batch: 100%|██████████| 157/157 [01:38<00:00,  1.59it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 2.399008463902079


val batch: 100%|██████████| 63/63 [00:30<00:00,  2.05it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.3998945478409055
epoch 2


batch: 100%|██████████| 157/157 [01:45<00:00,  1.49it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.3558917911189377


val batch: 100%|██████████| 63/63 [00:31<00:00,  1.99it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.3412584255612086
epoch 3


batch: 100%|██████████| 157/157 [01:48<00:00,  1.45it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.3149264154920153


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.3009833362367418
epoch 4


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.2601101497176346


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.94it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.2516989329504589
epoch 5


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.2421472941993907


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.92it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.2403872296923684
epoch 6


batch: 100%|██████████| 157/157 [01:50<00:00,  1.42it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.2080047745613536


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.197847622727591
epoch 7


batch: 100%|██████████| 157/157 [01:50<00:00,  1.42it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1957107281229298


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.93it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.1798507750980438
epoch 8


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1865467793622595


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.94it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.2129425794359237
epoch 9


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.194013627471438


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.222401788310399
epoch 10


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1735540484167208


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.94it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.1410639560411846
epoch 11


batch: 100%|██████████| 157/157 [01:49<00:00,  1.43it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1587524376097758


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.1613511149845426
epoch 12


batch: 100%|██████████| 157/157 [01:49<00:00,  1.44it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1570124838762224


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.212028213909694
epoch 13


batch: 100%|██████████| 157/157 [01:49<00:00,  1.44it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1390902737902988


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.164008565365322
epoch 14


batch: 100%|██████████| 157/157 [01:49<00:00,  1.44it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1544394728484426


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
batch:   0%|          | 0/157 [00:00<?, ?it/s]

validation BCE loss: 1.1277007971491133
epoch 15


batch: 100%|██████████| 157/157 [01:49<00:00,  1.44it/s]
val batch:   0%|          | 0/63 [00:00<?, ?it/s]

train NLL loss: 1.1433799141531538


val batch: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]

validation BCE loss: 1.1287100939523607






In [79]:
batch_predictions, batch_actual = [], []
with torch.no_grad():
    for k, (input_ids, attention_mask, labels) in enumerate(tqdm.tqdm(test_dataloader, total=len(test_dataloader), desc='test batch')):
        feed_dict = {'input_ids': input_ids.cuda(),
                     'attention_mask': attention_mask.cuda()} 
        
        pred = model(**feed_dict)[0].cpu()
        batch_predictions.append(pred.numpy())
        batch_actual.append(labels)

test batch: 100%|██████████| 16/16 [00:07<00:00,  2.10it/s]


In [80]:
predictions = np.array([i for k in batch_predictions for i in k ])

predictions = np.argmax(predictions, axis=1)
actual = np.array([i for k in batch_actual for i in k ])

In [81]:
test_reviews

Unnamed: 0,review_id,text,stars
4361,4xIean1TgQyrLZnkaN0wFg,Mon restaurant portugais préféré à Montréal de...,5.0
3257,B3qM3oqzS2wu6yNrYxHOlQ,I love this class! My son's development and in...,5.0
5681,hMM9tgSrp3I9eUye8SSvoA,Just took the Sunday 10:00am class with Jen. ...,4.0
6064,G2d1RaHJK-bFmpQYXzvE2w,The Twins are fabulous! I purchased my Valley ...,5.0
7459,Ly6sOTfBvRap-Vu5eHBAqg,I usually don't complain or take time to write...,1.0
...,...,...,...
9225,ICLUM9BRPBzF6qwTE6eGWQ,Our new go to place for breakfast. Nearly ever...,5.0
4859,4Spf13dGUIUJ8XQckHMwdw,Definitely the go to place for stuff you need ...,5.0
3264,762bdV36VTkmwK1Gz47RWQ,Thank you for your timely response to my reque...,4.0
9845,SWH8YiyAD7YIt5WgkgYGfg,"Our car Lurker died, and we were looking for a...",4.0


In [84]:
test_reviews['prediction'] = predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [86]:
test_reviews['distance'] = np.abs(test_reviews['stars'] - test_reviews['prediction']) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [88]:
test_reviews[test_reviews['distance'] >= 3]

Unnamed: 0,review_id,text,stars,prediction,distance
3257,B3qM3oqzS2wu6yNrYxHOlQ,I love this class! My son's development and in...,5.0,1,4.0
7459,Ly6sOTfBvRap-Vu5eHBAqg,I usually don't complain or take time to write...,1.0,5,4.0
4080,JNIUh-D9EwYTdLT7B2MN9Q,I recently went to this place for cleaning my ...,5.0,1,4.0
5691,MqjBR2zz4rgyKtB-cKwZGw,Our third and final stop on the Restaurant Wee...,4.0,1,3.0
6015,SvpjSyXM2ZBcl4Z35hdSwQ,I really liked this gym when I went there. My ...,1.0,5,4.0
...,...,...,...,...,...
1207,VncR4VqbwldiJ7PKaSkKzQ,Went to the White Brick Kitchen three times in...,4.0,1,3.0
3219,34NDgFx1Ve4Eqk141j0hbQ,We regularly take our elderly mother who lives...,1.0,5,4.0
797,zc89sz0aPxs86BsfGa-qfg,Picture Homer Simpson when he sees something r...,5.0,1,4.0
755,afPyJp-v9TtRxqNCjU6rRA,"As convenience stores go, this one could be th...",1.0,5,4.0


In [97]:
huge_differences = test_reviews[test_reviews['distance'] >= 3]

In [101]:
#OVERPREDICTIONS
huge_differences.iloc[1]['text']

"I usually don't complain or take time to write reviews but my experience with dr Petar has been nothing but horrible.  This guy has managed to practice all the things no professional in any field should do.  Doctor-patient manners are non existent.  Doesn't keep his cellphone on silent or vibrate,  answers private calls and texts in the middle of your appointment on your dime and time, doesn't even bother to excuse himself or apologize for obvious rudeness, treats numerous patience at the same time and at any given appointment he would leave multiple times to do other things while he can't even pretend to care about your needs.  Couple times I was scheduled early in the morning and each time after a 20-30min wait I was notified by nurses that dr Petar is not even in the office yet.  When I brought it to his attention that my time is valuable too and shouldn't be scheduled at times he is still at home, shockingly he had a nerve to say go find another doctor if you don't like the servic

In [98]:
#UNDERPREDICTIONS 
huge_differences.iloc[0]['text']

"I love this class! My son's development and interaction with others has taken off since he started a little over a month ago. Every class he does a little more than the last. Sue is a great teacher. She's great at making it fun while helping them to learn self control and patience. Someone else commented on the condition of the puppets and parachute; yeah they are a bit crusty and probably could be cleaned a little better, but I decided to put my germaphobe fears aside for my child's development. Even if he gets a cold or two from class (which he hasn't after 5 weeks) that seems to be par for the course when it comes to interacting with other children and the benefits to his development outweigh a few sniffles in my opinion!"

In [99]:
huge_differences.iloc[2]['text']

"I recently went to this place for cleaning my car, after one of my Uber passengers throw up on the back seat. Since Uber was not kind enough to cover the fees entirely, I asked Evan if we can clean the car with the amount that Uber will cover, and he was really generous to accept, giving a signification discount to me so I can clean my car with the amount Uber provided. The service was great. They cleaned the whole interior and treated the cabin with ozone, so the smell completely disappeared (note that vomit has a terrible odour!). \n\nI never saw my car so clean, since I bought it used, and even the time I purchased the car from the dealership, it wasn't as clean as I got the car from this shop. \n\nAbout what they did, they washed the seats with shampoo and by vacuum machine sucked the whole water out, so I didn't feel the seats being wet. They also cleaned my dashboard and all the plastic components in the car and I beloved waxed them as well. \n\nOverall, very happy with their se

#Model

## model construction

In [None]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, rnn_size, vader_size, num_layers=1, dropout=0, use_vader=True):
        super().__init__()
        
        #################
        #    INPUT 1    #
        #################
        # Create an embedding layer, with 768 hidden layers
        self.xlnet = torch.hub.load('huggingface/pytorch-transformers', 'model', 'xlnet-base-cased')
        for param in self.xlnet.layer.parameters():
          param.requires_grad = False

          
        # Output: (vocab_size x 768), where 768 hidden layers of XLNet

        # Coming in: torch.Size([BATCH_SIZE, vocab_size, 768])
        #   (XLNet has 768 hidden layers, https://huggingface.co/transformers/pretrained_models.html)
        conv2d_c_in = 1
        conv2d_c_out = 1
        conv2d_kernel_W = 5 # along Embedding Length
        conv2d_kernel_H = 5 # along Word Length

        self.conv2D_layer = nn.Conv2d(conv2d_c_in, conv2d_c_out, (conv2d_kernel_H, conv2d_kernel_W))
        # Filter of (conv2d_kernel_H, conv2d_kernel_W), Cin = 1, Cout = 1

        # Output:
        conv2d_out_Hout = vocab_size - ((conv2d_kernel_H - 1) // 2) * 2 # Vocab Size
        conv2d_out_Wout = 768 - ((conv2d_kernel_W - 1) // 2) * 2        # length

        self.max_pool_2d = nn.MaxPool2d((conv2d_out_Hout, 1))
        max_pool_2d_out_height = conv2d_out_Hout // conv2d_out_Hout
        max_pool_2d_out_length = conv2d_out_Wout // 1
        #################
        #  INPUT 1 END  #
        #################
        
        #################
        #    INPUT 2    #
        #################
        self.lstm = None
        if use_vader:
          self.lstm = nn.LSTM(input_size=1, hidden_size=1, num_layers=num_layers, batch_first=True, dropout=dropout)
        else:
          vader_size = 0
        #################
        #  INPUT 2 END  #
        #################

        self.dropout = nn.Dropout(dropout)
        # print(max_pool_2d_out_length + vader_size)

        hidden_layer_dense = 100

        self.dense = nn.Sequential(
                nn.Linear(max_pool_2d_out_length + vader_size, hidden_layer_dense),
                nn.ReLU()
            )
        self.output = nn.Linear(hidden_layer_dense, 5) # classify yelp_reviews into 5 ratings
    
    def forward_input_vectorized(self, x):
      xlnet_out = self.xlnet(x)
      xlnet_out_hidden = xlnet_out.last_hidden_state
      batches_len, word_len, embedding_len = xlnet_out_hidden.shape
      xlnet_out_hidden = xlnet_out_hidden.reshape(batches_len, 1, word_len, embedding_len)
      conv2d_out = self.conv2D_layer(xlnet_out_hidden)
      result = self.max_pool_2d(conv2d_out)
      # print(result.shape)
      result = result.squeeze(1).squeeze(1)
      return result

    def forward_input_vader(self, x):
      batch_size, vader_len = x.shape
      # print(x.reshape(batch_size, vader_len, 1).shape)
      output, _ = self.lstm(x.reshape(batch_size, vader_len, 1))
      # print(output.shape)
      output = output.squeeze(2)
      return output

    def forward(self, vectorized_words, vader):
        input1 = self.forward_input_vectorized(vectorized_words)

        if self.lstm:
          input2 = self.forward_input_vader(vader)
          combined_input = (input1, input2)
        else:
          combined_input = (input1,) # Tuples need the stray comma

        # print(input1.size(), input2.size())

        combined_input = torch.cat(combined_input, dim=1)

        lstm_drop = self.dropout(combined_input)
        logits = self.dense(lstm_drop)
        logits = self.output(logits)
        return logits
    
    def loss_fn(self, prediction, target):
      loss_criterion = nn.CrossEntropyLoss(reduction='none')
      return torch.mean(loss_criterion(prediction, target))

In [None]:
model = LanguageModel(vocab_size=MAX_LEN, rnn_size=256, vader_size=MAX_LEN_VADER)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_master


## train the model

In [None]:
# num_of_validaion_set = 20 #len(validate_reviews)

# batch_val = format_reviews(xlnet_tokenizer, validate_reviews, range(num_of_validaion_set), review_sentiment_dict=review_sentiment_dict) # This cell may take a while

# (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val

In [None]:
def run_validation(model, use_all=False, mode="val"):
  reviews_dataset = None
  if mode == "val":
    print("Running Validation")
    mode = "Validation"
    reviews_dataset = validate_reviews
  elif mode == "test":
    print("Running Testing")
    mode = "Test"
    reviews_dataset = test_reviews
  else:
    assert False, "Invalid mode"
  num_of_review_set = len(reviews_dataset) if use_all else 1000
  indices = np.random.permutation(len(reviews_dataset))
  t = tqdm.notebook.tqdm(range(0, ( num_of_review_set // BATCH_SIZE) + ( 1 if num_of_review_set % BATCH_SIZE > 0 else 0 )))
  loss_val_total = 0
  accuracy_val_total = 0
  temp_count = 0
  for i in t:
    val_start_i = i*BATCH_SIZE
    val_end_i = (i+1)*BATCH_SIZE
    # print(val_start_i, val_end_i, indices.shape)
    batch_val = format_reviews(xlnet_tokenizer, reviews_dataset, indices[val_start_i:val_end_i], review_sentiment_dict=review_sentiment_dict)
    (batch_input_val, batch_target_val, batch_review_sentiment_val, batch_target_mask_val) = batch_val
    # print(batch_input_val.shape, batch_review_sentiment_val.shape)
    (batch_input_val, batch_target_val) = list_to_device((batch_input_val, batch_target_val))
    batch_target_mask_val, batch_review_sentiment_val = list_to_device((batch_target_mask_val, batch_review_sentiment_val))
    # print(batch_input_val.shape, batch_review_sentiment_val.shape)
    prediction_val = model.forward(batch_input_val, batch_review_sentiment_val)
    # print(prediction_val.size(), batch_target_val.size())
    # print(prediction_val, batch_target_val)
    loss_val_total += model.loss_fn(prediction_val, batch_target_val).item()
    # print(loss_val)
    accuracy_val_total += torch.mean(torch.eq(prediction_val.argmax(dim=1,keepdim=False),batch_target_val).float()).item()
    temp_count += 1
    if i % round(8000 / BATCH_SIZE) == 0 and i != 0 and use_all:
      print(mode, "Prelim Evaluation set loss:", loss_val_total / temp_count, mode, "Prelim Accuracy:", accuracy_val_total / temp_count)
  loss_val = loss_val_total / temp_count
  accuracy_val = accuracy_val_total / temp_count
  print(mode, "Evaluation set loss:", loss_val, mode, "Accuracy set %:", accuracy_val)

In [None]:
losses = []
accuracies = []

epoch_start = 0
t_start = 0

In [None]:
# ONLY RUN THIS CELL (and next cell) if want to load checkpoint
# If you accidentally run this cell, no harm done (be careful with next cell!!!)

checkpoint = None
try:
  checkpoint = torch.load(str(TORCH_CHECKPOINT_MODEL))
  print("Checkpoint loaded")
except:
  print("No Checkpoint loaded")

No Checkpoint loaded


In [None]:
lr = 1e-4
optimizer_method = optim.Adam
optimizer = optimizer_method(model.parameters(), lr=lr)

In [None]:
# ONLY RUN THIS CELL if want to load checkpoint

if checkpoint:
  epoch_start = checkpoint['epoch']
  t_start = checkpoint['t']
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer = optimizer_method(model.parameters(), lr=lr)
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  losses = checkpoint['losses']
  accuracies = checkpoint['accuracies']
  model.to(device)

  print("Checkpoint")
  run_validation(model)

  print(f"Checkpoint Epoch: {epoch_start} Iteration: {i} Loss: {np.mean(losses[-10:])} Accuracy: {np.mean(accuracies[-10:])}")

In [None]:
# set model to training mode
# Needs to be placed after the Checkpoint file loading
model.train()

LanguageModel(
  (xlnet): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [None]:
# https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html

DATASET = train_reviews

# Constants of interest: BATCH_SIZE, EPOCHS

since = time.time()

# start training
for epoch in range(epoch_start, EPOCHS):
  indices = np.random.permutation(DATASET.shape[0])

  dataset_batch_cap = ( DATASET.shape[0] // BATCH_SIZE ) + (1 if DATASET.shape[0] % BATCH_SIZE > 0 else 0)

  t = tqdm.notebook.tqdm(range(t_start, dataset_batch_cap), initial = t_start, total = dataset_batch_cap)
  
  for i in t:
    # batch
    batch = format_reviews(xlnet_tokenizer, DATASET, indices[i*BATCH_SIZE:(i+1)*BATCH_SIZE], review_sentiment_dict=review_sentiment_dict)
    (batch_input, batch_target, batch_review_sentiment, batch_target_mask) = batch
    # for item in (batch_input, batch_target, batch_review_sentiment, batch_target_mask):
    #   print(item.size())
    (batch_input, batch_target, batch_target_mask, batch_review_sentiment) = list_to_device((batch_input, batch_target, batch_target_mask, batch_review_sentiment))
    model.to(device)
    
    # forward pass
    prediction = model.forward(batch_input, batch_review_sentiment)
    # print(prediction.size(), batch_target.size())
    loss = model.loss_fn(prediction, batch_target)
    # print(loss)
    losses.append(loss.item())
    accuracy = torch.mean(torch.eq(prediction.argmax(dim=1,keepdim=False),batch_target).float())
    accuracies.append(accuracy.item())
    
    # backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    # visuallize data
    if i % 1000 == 0 and i != t_start:
      torch.save({'epoch': epoch,
                  't': i,
                  'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'losses': losses,
                  'accuracies': accuracies
                  }, str(TORCH_CHECKPOINT_MODEL))
      run_validation(model)
      print(f"Epoch: {epoch} Iteration: {i} Train Loss: {np.mean(losses[-10:])} Train Accuracy: {np.mean(accuracies[-10:])}")

  t_start = 0


HBox(children=(FloatProgress(value=0.0, max=8338.0), HTML(value='')))

RuntimeError: ignored

In [None]:
# Save the latest model
print("Saving latest model to", str(TORCH_CHECKPOINT_MODEL))
torch.save({'epoch': EPOCHS,
            't': (DATASET.shape[0] // BATCH_SIZE)+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': losses,
            'accuracies': accuracies
            }, str(TORCH_CHECKPOINT_MODEL))

## evaluate model

In [None]:
# set model to evaluation model
model.eval()

In [None]:
run_validation(model, True, "val")

In [None]:
run_validation(model, True, "test")

Running Testing


HBox(children=(FloatProgress(value=0.0, max=3335.0), HTML(value='')))

Test Prelim Evaluation set loss: 0.7104197494507193 Test Prelim Accuracy: 0.7723303393213573
Test Prelim Evaluation set loss: 0.7166598277030648 Test Prelim Accuracy: 0.7687937062937062
Test Prelim Evaluation set loss: 0.7128958379717011 Test Prelim Accuracy: 0.7719020652898068
Test Prelim Evaluation set loss: 0.714400890948384 Test Prelim Accuracy: 0.7722701149425287
Test Prelim Evaluation set loss: 0.7153426486878145 Test Prelim Accuracy: 0.7720661735305878
Test Prelim Evaluation set loss: 0.7138283398115349 Test Prelim Accuracy: 0.7721176274575141

Test Evaluation set loss: 0.04473292892023305 Test Accuracy set %: 0.0482227303123038


#Playground

In [None]:
# This is a cheap solution to stops any run all that reaches my Playground
hard_stop = input("Hard Stop here. Enter any key to allow passage.")

if len(hard_stop) == 0:
  raise Exception("Hard Stop")

In [None]:
print(list(tokenize(STARTER["text"]))[:3])

In [None]:
import urllib.request
import io
import sentencepiece as spm

# https://github.com/google/sentencepiece/tree/master/python

# Loads model from URL as iterator and stores the model to BytesIO.
model = io.BytesIO()
spm.SentencePieceTrainer.train(
      sentence_iterator=STARTER["text"], model_writer=model, vocab_size=1000)

# Serialize the model as file.
# with open('out.model', 'wb') as f:
#   f.write(model.getvalue())

# Directly load the model from serialized model.
sp = spm.SentencePieceProcessor(model_proto=model.getvalue())
print(sp.encode('this is test'))