In [None]:
'''
Dataset Description: https://www.tensorflow.org/datasets/catalog/yelp_polarity_reviews
Dataset Homepage: https://course.fast.ai/datasets
Dataset Download: https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz

In short, Yelp reviews in two categories: Bad and Good
'''

#Download 
!wget -nc  https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
!tar xzf '/content/yelp_review_polarity_csv.tgz'

--2020-07-22 12:07:08--  https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.112.229
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.112.229|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 166373201 (159M) [application/x-tar]
Saving to: ‘yelp_review_polarity_csv.tgz’


2020-07-22 12:07:20 (14.4 MB/s) - ‘yelp_review_polarity_csv.tgz’ saved [166373201/166373201]



In [None]:
'''
Read text data.
'''
from pandas import read_csv
import math
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
from functools import reduce
from tensorflow.data import Dataset

def massage(x):
  """Turn one (rating, text) row into a (UTF-8 encoded text, rating) pair.

  x: a two-element row in (rating, text) order, e.g. the row handed over by
     DataFrame.apply(axis=1) or a plain tuple.
  Returns a tuple (text as bytes, rating).
  """
  # Unpack by position instead of using x[0] / x[1]: integer keys on a
  # label-indexed Series are deprecated as positional lookups in pandas.
  rating, text = x
  return (text.encode('UTF-8'), rating)
def encode(text,rating):
  """Encode one review with the module-level `encoder` and shift its rating down by one.

  text: object exposing .numpy(), whose value is passed to encoder.encode.
  rating: the review's rating; the returned label is rating - 1.
  Returns (encoded text, rating - 1).
  """
  tokenIds = encoder.encode(text.numpy())
  shiftedRating = rating-1
  return tokenIds, shiftedRating

def encodeMapFunc(text,label):
  """Dataset.map adapter that runs the Python-side `encode` through tf.py_function.

  Returns the (encoded text, label) tensors, both declared as int64.
  """
  tokenIds, shiftedLabel = tf.py_function(func=encode,
                                          inp=[text,label],
                                          Tout=(tf.int64,tf.int64))

  # Declare the static shapes explicitly on the py_function outputs:
  # a variable-length vector of token ids and a scalar label.
  tokenIds.set_shape([None])
  shiftedLabel.set_shape([])
  return tokenIds,shiftedLabel


# Load both splits from the directory the archive was extracted into (relative
# to the working directory, which is where tar unpacks it).
# The CSV files carry no header row, so header=None keeps the first review of
# each file as data instead of consuming it as column names.
rawTrain = read_csv('yelp_review_polarity_csv/train.csv',header=None,names=['rating','text'])
rawTest = read_csv('yelp_review_polarity_csv/test.csv',header=None,names=['rating','text'])

# Swap every row to (UTF-8 text bytes, rating) and wrap the two columns in a tf.data.Dataset.
train = rawTrain.apply(massage,axis=1,result_type='expand').set_axis(['text','rating'],axis='columns')
trainTexts = train.pop('text')
trainRatings = train.pop('rating')
train = Dataset.from_tensor_slices((trainTexts,trainRatings))

test = rawTest.apply(massage,axis=1,result_type='expand').set_axis(['text','rating'],axis='columns')
testTexts = test.pop('text')
testRatings = test.pop('rating')
test = Dataset.from_tensor_slices((testTexts,testRatings))

# Build the vocabulary from the texts of BOTH splits. The previous loop walked
# the training dataset twice and never looked at the test texts.
tokenizer = tfds.features.text.Tokenizer()
vocab = set()
for texts in (trainTexts,testTexts):
  for text in texts:
    vocab.update(tokenizer.tokenize(text))
# Sort the tokens so the token -> id assignment is the same on every run
# (iteration order of a set of strings is not stable across interpreter runs).
encoder = tfds.features.text.TokenTextEncoder(sorted(vocab))

# Encode, shuffle and batch; padded_batch pads each batch to its longest review.
train = train.map(encodeMapFunc)
train = train.shuffle(10000)
train = train.padded_batch(128)


test = test.map(encodeMapFunc)
test = test.shuffle(10000)
test = test.padded_batch(128)

In [None]:
'''
Model structure.
'''
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *


# Two stacked bidirectional LSTMs over learned token embeddings, followed by a
# dense head that emits one sigmoid score per review.
# NOTE(review): Dense(128) is created without an activation, i.e. it is a
# linear layer - confirm this is intended.
model = Sequential()
model.add(Embedding(encoder.vocab_size,64))
# return_sequences=True hands the full sequence to the second LSTM.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 64)          19080448  
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 128)         66048     
_________________________________________________________________
dropout_9 (Dropout)          (None, None, 128)         0         
_________________________________________________________________
bidirectional_7 (Bidirection (None, 128)               98816     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)              

In [None]:
'''
Train Model
'''

# Three passes over the training batches; the test batches are used as validation data.
model.fit(x=train,
          validation_data=test,
          epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f1a99dac518>

In [None]:
'''
Save Model and encoder
'''
from tensorflow.keras.models import save_model
# Persist the trained network and the text encoder side by side; the encoder is
# needed again to turn raw text into the token ids the model was trained on.
save_model(model,
           filepath='./polarYelpReviewerModel_Epochs00003')
encoder.save_to_file("./polarYelpEncoder")

INFO:tensorflow:Assets written to: ./polarYelpReviewerModel_Epochs00003/assets


In [None]:
'''
Small trial and test.

Negation appears to not be understood by the model.
'''
def _encodeAndPredict(texts):
  """Encode each text, print the model's prediction for it, and return the encoded texts."""
  encodedTexts = [encoder.encode(text) for text in texts]
  for tokenIds in encodedTexts:
    # expand_dims adds a leading batch axis so a single review can be predicted.
    print(model.predict(tf.expand_dims(tokenIds, 0)))
  return encodedTexts

# Reviews expected to score as good.
goodTexts = _encodeAndPredict([
  "I love it.", 
  "You cannot hate it.",
  "Beautiful place."])

# Reviews expected to score as bad.
badTexts = _encodeAndPredict([
  "I hate it.", 
  "You cannot love it.",
  "Ugly place."])

[[0.99363303]]
[[0.2584478]]
[[0.9897226]]
[[0.00644007]]
[[0.9691264]]
[[0.09578083]]
