In [None]:
import json
import os

import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation, Conv2D, MaxPooling2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Reading in data

# Folder that holds the Yelp academic dataset files. Set the YELP_DATA_DIR environment
#       variable to point somewhere else; the fallback is the folder this notebook was written against.
yelp_data_dir = os.environ.get('YELP_DATA_DIR', 'C:\\Users\\sjmif\\Documents\\YelpReviews')
business_json_path = os.path.join(yelp_data_dir, 'yelp_academic_dataset_business.json')

# The file is in JSON Lines format (one JSON object per line), hence lines=True.
df_b = pd.read_json(business_json_path, lines=True)
df_b

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150341,IUQopTMmYQG-qRtBk-8QnA,Binh's Nails,3388 Gateway Blvd,Edmonton,AB,T6J 5H2,53.468419,-113.492054,3.0,13,1,"{'ByAppointmentOnly': 'False', 'RestaurantsPri...","Nail Salons, Beauty & Spas","{'Monday': '10:0-19:30', 'Tuesday': '10:0-19:3..."
150342,c8GjPIOTGVmIemT7j5_SyQ,Wild Birds Unlimited,2813 Bransford Ave,Nashville,TN,37204,36.115118,-86.766925,4.0,5,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Pets, Nurseries & Gardening, Pet Stores, Hobby...","{'Monday': '9:30-17:30', 'Tuesday': '9:30-17:3..."
150343,_QAMST-NrQobXduilWEqSw,Claire's Boutique,"6020 E 82nd St, Ste 46",Indianapolis,IN,46250,39.908707,-86.065088,3.5,8,1,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...","Shopping, Jewelry, Piercing, Toy Stores, Beaut...",
150344,mtGm22y5c2UHNXDFAjaPNw,Cyclery & Fitness Center,2472 Troy Rd,Edwardsville,IL,62025,38.782351,-89.950558,4.0,24,1,"{'BusinessParking': '{'garage': False, 'street...","Fitness/Exercise Equipment, Eyewear & Optician...","{'Monday': '9:0-20:0', 'Tuesday': '9:0-20:0', ..."


In [4]:
# Keep only the businesses that are still open, then discard the columns we will not use.

drop_columns = ['hours','is_open','review_count']
df_b = df_b.loc[df_b['is_open'].eq(1)].drop(columns=drop_columns)

In [5]:
# To keep the example fast, work with a small slice of the dataset: only businesses
#       whose category list mentions 'Brewpubs' (case-insensitive; missing categories count as no match).

is_brewpub = df_b['categories'].str.contains('Brewpubs', case=False, na=False)
business_Rest = df_b[is_brewpub]

In [9]:
# The reviews JSON file is very large; loading it in one go would exhaust our memory.

# The trick is to pass `chunksize` to pd.read_json. Instead of a DataFrame it then returns a reader
#       that yields the file as DataFrames of `size` lines each, which we process one at a time
#       and concatenate at the end of the process.
# NOTE: the reader is consumed as it is iterated, so re-run this cell before re-running the loop that reads it.

size = 500000

# Folder that holds the Yelp academic dataset files. Set the YELP_DATA_DIR environment
#       variable to point somewhere else; the fallback is the folder this notebook was written against.
yelp_data_dir = os.environ.get('YELP_DATA_DIR', 'C:\\Users\\sjmif\\Documents\\YelpReviews')
review_json_path = os.path.join(yelp_data_dir, 'yelp_academic_dataset_review.json')
review = pd.read_json(review_json_path, lines = True,
                        dtype={'review_id':str,'user_id':str,
                             'business_id':str,'stars':int,
                             'date':str,'text':str,'useful':int,
                             'funny':int,'cool':int},
                      chunksize = size)

AttributeError: 'JsonReader' object has no attribute 'head'

In [10]:
# Walk through the review file chunk by chunk, keeping only the reviews that belong to
#       the selected businesses, and collect the matches in a list.

chunk_list = []
for chunk_review in review:
    # Count the rows before filtering: the final chunk is usually shorter than `size`.
    chunk_rows = len(chunk_review)
    # Remove unnecessary columns
    chunk_review = chunk_review.drop(['review_id','useful','funny','cool'], axis=1)
    # Both tables have a 'stars' column; rename the review one so the merge keeps them apart.
    chunk_review = chunk_review.rename(columns={'stars': 'review_stars'})
    chunk_merged = pd.merge(business_Rest, chunk_review, on='business_id', how='inner')
    print(f"{chunk_merged.shape[0]} out of {chunk_rows:,} related reviews")
    chunk_list.append(chunk_merged)

# An already-consumed reader yields nothing; fail with a message that says what to do.
if not chunk_list:
    raise ValueError("`review` produced no chunks - re-run the cell that creates the "
                     "reader (it can only be iterated once) and check the file path.")

df = pd.concat(chunk_list, ignore_index=True)

2096 out of 500,000 related reviews
1834 out of 500,000 related reviews
1250 out of 500,000 related reviews
1524 out of 500,000 related reviews
1829 out of 500,000 related reviews
2368 out of 500,000 related reviews
2312 out of 500,000 related reviews
1445 out of 500,000 related reviews
1897 out of 500,000 related reviews
1469 out of 500,000 related reviews
349 out of 500,000 related reviews
1106 out of 500,000 related reviews
1289 out of 500,000 related reviews
1551 out of 500,000 related reviews


In [9]:
# Inspect the merged business/review table.
df

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
0,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",lrtGPAmDqCFnbfAKiB4NmA,4,The craft brewery scene has finally hit the sl...,2019-11-02 01:18:50
1,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",Qp43wr0CkKw4W79MFu_MGw,5,Glad to have a local brewery so close to my ho...,2019-08-16 13:26:24
2,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",DCvqkfO3exqOaTf0-fvyLQ,5,First visit to this new and very local brewery...,2019-11-24 03:49:39
3,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food",Y7BFSuNRNzvFbyZcZSXQJw,5,What an amazing brewery and an absolute asset ...,2020-02-27 06:21:36
4,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,Philadelphia,PA,19123,39.962582,-75.135657,3.5,"{'OutdoorSeating': 'True', 'RestaurantsPriceRa...","Eatertainment, Arts & Entertainment, Brewpubs,...",KGEdaKlPI-Sv2K_pa2HKgg,4,This is a great place to take guests visiting ...,2019-07-22 22:48:30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22314,nTEmSvz8_DK-iK3miQFCwA,Great Basin Brewing,846 Victorian Ave,Sparks,NV,89431,39.535095,-119.754133,4.0,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Food, Bars, Restaurants, Brewpubs, Breweries, ...",DgfU87xkj-QGqnuHhbBfuQ,3,Great outdoor seating which is nice right now....,2020-06-21 18:59:27
22315,nTEmSvz8_DK-iK3miQFCwA,Great Basin Brewing,846 Victorian Ave,Sparks,NV,89431,39.535095,-119.754133,4.0,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Food, Bars, Restaurants, Brewpubs, Breweries, ...",5viAaAjjAw1ozlrR7BchGg,5,The meal was perfect bar! Ambiance was great ...,2022-01-16 00:03:52
22316,nTEmSvz8_DK-iK3miQFCwA,Great Basin Brewing,846 Victorian Ave,Sparks,NV,89431,39.535095,-119.754133,4.0,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Food, Bars, Restaurants, Brewpubs, Breweries, ...",sGMB4rFamgxbFOKAmiJ0mg,4,So in the last couple of years we have had hor...,2017-12-10 19:41:59
22317,nTEmSvz8_DK-iK3miQFCwA,Great Basin Brewing,846 Victorian Ave,Sparks,NV,89431,39.535095,-119.754133,4.0,"{'RestaurantsPriceRange2': '2', 'BusinessAccep...","Food, Bars, Restaurants, Brewpubs, Breweries, ...",kkaWSsIDLsKzHOl1UpD2tg,3,We were going to a different restaurant this m...,2017-02-18 21:27:15


In [11]:
# Per column: does it contain at least one missing value?
df.isnull().any()

business_id     False
name            False
address         False
city            False
state           False
postal_code     False
latitude        False
longitude       False
stars           False
attributes       True
categories      False
user_id         False
review_stars    False
text            False
date            False
dtype: bool

## At a cursory glance, only the `attributes` column contains missing values, and we do not use that column below, so the data is in a format with which we can proceed.

## Now to continue by preparing and designing the neural network for sentiment analysis

In [12]:
# The text of each review must be tokenized before it can be fed into the model.

import numpy as np

# Pull the review text column out as a single NumPy array holding one string per review;
# this is what gets handed to the tokenizer.
reviews = df.loc[:, 'text'].to_numpy()
reviews

array(["The craft brewery scene has finally hit the sleepy Green Lane/Perkiomen Valley area.  The brewery is located in a former bank.  There is no food available but there are several restaurants nearby that will deliver and you can bring your own food.  Based on the layout, I don't think the brewery will be able to expand to have a full menu.  The layout is geared towards families with a place for kids to play and a television set-up with videos.  I don't remember a TV at the bar to watch sports.  My friend and I sampled the 9 beers that were on tap.  Everything was good but we highly recommend everyone to try Shagbark Old English Ale.  It has a unique taste that you will love (like us) or one sip and you are done.  But you definitely need to try it.",
       "Glad to have a local brewery so close to my house!  I've been eagerly waiting for them to open and they have exceeded my expectations.  I wasn't quite sure how they would make the old bank building into a brewpub but it is welc

In [14]:
# Learn the vocabulary from the review texts. `num_words` limits the words used when texts
# are later converted to sequences to the most frequent ones; `word_index` itself still maps
# every word seen to its index.
tokenizer = Tokenizer(num_words = 15000)
tokenizer.fit_on_texts(reviews)
word_index = tokenizer.word_index

# Printing the whole mapping floods the notebook output, so show its size and a small sample.
print(len(word_index))
print(list(word_index.items())[:20])



In [15]:
# Number of rows the Embedding layer needs (its input_dim must exceed the largest token index).
# Word indices start at 1 (0 is reserved for padding), so a full vocabulary needs
# len(word_index) + 1 rows. When the tokenizer has a `num_words` cap, texts_to_sequences only
# emits indices below that cap, so the table never needs more than `num_words` rows.
vocab_size = len(tokenizer.word_index) + 1
if tokenizer.num_words:
    vocab_size = min(vocab_size, tokenizer.num_words)

In [16]:
# Build the network layer by layer: embedding -> bidirectional LSTM -> dense -> softmax output.

NNmodel = tf.keras.Sequential()
# Maps each token index to a 120-dimensional vector.
NNmodel.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=120))
# Reads the sequence in both directions with 64 LSTM units each way.
NNmodel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=64)))
NNmodel.add(tf.keras.layers.Dense(units=64, activation='relu'))
# Six outputs, one per column of the categorical star labels.
NNmodel.add(tf.keras.layers.Dense(units=6, activation='softmax'))

### With the review text processed into tokens and the model architecture established, we should now consider how to split the data into training and test sets, and how to label each review (using its star rating).

In [17]:
# Preparing the inputs for the neural network.

# The X values are the review texts in token form. They will be padded afterwards so that
# every input has the same length, which is why we record the longest sequence here.

reviewX = tokenizer.texts_to_sequences(df['text'])

max_length = max(len(sequence) for sequence in reviewX)

In [18]:
# Pad every sequence at the front ('pre') up to the longest review so all inputs share one length.

reviewX = np.array(
    pad_sequences(reviewX, padding='pre', maxlen=max_length)
)

In [27]:
# Number of review sequences held in reviewX.
print(len(reviewX))

22319


In [19]:
# Y values will be the star value given by each author of the respective review. Stored as categorical data in 6 columns.
# num_classes is set explicitly so the label width always matches the 6-unit softmax output layer,
# rather than being inferred from the largest star value present in the data.

reviewY = tf.keras.utils.to_categorical(df['review_stars'], num_classes=6)

In [15]:
# Inspect the one-hot encoded star labels.
print(reviewY)

[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [24]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the data; it is used as the validation set during fitting
# to help detect overfitting. The fixed random_state makes the split repeatable.

_split = train_test_split(reviewX, reviewY, random_state=42, test_size=0.25)
x_train, x_test, y_train, y_test = _split

### Consider now how we might prevent overfitting of our neural network model.

    An important property of any machine learning model is that it achieves, on data completely separate from the training data, the same accuracy it reached during fitting. An overfit model performs poorly on novel data, so overfitting is something that should always be guarded against.
    
    We can guard against it with a technique called early stopping, which keeps (saves) the best weights seen so far. The validation loss is monitored during fitting, and once it stops improving the fitting process is halted early instead of completing any further epochs.

In [27]:
from tensorflow.keras.callbacks import EarlyStopping


# Training the network, with early stopping to halt the fitting process before it overfits.

# Step 1: compile the layers defined earlier, choosing the optimizer, the loss function
#    and the metric used to judge effectiveness.

NNmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


# Step 2: create the early-stopping callback. It watches the validation loss; once that has
#    failed to improve by at least min_delta for `patience` epochs in a row, the network is
#    presumably memorizing the training data rather than gaining predictive accuracy, so
#    fitting stops and the weights from the best epoch are restored.

monitor = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-3,
    patience=5,
    verbose=1,
    mode='auto',
    restore_best_weights=True,
)

# Step 3: fit. The epoch count is only an upper bound; the callback decides when to stop.

history = NNmodel.fit(
    x=x_train,
    y=y_train,
    epochs=1000,
    verbose=1,
    callbacks=[monitor],
    validation_data=(x_test, y_test),
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 00008: early stopping


In [29]:
# Persist the trained network (architecture and weights) to an HDF5 file so it can be
# reused later without repeating the time-consuming training.

import io
import os
import requests

save_path = '.'
model_file = os.path.join(save_path, 'network.h5')
NNmodel.save(model_file)

In [30]:
# Reload the saved model from disk, as one would when deploying it in a web application or other software.
from tensorflow.keras.models import load_model

saved_model_file = os.path.join(save_path,"network.h5")
model2 = load_model(saved_model_file)