In [48]:
import json
from collections import Counter
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer
from nltk.tag import pos_tag
import nltk
import gensim
from gensim import corpora
import shelve

The original data is too large to be uploaded onto GitHub. Thus, the pre-processing and extracting of data were conducted locally with the following code. The original data can be retrived from [Yelp Dataset](https://www.yelp.com/dataset/challenge). We extract a subset from it and save the data we worked with in the "data" folder in this repository.

In [6]:
# the local path of JSON files that needed
json_business_path='../dataset/business.json'
json_review_path='../dataset/review.json'

In [7]:
# extract business information
business_contents=[]
with open(json_business_path) as fin:
    for line in fin:
        line=json.loads(line)
        business_contents.append(line)

As we can see below, the data contains business information from different states and different business types. Among the most popular locations and business types, we focus on restaurants at Ohio.

In [9]:
# In order to control the scope the project, we take OH state, 10930 businesses
state=[]
for line in business_contents:
    state.append(line['state']) 
# the 10 most popular locations
Counter(state).most_common(10)

[('AZ', 47376),
 ('NV', 30571),
 ('ON', 26520),
 ('NC', 11299),
 ('OH', 10930),
 ('PA', 8916),
 ('QC', 7273),
 ('WI', 4190),
 ('EDH', 3561),
 ('BW', 3071)]

In [10]:
# we focus on Restaurants in OH, around 4513 businesses
cat_of_business=[]
for line in business_contents:
    if line['state']=='OH':
        try:
            cat_of_business.extend(line['categories'])
        except:
            next 
    
# The 10 most popular business types,focus on 'Restaurants'
Counter(cat_of_business).most_common(10)

[('Restaurants', 4513),
 ('Food', 1916),
 ('Shopping', 1628),
 ('Nightlife', 1180),
 ('Bars', 1083),
 ('Beauty & Spas', 930),
 ('American (Traditional)', 891),
 ('Automotive', 738),
 ('Pizza', 706),
 ('Sandwiches', 677)]

In [17]:
# we have 4513 restaurants in OH to be reseaerched
OH_Restaurants=[]
for line in business_contents:
    if (line['state']=='OH') and ('Restaurants' in line['categories']):
        OH_Restaurants.append(line)
print('we will research %d restaurants in OH'%len(OH_Restaurants))

we will research 4513 restaurants in OH


For the convenience of research and reproducibility, we will save the data that extracted from the original JSON data into data frames and use them for future analyis and statistical modeling.


In [30]:
# we need to keep those columns
res_col=['state','city','address','name','business_id','stars','review_count','categories']
restaurants=json_normalize(OH_Restaurants)[res_col]

In [32]:
# a glimpse of the restaurant
restaurants.head()

Unnamed: 0,state,city,address,name,business_id,stars,review_count,categories
0,OH,Painesville,1 S State St,Sidewalk Cafe Painesville,Bl7Y-ATTzXytQnCceg5k6w,3.0,26,"[American (Traditional), Breakfast & Brunch, R..."
1,OH,Northfield,10430 Northfield Rd,Zeppe's Pizzeria,7HFRdxVttyY9GiMpywhhYw,3.0,7,"[Pizza, Caterers, Italian, Wraps, Event Planni..."
2,OH,Mentor,9209 Mentor Ave,Firehouse Subs,lXcxSdPa2m__LqhsaL9t9A,3.5,9,"[Restaurants, Sandwiches, Delis, Fast Food]"
3,OH,Cleveland,13181 Cedar Rd,Richie Chan's Chinese Restaurant,Pawavw9U8rjxWVPU-RB7LA,3.5,22,"[Chinese, Restaurants]"
4,OH,Northfield,134 E Aurora Rd,Romeo's Pizza,RzVHK8Jfcy8RvXjn_z3OBw,4.0,4,"[Restaurants, Pizza]"


In [34]:
restaurants.shape

(4513, 8)

There are a lot of business reviews, when we extract them from JSON file, we only extract those reviews which are for the restaurants in our sample.

In [35]:
# extract review information
review_contents=[]
with open(json_review_path) as fin:
    for line in fin:
        line=json.loads(line)
        if line['business_id'] in list(restaurants.business_id):
            review_contents.append(line)

In [37]:
# total 154764 reviews on 4513 restaurants
len(review_contents)
# convert into data frame
reviews=json_normalize(review_contents)

We'd like to remove restaurant with too few reviews. We see that 25% of restaurants have more than 39 reviews, so we only keep restaurants which have more than 100 reviews.

In [38]:
reviews.business_id.value_counts().describe()

count    4513.000000
mean       34.292932
std        57.777458
min         3.000000
25%         6.000000
50%        15.000000
75%        39.000000
max       896.000000
Name: business_id, dtype: float64

In [59]:
len(reviews.business_id.value_counts())

4513

In [66]:
with shelve.open('../dataset/ohio_all_reviews') as db:
    db['reviews'] = review_contents

In [43]:
keep_busID=reviews.business_id.value_counts().index[reviews.business_id.value_counts()>100]
reviews_keep=reviews.loc[reviews.business_id.isin(keep_busID),:]

In [56]:
reviews_keep.business_id.value_counts().describe()

count    316.000000
mean     190.575949
std      122.290224
min      101.000000
25%      117.750000
50%      149.000000
75%      206.250000
max      896.000000
Name: business_id, dtype: float64

In [57]:
# save to csv
reviews_keep.to_csv('reviews.csv')

In [61]:
restaurants.to_csv('restaurant.csv')

In [62]:
restaurants.shape

(4513, 8)

In [40]:
import resource

In [41]:
resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000

943.620096

In [45]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [46]:
review_list = []
for text in reviews_keep.text:
    text = text.lower()
    # Tokenization
    token = nltk.word_tokenize(text)
    # remove stop words and punctuation
    token = [i for i in token if i not in stop]
    token = [i for i in token if i not in exclude]   
    # remove non-nune tokens
    tagged_token = pos_tag(token)
    token = [token for token,pos in tagged_token if pos == 'NN']
    # lemmatize
    token = [lemma.lemmatize(i) for i in token]
    review_list.append(token)        

In [49]:
# create dictionary and term_matrix
dictionary = corpora.Dictionary(review_list)
term_matrix = [dictionary.doc2bow(review) for review in review_list]

In [50]:
# Creating the object for LDA model using gensim library
lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = lda(term_matrix, num_topics=8, id2word = dictionary, passes=10)

In [51]:
print(ldamodel.print_topics(num_topics=8, num_words=10))

[(0, '0.070*"food" + 0.056*"spicy" + 0.038*"place" + 0.034*"rice" + 0.030*"thai" + 0.025*"cleveland" + 0.021*"restaurant" + 0.020*"tofu" + 0.017*"service" + 0.014*"soup"'), (1, '0.095*"pizza" + 0.051*"taco" + 0.015*"crust" + 0.012*"hour" + 0.012*"burrito" + 0.011*"place" + 0.010*"salsa" + 0.009*"sauce" + 0.009*"corn" + 0.008*"angelo"'), (2, '0.012*"door" + 0.011*"line" + 0.010*"music" + 0.009*"cleveland" + 0.008*"day" + 0.008*"market" + 0.007*"way" + 0.007*"city" + 0.006*"man" + 0.006*"room"'), (3, '0.084*"food" + 0.042*"service" + 0.042*"place" + 0.033*"time" + 0.020*"bar" + 0.017*"restaurant" + 0.016*"night" + 0.014*"order" + 0.014*"experience" + 0.014*"staff"'), (4, '0.018*"dinner" + 0.017*"cream" + 0.017*"dessert" + 0.016*"salad" + 0.013*"night" + 0.012*"bread" + 0.012*"chocolate" + 0.011*"meal" + 0.011*"butter" + 0.010*"cheese"'), (5, '0.040*"beer" + 0.040*"place" + 0.039*"burger" + 0.026*"food" + 0.018*"selection" + 0.018*"bar" + 0.018*"coffee" + 0.015*"menu" + 0.015*"breakfast" 

In [409]:
# on new data
new_data_topics=[]
for review in review_list[500:1000]:
    doc_bow = dictionary.doc2bow(review)
    topics = [sorted(ldamodel[doc_bow], key=lambda x: x[1], reverse=True)]
    new_data_topics.extend(topics)

In [414]:
reviews_keep.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,dwQEZBFen2GdihLLfWeexA,0,2011-08-21,0,4RF8dMNBW-p2eTluPME_4g,4,Enjoyed the bright fun Mexican decor! The foo...,0,rv6_U_4AsOQ-L50aNRuNNg
1,dwQEZBFen2GdihLLfWeexA,0,2013-06-03,0,ClgrKJ6dqiM7vSKJBJ2w6Q,4,I've been here at least 5 times now and each t...,0,T5MGS0NHBCWgofZ6Q6Btng
2,dwQEZBFen2GdihLLfWeexA,0,2014-03-15,0,IBCTqmvwvd5ZqQhuvFDNXg,5,"Terrific service. The place was packed, but we...",0,NtkMuGqcis30GjAkq91etA
3,dwQEZBFen2GdihLLfWeexA,0,2014-06-09,0,69kni-xG6qtg9y3Hq_zw5g,4,Ate here for lunch on a Sunday. Arrived aroun...,0,unEY79t6hHECP9Yd58R1dg
4,dwQEZBFen2GdihLLfWeexA,0,2012-02-25,0,rQOasxLFCDNWLNW27VHnyA,5,Been dining here since it first opened. Wife i...,0,UwfgmOOul1fc79IcI5h2MQ


In [417]:
topic_dict = {0:"Mexican",1:"Family",2:"Night/Bar"}
topic_count={"Mexican":0,"Family":0,"Night/Bar":0}

In [418]:
# get training matrix for linear regression
train = np.zeros((500,3))
for i in range(500):
    items = new_data_topics[i]
    for s in items:
        if s[1]>0.05:
            topic = topic_dict[s[0]]
            topic_count[topic]+=1
            train[i,s[0]] = 1/len(items)


In [52]:
import os

In [64]:
os.getcwd()

'/Users/tianxia/Desktop/project-3-p2-yo-xi-lo'