In [93]:
import json
from collections import Counter
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.stem.porter import PorterStemmer
from nltk.tag import pos_tag
import nltk

The original data is too large to be uploaded onto GitHub. Thus, the pre-processing and extraction 

In [71]:
# the path of JSON files that needed
json_business_path='./Desktop/dataset/business.json'
json_review_path='./Desktop/dataset/review.json'

In [189]:
# extract business information
business_contents=[]
with open(json_business_path) as fin:
    for line in fin:
        line=json.loads(line)
        business_contents.append(line)

In [205]:
# In order to control the scope the project, we take OH state, 10930 businesses
state=[]
for line in business_contents:
    state.append(line['state']) 
Counter(state).most_common(10)

In [235]:
# we focus on Restaurants in OH, around 4513 businesses
cat_of_business=[]
for line in business_contents:
    if line['state']=='OH':
        try:
            cat_of_business.extend(line['categories'])
        except:
            next 
    
# focus on 'Restaurants'
Counter(cat_of_business).most_common(5)

[('Restaurants', 4513),
 ('Food', 1916),
 ('Shopping', 1628),
 ('Nightlife', 1180),
 ('Bars', 1083)]

In [236]:
# we have 4513 restaurants in OH to be reseaerched
OH_Restaurants=[]
for line in business_contents:
    if (line['state']=='OH') and ('Restaurants' in line['categories']):
        OH_Restaurants.append(line)
len(OH_Restaurants)

In [240]:
# convert the list into data frames
restaurant_list=[]
for line in OH_Restaurants:
        restaurant_list.append([line['state'],line['city'],line['address'],
                       line['name'],line['business_id']])      

restaurant=pd.DataFrame(restaurant_list)
restaurant.columns=['state','city','address','name','business_id']

In [247]:
# a glimpse of the restaurant
restaurant.head()

(4513, 5)

In [273]:
# extract review information
review_contents=[]
with open(json_review_path) as fin:
    for line in fin:
        line=json.loads(line)
        if line['business_id'] in list(restaurant.business_id):
            review_contents.append(line)

In [275]:
# total 154764 reviews on 4513 restaurants
len(review_contents)
# convert into data frame
reviews=json_normalize(review_contents)

154764

We'd like to remove restaurant with too few reviews. We see that 50% of restaurants have more than 15 reviews, so we only keep restaurants which have more than 15 reviews.

In [316]:
reviews.business_id.value_counts().describe()

count    4513.000000
mean       34.292932
std        57.777458
min         3.000000
25%         6.000000
50%        15.000000
75%        39.000000
max       896.000000
Name: business_id, dtype: float64

In [323]:
keep_busID=reviews.business_id.value_counts().index[reviews.business_id.value_counts()>15]
reviews_keep=reviews.loc[reviews.business_id.isin(keep_busID),:]

In [285]:
# save to csv
reviews.to_csv('reviews.csv')
restaurant.to_csv('restaurant.csv')

In [258]:
import resource

In [413]:
resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1000000

1638.244352

In [334]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

In [404]:
review_list = []
for text in reviews_keep.text[0:1000]:
    text = text.lower()
    # Tokenization
    token = nltk.word_tokenize(text)
    # remove stop words and punctuation
    token = [i for i in token if i not in stop]
    token = [i for i in token if i not in exclude]   
    # remove non-nune tokens
    tagged_token = pos_tag(token)
    token = [token for token,pos in tagged_token if pos == 'NN']
    # lemmatize
    token = [lemma.lemmatize(i) for i in token]
    review_list.append(token)        

In [365]:
import gensim
from gensim import corpora

In [405]:
# create dictionary and term_matrix
dictionary = corpora.Dictionary(review_list)
term_matrix = [dictionary.doc2bow(review) for review in review_list[0:500]]

In [407]:
# Creating the object for LDA model using gensim library
lda = gensim.models.ldamodel.LdaModel

# Running and Trainign LDA model on the document term matrix.
ldamodel = lda(term_matrix, num_topics=3, id2word = dictionary, passes=10)

In [408]:
print(ldamodel.print_topics(num_topics=3, num_words=3))

[(0, '0.049*"food" + 0.026*"place" + 0.017*"sauce"'), (1, '0.040*"food" + 0.025*"place" + 0.021*"service"'), (2, '0.022*"sauce" + 0.012*"chicken" + 0.010*"food"')]


In [409]:
# on new data
new_data_topics=[]
for review in review_list[500:1000]:
    doc_bow = dictionary.doc2bow(review)
    topics = [sorted(ldamodel[doc_bow], key=lambda x: x[1], reverse=True)]
    new_data_topics.extend(topics)

In [414]:
reviews_keep.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,dwQEZBFen2GdihLLfWeexA,0,2011-08-21,0,4RF8dMNBW-p2eTluPME_4g,4,Enjoyed the bright fun Mexican decor! The foo...,0,rv6_U_4AsOQ-L50aNRuNNg
1,dwQEZBFen2GdihLLfWeexA,0,2013-06-03,0,ClgrKJ6dqiM7vSKJBJ2w6Q,4,I've been here at least 5 times now and each t...,0,T5MGS0NHBCWgofZ6Q6Btng
2,dwQEZBFen2GdihLLfWeexA,0,2014-03-15,0,IBCTqmvwvd5ZqQhuvFDNXg,5,"Terrific service. The place was packed, but we...",0,NtkMuGqcis30GjAkq91etA
3,dwQEZBFen2GdihLLfWeexA,0,2014-06-09,0,69kni-xG6qtg9y3Hq_zw5g,4,Ate here for lunch on a Sunday. Arrived aroun...,0,unEY79t6hHECP9Yd58R1dg
4,dwQEZBFen2GdihLLfWeexA,0,2012-02-25,0,rQOasxLFCDNWLNW27VHnyA,5,Been dining here since it first opened. Wife i...,0,UwfgmOOul1fc79IcI5h2MQ


In [417]:
topic_dict = {0:"Mexican",1:"Family",2:"Night/Bar"}
topic_count={"Mexican":0,"Family":0,"Night/Bar":0}

In [418]:
# get training matrix for linear regression
train = np.zeros((500,3))
for i in range(500):
    items = new_data_topics[i]
    for s in items:
        if s[1]>0.05:
            topic = topic_dict[s[0]]
            topic_count[topic]+=1
            train[i,s[0]] = 1/len(items)


In [421]:
topic_dict

{0: 'Mexican', 1: 'Family', 2: 'Night/Bar'}

In [422]:
topic_count

{'Family': 438, 'Mexican': 441, 'Night/Bar': 290}

In [420]:
new_data_topics[0]

[(1, 0.96188396787094388),
 (2, 0.023204289531856249),
 (0, 0.014911742597199856)]

In [419]:
train

array([[ 0.        ,  0.33333333,  0.        ],
       [ 0.33333333,  0.33333333,  0.33333333],
       [ 0.33333333,  0.        ,  0.33333333],
       ..., 
       [ 0.33333333,  0.33333333,  0.        ],
       [ 0.33333333,  0.33333333,  0.        ],
       [ 0.33333333,  0.33333333,  0.33333333]])

In [424]:
reviews_keep.to_csv('reviews.csv')

In [425]:
reviews_keep.shape

(138398, 9)

In [426]:
review_contents[0]

{'business_id': 'dwQEZBFen2GdihLLfWeexA',
 'cool': 0,
 'date': '2011-08-21',
 'funny': 0,
 'review_id': '4RF8dMNBW-p2eTluPME_4g',
 'stars': 4,
 'text': 'Enjoyed the bright fun Mexican decor!  The food was delicious and reasonably priced!  And the margaritas were delicious!',
 'useful': 0,
 'user_id': 'rv6_U_4AsOQ-L50aNRuNNg'}

In [427]:
business_contents[0]

{'address': '691 Richmond Rd',
 'attributes': {'BikeParking': True,
  'BusinessParking': {'garage': False,
   'lot': True,
   'street': False,
   'valet': False,
   'validated': False},
  'RestaurantsPriceRange2': 2,
  'WheelchairAccessible': True},
 'business_id': 'YDf95gJZaq05wvo7hTQbbQ',
 'categories': ['Shopping', 'Shopping Centers'],
 'city': 'Richmond Heights',
 'hours': {'Friday': '10:00-21:00',
  'Monday': '10:00-21:00',
  'Saturday': '10:00-21:00',
  'Sunday': '11:00-18:00',
  'Thursday': '10:00-21:00',
  'Tuesday': '10:00-21:00',
  'Wednesday': '10:00-21:00'},
 'is_open': 1,
 'latitude': 41.5417162,
 'longitude': -81.4931165,
 'name': 'Richmond Town Square',
 'neighborhood': '',
 'postal_code': '44143',
 'review_count': 17,
 'stars': 2.0,
 'state': 'OH'}

In [428]:
restaurant.head()

Unnamed: 0,state,city,address,name,business_id
0,OH,Painesville,1 S State St,Sidewalk Cafe Painesville,Bl7Y-ATTzXytQnCceg5k6w
1,OH,Northfield,10430 Northfield Rd,Zeppe's Pizzeria,7HFRdxVttyY9GiMpywhhYw
2,OH,Mentor,9209 Mentor Ave,Firehouse Subs,lXcxSdPa2m__LqhsaL9t9A
3,OH,Cleveland,13181 Cedar Rd,Richie Chan's Chinese Restaurant,Pawavw9U8rjxWVPU-RB7LA
4,OH,Northfield,134 E Aurora Rd,Romeo's Pizza,RzVHK8Jfcy8RvXjn_z3OBw
