In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from networkx.algorithms import bipartite as bi

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
%matplotlib inline

import mplleaflet

from collections import OrderedDict 
from operator import getitem 

import math

In [None]:
#getting full list of businesses reviewed
#lines=True: the file is read as one JSON record per line
#NOTE(review): the path is an absolute, machine-specific location - adjust before running elsewhere
busDF = pd.read_json(r"C:\Users\scohendevries\Documents\DATA620\yelp_dataset\business.json",lines=True)

In [None]:
#number of businesses per state, to see how the raw data set is spread geographically
busDF.groupby('state')['business_id'].count()

In [None]:
#the reviews file is large, so it is opened as a chunked reader of `size` rows at a time
#(the chunks are consumed in the next cell)
size = 10 ** 6
review = pd.read_json(
    r"C:\Users\scohendevries\Documents\DATA620\yelp_dataset\review.json",
    lines=True,
    chunksize=size,
    dtype={
        'review_id': str,
        'user_id': str,
        'business_id': str,
        'stars': int,
        'date': str,
        'text': str,
        'useful': int,
        'funny': int,
        'cool': int,
    },
)

In [None]:
#consuming the chunked reader: each chunk loses the fields that are out of scope for
#this study and has `stars` renamed to `review_stars`, then all chunks are stacked
chunk_list = [
    chunk_review
    .drop(columns=['review_id','useful','funny','cool'])
    .rename(columns={'stars': 'review_stars'})
    for chunk_review in review
]
revDF = pd.concat(chunk_list, axis=0, join='outer', ignore_index=True)

In [None]:
#inspecting format of reviews table
#(first rows only; at this point the frame holds user_id, business_id, review_stars, date and text)
revDF.head()

In [None]:
#same chunked-reader approach for the users file, reusing the chunk `size` set for reviews
users = pd.read_json(
    r"C:\Users\scohendevries\Documents\DATA620\yelp_dataset\user.json",
    lines=True,
    chunksize=size,
    dtype={
        'user_id': str,
        'name': str,
        'review_count': int,
        'yelping_since': str,
        'friends': list,
        'useful': int,
        'funny': int,
        'cool': int,
        'fans': int,
        'elite': list,
        'average_stars': float,
        'compliment_hot': int,
        'compliment_more': int,
        'compliment_profile': int,
        'compliment_cute': int,
        'compliment_list': int,
        'compliment_note': int,
        'compliment_plain': int,
        'compliment_cool': int,
        'compliment_funny': int,
        'compliment_writer': int,
        'compliment_photos': int,
    },
)

In [None]:
#consuming the users reader: the sign-up date and the vote/compliment tallies are
#dropped from every chunk before the chunks are stacked into one frame
chunk_list = [
    user_chunk.drop(columns=[
        'yelping_since',
        'useful',
        'funny',
        'cool',
        'compliment_hot',
        'compliment_more',
        'compliment_profile',
        'compliment_cute',
        'compliment_list',
        'compliment_note',
        'compliment_plain',
        'compliment_cool',
        'compliment_funny',
        'compliment_writer',
        'compliment_photos',
    ])
    for user_chunk in users
]
userDF = pd.concat(chunk_list, axis=0, join='outer', ignore_index=True)

In [None]:
#inspecting format of users table after the out-of-scope columns were dropped
userDF.head()

In [None]:
#row counts of the three tables before any filtering is applied
for frame, label in ((busDF, ' businesses'), (userDF, ' users'), (revDF, ' reviews')):
    print(str(len(frame.index)) + label)

In [None]:
#attaching the US Census Region/Division of each state so businesses can be grouped into larger areas
#the join is an inner join, so businesses whose `state` has no matching 'State Code' are dropped
regionDF = pd.read_csv(
    'https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv'
)
busDF = busDF.merge(regionDF, how='inner', left_on='state', right_on='State Code')

In [None]:
#checking the business table now that the census region columns have been merged in
busDF

In [None]:
#`categories` is a single ', '-separated string per business; splitting it and exploding
#gives one row per (business, category) so individual categories can be counted
catDF = (
    busDF
    .assign(categories=lambda frame: frame['categories'].str.split(', '))
    .explode('categories')
)
catDF['categories'].value_counts()

In [None]:
#restricting to food/nightlife businesses, then keeping the top 10% by number of reviews
#the percentile rank is computed within each census Division to ensure we are getting samples
#from all regions, rather than skewing numbers towards densely populated areas
is_food_or_nightlife = busDF['categories'].str.contains('Restaurants|Nightlife|Bars|Food',case=False,na=False)
#.copy() makes the filtered frame independent of the original, so adding the rank
#column below does not raise SettingWithCopyWarning
busDF = busDF[is_food_or_nightlife].copy()
busDF['q'] = busDF.groupby(['Division'])['review_count'].rank(pct=True)
busDF = busDF[busDF['q'] > 0.9]
#likewise keeping only the users above the 90th percentile of review_count
userDF = userDF[userDF.review_count > userDF.review_count.quantile(0.9)]

In [None]:
#in our filtered data set, we still see skewed numbers by state
#this is a function of the data set
for frame, label in ((busDF, ' businesses'), (userDF, ' users')):
    print(str(len(frame.index)) + label)

busDF.groupby('state')['business_id'].count()

In [None]:
#joining reviews to the filtered users and then to the filtered businesses;
#both joins are inner, so only reviews whose author and business both survived the filters remain
df = (
    revDF
    .merge(userDF, how='inner', on='user_id')
    .merge(busDF, how='inner', on='business_id')
)

In [None]:
#keeping only the columns used in the rest of the analysis
#NOTE(review): `name_y` is presumably the business name (suffix added by the merge with busDF) - confirm
keep_cols = [
    'user_id', 'business_id',
    'city', 'state', 'latitude', 'longitude',
    'Region', 'Division',
    'name_y',
    'review_stars', 'stars', 'average_stars',
    'categories', 'text', 'date',
]
df = df[keep_cols]

In [None]:
#we are limiting our analysis to just under 1MM reviews
#(this is the row count of the merged review/user/business frame)
print(str(len(df.index)) + ' reviews')

In [None]:
#renaming the two average-rating columns so it is clear which entity each one describes
#`columns=` is required: a bare mapping is applied to the row index labels, which
#leaves the column names untouched
df = df.rename(columns={'stars': 'avgStars_biz',
                        'average_stars': 'avgStars_usr'})

In [None]:
#spot-checking the first 50 rows of the merged review table
df.head(50)

In [None]:
#English stopword list from nltk, held as a set for the membership test in nonStops()
#NOTE(review): presumably needs the nltk 'stopwords' corpus to be downloaded already - confirm on a fresh environment
stopWords = set(stopwords.words('english'))

In [None]:
def nonStops(text):
    """Tokenize `text` with nltk and drop stopwords, numbers and punctuation.

    A token is kept only if its lower-cased form is not in `stopWords`, it is
    not made of digits, and it is purely alphabetic. Kept tokens are returned
    as a list in their original order and original casing.
    """
    kept = []
    for token in word_tokenize(text):
        if token.lower() in stopWords:
            continue
        if token.isdigit() or not token.isalpha():
            continue
        kept.append(token)
    return kept

In [None]:
#running tallies of review sentiment for every word:
#  allWords[word]           -> overall count, summed weight and mean weight (plus a stDev slot)
#  byReg[word][Division]    -> the same count/weight/mean, kept separately per census Division
byReg = {}
allWords = {}
for row in df.itertuples():
    tokens = nonStops(row.text)
    #leveling weights here, so that a review score of 3 is seen as neutral, and anything above/below is positive/negative
    revWt = row.review_stars - 3
    for word in tokens:
        #overall tally for this word
        overall = allWords.get(word)
        if overall is None:
            allWords[word] = {'n':1,'wt':revWt,'mean':revWt,'stDev':0}
        else:
            overall['n'] += 1
            overall['wt'] += revWt
            overall['mean'] = overall['wt']/overall['n']
        #tally for this word within the review's Division
        regional = byReg.setdefault(word, {})
        tally = regional.get(row.Division)
        if tally is None:
            regional[row.Division] = {'n':1,'wt':revWt,'mean':revWt}
        else:
            tally['n'] += 1
            tally['wt'] += revWt
            tally['mean'] = tally['wt']/tally['n']

In [None]:
#spread of each word's sentiment across Divisions: the root-mean-square deviation of the
#per-Division means from the word's overall mean
#each deviation is squared BEFORE summing (otherwise positive and negative deviations
#cancel out) and the square root is taken so the value is a standard deviation
for word, stats in allWords.items():
    overall_mean = stats['wt'] / stats['n']
    regional_means = [tally['mean'] for tally in byReg[word].values()]
    squared_dev = sum((m - overall_mean) ** 2 for m in regional_means)
    #every word has at least one Division entry, so the divisor is never zero
    stats['stDev'] = math.sqrt(squared_dev / len(regional_means))

In [None]:
#previewing a few entries only: byReg has one entry per distinct word, which is far too much to render in full
dict(list(byReg.items())[:10])

In [None]:
#previewing a few entries only: allWords has one entry per distinct word, which is far too much to render in full
dict(list(allWords.items())[:10])

In [None]:
#in order to find words which might have their sentiment vary by region, we find those with the most volatility in terms of related review scores
#only the 50 most volatile words are shown; rendering the whole sorted vocabulary would flood the output
OrderedDict(sorted(allWords.items(),key = lambda x: getitem(x[1], 'stDev'),reverse=True)[:50])

In [None]:
#looking at sample reviews for one of the more volatile words
mentions_rental = df['text'].str.contains('rental')
df[mentions_rental]

In [None]:
#The second part of the analysis involves a network analysis of business, based on patrons
usr = df['user_id'].tolist()
biz = df['business_id'].tolist()

#every review row becomes a user-business edge carrying the business's location fields
g = nx.from_pandas_edgelist(
    df,
    source='user_id',
    target='business_id',
    edge_attr=['city','state','latitude','longitude'],
)

print(len(g))

In [None]:
#projecting the user-business graph onto the business nodes: two businesses are linked
#when they share reviewers, and the edge weight is the number of shared reviewers
#`biz` holds one entry per review, so it is de-duplicated first (keeping first-seen order);
#the projection expects each node once and can reject a node list that contains duplicates
bNet = bi.weighted_projected_graph(g, list(dict.fromkeys(biz)))

In [None]:
#sizes of the 20 largest connected components of the business network, largest first
component_sizes = sorted((len(component) for component in nx.connected_components(bNet)), reverse=True)
for n_nodes in component_sizes[:20]:
    print(n_nodes)

In [None]:
#Defining a set of functions to utilize the island method for network analysis
def trim_edges(g, weight=1):
    """Return a new graph holding only the edges of `g` heavier than `weight`.

    An edge is kept when its 'weight' attribute is strictly greater than
    `weight`. The result is a fresh undirected graph: only the surviving
    edges (with their 'weight') are copied, so nodes left without any edge
    and all other node/edge attributes are not carried over.
    """
    heavy_edges = [
        (src, dst, attrs['weight'])
        for src, dst, attrs in g.edges(data=True)
        if attrs['weight'] > weight
    ]
    trimmed = nx.Graph()
    trimmed.add_weighted_edges_from(heavy_edges)
    return trimmed

def island_method(g, iterations=5):
    """Trim `g` at a series of increasing weight thresholds (the "island method").

    Thresholds run from the smallest edge weight up to (but excluding) the
    largest, in roughly `iterations` equal integer steps.

    Parameters
    ----------
    g : graph whose edges all carry a 'weight' attribute.
    iterations : number of steps to aim for between the minimum and maximum weight.

    Returns
    -------
    A list of [threshold, trimmed_graph] pairs, one per threshold, where
    trimmed_graph is trim_edges(g, threshold). The list is empty when `g`
    has no edges or when all edges have the same (integer) weight.
    """
    weights = [edata['weight'] for f, to, edata in g.edges(data=True)]
    if not weights:
        #nothing to trim; min()/max() would raise on an empty list
        return []
    mn = int(min(weights))
    mx = int(max(weights))
    #compute the size of the step, so we get a reasonable step in iterations
    step = int((mx - mn) / iterations)
    if step == 0:
        #the weight range is narrower than `iterations`; range() rejects a zero step,
        #so fall back to stepping through every integer threshold
        step = 1
    return [[threshold, trim_edges(g, threshold)] for threshold in range(mn, mx, step)]

def sorted_map(d):
    """Return the (key, value) pairs of `d` as a list ordered by value, largest first."""
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs

In [None]:
#drawing the full business network, using each edge's weight as its line width
bweights = [attrs['weight'] for _, _, attrs in bNet.edges(data=True)]
nx.draw_networkx(bNet, node_size=10, with_labels=False, width=bweights)

In [None]:
#distribution of the edge weights of the business network (bweights is built from bNet's edges)
plt.hist(bweights)

In [None]:
#keeping only the edges of the business network whose weight is above 75,
#and drawing the result with line widths on a log scale
bnet_trim = trim_edges(bNet, weight=75)
weights = [math.log(attrs['weight']) for _, _, attrs in bnet_trim.edges(data=True)]
nx.draw_networkx(bnet_trim, node_size=10, with_labels=False, width=weights)

In [None]:
#laying the trimmed network out geographically: each business is placed at
#(longitude, latitude) so that the x/y axes match map orientation
pos = dict(zip(busDF['business_id'], zip(busDF['longitude'], busDF['latitude'])))
#the positions are passed explicitly: the trimmed graph's nodes carry no 'pos'
#attribute, so reading the layout back from the graph would give an empty mapping
nx.draw(bnet_trim, pos, with_labels=True, node_size=0)

In [None]:
#top connected businesses could be interpreted as "tourist" centers, in that they represent places of business which are
#reviewed by patrons who also review/patronize other businesses (presumably in other areas)
btwn = nx.betweenness_centrality(bnet_trim)
#the 20 businesses with the highest betweenness centrality in the trimmed network
top_biz = [biz_id for biz_id, _score in sorted_map(btwn)[:20]]
busDF[busDF['business_id'].isin(top_biz)]

In [None]:
#reference link kept as a bare string; as the last expression of the cell it is echoed as the cell's output
"""
https://towardsdatascience.com/converting-yelp-dataset-to-csv-using-pandas-2a4c8f03bd88
"""