# Using Pandas read_json function:

In [1]:
import pandas as pd
import numpy as np

In [3]:
with open('yelp_dataset/yelp_academic_dataset_tip.json') as file:
        tip = pd.read_json(file, lines = True)

In [2]:
with open('yelp_dataset/yelp_academic_dataset_review.json') as file:
        review = pd.read_json(file, lines = True)

In [125]:
with open('yelp_dataset/yelp_academic_dataset_business.json') as file:
        business = pd.read_json(file, lines = True)

In [None]:
with open('yelp_dataset/yelp_academic_dataset_checkin.json') as file:
        checkin = pd.read_json(file, lines = True)

In [4]:
with open('yelp_dataset/yelp_academic_dataset_user.json') as file:
        user = pd.read_json(file, lines = True)

In [8]:
# dfs = ['tip', 'review', 'business', 'checkin', 'user']

tip:  ['business_id' 'date' 'likes' 'text' 'type' 'user_id']
review:  ['business_id' 'date' 'review_id' 'stars' 'text' 'type' 'user_id' 'votes']
business:  ['attributes' 'business_id' 'categories' 'city' 'full_address' 'hours'
 'latitude' 'longitude' 'name' 'neighborhoods' 'open' 'review_count'
 'stars' 'state' 'type']
checkin:  ['business_id' 'checkin_info' 'type']
user:  ['average_stars' 'compliments' 'elite' 'fans' 'friends' 'name'
 'review_count' 'type' 'user_id' 'votes' 'yelping_since']

In [428]:
a = business.groupby(business.city)

In [433]:
pd.DataFrame({'count':a.size(), 'avg_stars': a.stars.mean()}).reset_index()

Unnamed: 0,city,avg_stars,count
0,,4.500000,2
1,110 Las Vegas,5.000000,1
2,Ahwahtukee,5.000000,1
3,Ahwatukee,3.750000,12
4,Ahwatukee Foothills Village,5.000000,1
5,Allegheny,5.000000,1
6,Allentown,3.500000,2
7,Anjou,3.318182,11
8,Anthem,3.696296,135
9,Apache Junction,3.596774,186


# Building a recommender system
First, get the relevant data: user-business stars... create matrices Y, R

** Y ** that contains #business as rows, #user as columns, values = stars

** R ** same size as Y, values = 1 if a user has rated a business **



# Need to determine businesses and users to include in the model

## Drop businesses and users with review counts less than 20

In [92]:
user = user[user.review_count > 20]

In [136]:
user.shape

(148994, 11)

In [131]:
business = business[business.review_count > 20]

## Drop states with less than 20 businesses (recommend within state if #>20)

In [129]:
business = business.groupby('state').filter(lambda r: len(r) > 20)

In [171]:
business.shape

(25658, 15)

In [137]:
np.zeros([25658,148994])

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

## Merge review+business+user

In [193]:
rev_biz_usr = pd.merge(pd.merge(review, business, on='business_id'), user, on='user_id')

## Pivoting the table needed for the recommender system algorithm

### The following pivot table gives an error due to out of memory, not negagtive dimensions, even after reducing dataset:
memory needed before reducing the dataset, assuming 8 bytes per value is:
686556 x 85539 x 8 / 1024 / 1024/ 1024 = 438 GB

In [189]:
# memory requirements for pivoting the table after reducing the dataset is still large ~ 27.5 GB
len(rev_biz_usr.business_id.unique()) * len(rev_biz_usr.user_id.unique()) * 8 / 1024/1024/1024

27.547535933554173

In [191]:
rev_biz_usr.stars_x.dtype

dtype('int32')

In [20]:
# rev_biz_usr.pivot(index='business_id', columns='user_id', values='stars_x')

In [168]:
rev_biz_usr.shape

(1251142, 32)

In [179]:
col_drop = rev_biz_usr.columns.difference(['business_id', 'user_id', 'stars_x'])

In [180]:
rev_biz_usr.drop(col_drop, axis=1, inplace=True)

In [None]:
pd.pivot_table(rev_biz_usr, index='business_id', columns='user_id', values='stars_x')

# Trying to create a sparse matrix from
http://stackoverflow.com/questions/31661604/efficiently-create-sparse-pivot-tables-in-pandas

In [195]:
from scipy.sparse import csr_matrix

users = list(sorted(rev_biz_usr.user_id.unique()))
businesses = list(sorted(rev_biz_usr.business_id.unique()))

stars = rev_biz_usr['stars_x'].tolist()
row = rev_biz_usr.user_id.astype('category', categories=users).cat.codes
col = rev_biz_usr.business_id.astype('category', categories=businesses).cat.codes
sparse_matrix = csr_matrix((stars, (row, col)), shape=(len(users), len(businesses)))

In [196]:
sparse_matrix

<144265x25629 sparse matrix of type '<class 'numpy.int32'>'
	with 1201156 stored elements in Compressed Sparse Row format>

In [197]:
len(sparse_matrix.indptr)

144266

## The R matrix can be easily obtained by first converting the csr to coo and using the ij values

```
sr_matrix.tocoo(copy=True)

    Convert this matrix to COOrdinate format.

    With copy=False, the data/indices may be shared between this matrix and the resultant coo_matrix.
```

In [199]:
coo_mat = sparse_matrix.tocoo()

In [200]:
len(coo_mat.row)

1201156

In [201]:
R = list(zip(coo_mat.row,coo_mat.col))

In [203]:
sorted(R)

[(0, 15977),
 (1, 6023),
 (1, 10013),
 (1, 12667),
 (1, 17546),
 (1, 19296),
 (1, 22532),
 (2, 24831),
 (3, 447),
 (3, 3221),
 (3, 8781),
 (3, 11898),
 (4, 652),
 (4, 4332),
 (4, 4645),
 (4, 4721),
 (4, 4984),
 (4, 5948),
 (4, 6324),
 (4, 7945),
 (4, 8192),
 (4, 9895),
 (4, 10773),
 (4, 10826),
 (4, 11546),
 (4, 12097),
 (4, 13059),
 (4, 13246),
 (4, 13502),
 (4, 13605),
 (4, 14262),
 (4, 14961),
 (4, 15180),
 (4, 15795),
 (4, 16131),
 (4, 16960),
 (4, 18093),
 (4, 18538),
 (4, 19564),
 (4, 19983),
 (4, 21848),
 (4, 23434),
 (5, 1484),
 (5, 1592),
 (5, 2213),
 (5, 2281),
 (5, 9698),
 (5, 10180),
 (5, 12386),
 (5, 14886),
 (5, 15513),
 (5, 18589),
 (5, 18843),
 (5, 22498),
 (5, 24981),
 (6, 363),
 (6, 814),
 (6, 831),
 (6, 874),
 (6, 900),
 (6, 972),
 (6, 1243),
 (6, 1571),
 (6, 1808),
 (6, 1831),
 (6, 1917),
 (6, 1931),
 (6, 2232),
 (6, 2613),
 (6, 2631),
 (6, 2694),
 (6, 3016),
 (6, 3102),
 (6, 3165),
 (6, 4110),
 (6, 4682),
 (6, 4711),
 (6, 4754),
 (6, 5099),
 (6, 5401),
 (6, 5603),


# Check if cities and distances between them may be used instead of distances between businesses

##  No: cities are small, with distances that can be 3 miles

# assume 30 km as maximum distance willing to travel, to find vicinity within state

In [40]:
business.latitude.min(), business.latitude.max()

(32.865882499999998, 56.033777299999997)

In [59]:
business.longitude.min(), business.longitude.max()

(-115.38655, 8.5492494004572208)

In [41]:
vincenty((57, -71), (57, -72)).km

60.77164272792849

In [42]:
vincenty((32, -71), (32, -72)).km

94.49280502196765

In [58]:
vincenty((45, -71), (45, -72)).km

78.8463347093115

for out dataset, businesses lie within latitudes 32 and 56 North

where 1 degree in longitude ranges between 60 to 95 km

so nearby business ( < 30) should also be within 0.5 degree of longitude (near 32N distances will be < 47km)

for latitudes, one degree difference is about 110 km, so will adopt 0.3 difference to consider nearby businesses

In [53]:
vincenty((44, -71), (45, -71)).km

111.12200826241184

In [54]:
vincenty((44, -70), (45, -70)).km

111.12200826241184

In [55]:
vincenty((55, -71), (56, -71)).km

111.33269933263972

In [56]:
vincenty((32, -71), (33, -71)).km

110.89560891986069

In [65]:
business[business.latitude > 56.033]

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
47693,{},4Yh57FsAyvDtZsq5F7fCWg,"[Hotels & Travel, Public Transportation, Trans...",Inverkeithing,Boreland Rd\nInverkeithing KY11 1NJ,{},56.033777,-3.393267,Inverkeithing Railway Station,[],True,3,3.0,FIF,business


In [101]:
business.groupby(['state']).filter(lambda r: len(r) > 20).state.unique()

array(['PA', 'NC', 'SC', 'WI', 'IL', 'AZ', 'NV', 'QC', 'ON', 'EDH', 'MLN',
       'BW'], dtype=object)

In [102]:
# ignore states with less than 20 businesses... we have 12 states with more than 20 businesses
business_state = business.groupby(['state']).filter(lambda r: len(r) >= 20)

In [103]:
business_state.groupby('state').size()

state
AZ     36500
BW      1055
EDH     3297
IL       808
MLN      161
NC      6835
NV     23591
ON       530
PA      4086
QC      5591
SC       325
WI      3066
dtype: int64

In [106]:
business_state.shape

(85845, 15)

In [111]:
business_state[business_state.review_count > 20].count()

attributes       25673
business_id      25673
categories       25673
city             25673
full_address     25673
hours            25673
latitude         25673
longitude        25673
name             25673
neighborhoods    25673
open             25673
review_count     25673
stars            25673
state            25673
type             25673
dtype: int64

In [127]:
len(business_state[business.state == 'AZ'].merge(review, how='inner', left_on='business_id', right_on='business_id').user_id.unique())

  if __name__ == '__main__':


247028

In [168]:
# To classify a business as Food:
business.attributes.iloc[3]['Good For']

{'breakfast': False,
 'brunch': False,
 'dessert': False,
 'dinner': False,
 'latenight': False,
 'lunch': True}

In [166]:
sum(business.attributes.iloc[3]['Good For'].values())

1

In [76]:
business_state.longitude.max()

8.5492494004572208

In [70]:
business_groups = business_state.groupby('state')

In [71]:
group = business_groups.get_group('EDH')

In [72]:
group.latitude.max()

56.000421000000003

In [766]:
nearby ={}
max_nearbies = 0

business_groups = business_state.groupby('state')

for name, group in business_groups:
    for i in range(0, len(group)):
        if group.latitude.iloc[i] < 80:
            near_group = group[(group.latitude < (group.latitude.iloc[i] + 1)) & (group.latitude > (group.latitude.iloc[i] - 1)) & 
                          (group.longitude < (group.longitude.iloc[i] + 1)) & (group.longitude > (group.longitude.iloc[i] - 1))]
        for j in range(0, len(near_group)):

            distance = vincenty(tuple(group.iloc[i][['latitude','longitude']]), tuple(near_group.iloc[j][['latitude','longitude']])).km
            if distance < 30:
                c += 1
                try:
                    nearby[group.business_id.iloc[i]].extend([near_group.business_id.iloc[j]])
                except KeyError as e:
                    nearby[group.business_id.iloc[i]] = [near_group.business_id.iloc[j]]
                try:
                    nearby[near_group.business_id.iloc[j]].extend([group.business_id.iloc[i]])
                except KeyError as e:
                    nearby[near_group.business_id.iloc[j]] = [group.business_id.iloc[i]]
        if max_nearbies < c:
            max_nearbies = c
    print (len(group))


KeyboardInterrupt: 

In [755]:
business_state[(abs(business_state.latitude) < (business_state.latitude[0] + 1))]

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{'Take-out': True, 'Has TV': False, 'Noise Lev...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Tuesday': {'open': '11:00', 'close': '21:00'...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business
1,"{'Outdoor Seating': False, 'Good For Groups': ...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.014660,Cool Springs Golf Center,[],False,5,2.5,PA,business
3,"{'Good For Groups': True, 'Has TV': True, 'Smo...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'open': '10:00', 'close': '19:00'...",40.408830,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business
4,"{'Parking': {'valet': False, 'validated': Fals...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Thursday': {'open': '11:00', 'close': '20:00...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business
5,"{'Good For Groups': True, 'Has TV': True, 'Smo...",KayYbHCt-RkbGcPdGOThNg,"[Bars, American (Traditional), Nightlife, Rest...",Carnegie,"141 Hawthorne St\nGreentree\nCarnegie, PA 15106","{'Monday': {'open': '11:00', 'close': '02:00'}...",40.415486,-80.067549,Alexion's Bar & Grill,[Greentree],True,23,4.0,PA,business
6,{'Accepts Credit Cards': True},b12U9TFESStdy7CsTtcOeg,"[Auto Repair, Automotive, Tires]",Carnegie,"718A Hope Hollow Rd\nCarnegie, PA 15106","{'Monday': {'open': '07:30', 'close': '18:00'}...",40.394588,-80.084454,Flynn's Tire & Auto Service,[],True,9,2.5,PA,business
7,{'Good for Kids': True},Sktj1eHQFuVa-M4bgnEh8g,"[Active Life, Mini Golf]",Carnegie,"920 Forsythe Rd\nCarnegie\nCarnegie, PA 15106",{},40.405404,-80.076267,Forsythe Miniature Golf & Snacks,[Carnegie],False,4,4.0,PA,business
8,{},3ZVKmuK2l7uXPE6lXY4Dbg,"[Roofing, Home Services, Decks & Railing, Cont...",Carnegie,"8 Logan St\nCarnegie\nCarnegie, PA 15106",{},40.406281,-80.090390,Quaker State Construction,[Carnegie],True,3,2.5,PA,business
9,{},QoDa50dc7g62xciFygXB9w,"[Veterinarians, Pets]",Pittsburgh,"2080 Greentree Rd\nPittsburgh, PA 15220",{},40.392207,-80.069134,Greentree Animal Clinic,[],True,7,4.0,PA,business


In [None]:
len(nearby)

In [None]:
import pickle
# save the nearby dictionary as it take a long time to build
with open('nearby_businesses_dict.pickle', 'wb') as handle:
    pickle.dump(nearby, handle, protocol=pickle.HIGHEST_PROTOCOL)
# To load the dictionary un-comment the following:___
# with open('nearby_businesses_dict.pickle', 'rb') as handle:
#     b = pickle.load(handle)

In [534]:
# nearby ={}
# max_nearbies = 0
# business_groups = business_state.groupby('state')
# for group in business_groups:
    
    
    
# for i in range(0, len(business)):
#     nearby[business.business_id.iloc[i]]=[]
#     c = 0
#     for j in range(0, len(business)):
#         if i==j: continue
#         distance = vincenty(tuple(business.iloc[i][['latitude','longitude']]), tuple(business.iloc[j][['latitude','longitude']])).km
#         if distance < 30:
#             c += 1
#             nearby[business.business_id.iloc[i]].extend([business.business_id.iloc[j]])        
#     if c > max_nearbies:
#         max_nearbies = c

In [170]:
pd.__version__

'0.19.2'