In [1]:
import json
import yaml
import pandas as pd
import numpy as np

from tqdm import tqdm

In [2]:
# Accumulator for the per-business records appended while reading the business dump.
businesses = []

In [3]:
# Stream the business dump (one JSON object per line) and keep only what the
# rest of the notebook needs: the business id, the raw category string and,
# when present, the raw "Ambience" attribute.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open('RESERVE/yelp_dataset/yelp_academic_dataset_business.json', 'r', encoding='utf-8') as b:
    for line in b:
        business = json.loads(line)
        bus_id, categories = business['business_id'], business['categories']
        attributes = business.get("attributes", None)

        # Businesses with an empty/null category field are dropped entirely.
        if not categories:
            continue

        business_to_add = {'business_id': bus_id, 'categories': categories}
        # "attributes" may be absent or null; copy Ambience only when it is set.
        if attributes and attributes.get("Ambience", None) is not None:
            business_to_add['ambience'] = attributes["Ambience"]
        businesses.append(business_to_add)

In [4]:
# Scratch check: length of a sample identifier string (presumably a Yelp id — TODO confirm).
len('bvN78flM8NLprQ1a1y5dRg')

22

In [5]:
# Number of business records currently held in `businesses`.
len(businesses)

160470

In [6]:
# Two buckets that partition `businesses`: restaurants vs. everything else.
restaurants = []
others = []

In [7]:
# Route every business into exactly one bucket.
# NOTE(review): at this point 'categories' is still the raw comma-joined string,
# so this is a substring test — any category containing the text 'Restaurants'
# matches. Confirm that is intended.
for business in businesses:
    bucket = restaurants if 'Restaurants' in business['categories'] else others
    bucket.append(business)

In [8]:
# Will hold every distinct category label seen on a restaurant.
categories = set()

In [9]:
# Gather the distinct labels from each restaurant's ', '-separated category string.
for restaurant in restaurants:
    labels = restaurant['categories'].split(', ')
    categories |= set(labels)

In [10]:
# Load the region -> cuisines mapping.
# JSON is read as UTF-8 explicitly (instead of the platform default encoding)
# and parsed straight from the file handle.
with open('utils/cuisine_mapping.json', 'r', encoding='utf-8') as m:
    region_cuisine = json.load(m)

In [11]:
# Invert region -> "cuisine, cuisine, ..." into a flat cuisine -> region lookup.
# If a cuisine is listed under several regions, the last region iterated wins.
cuisine_region = {
    cuisine: region
    for region, joined_cuisines in region_cuisine.items()
    for cuisine in joined_cuisines.split(', ')
}

In [12]:
# Set of every cuisine name known to the mapping (the keys of cuisine_region).
cuisines = set(cuisine_region)

In [13]:
# NOTE(review): not referenced again in the visible part of the notebook — candidate for removal.
restaurants_region = []

In [14]:
# Counter of restaurants for which at least one known cuisine is found.
total_cuisined = 0

In [15]:
# Reduce each restaurant's raw category string to the cuisines known to the
# mapping; restaurants without any recognised cuisine fall back to ['World'].
# NOTE: this rewrites restaurant['categories'] from str to list, so the cell
# cannot be re-run without reloading the data first.
for restaurant in restaurants:
    categories = restaurant['categories'].split(', ')
    found_cuisines = cuisines.intersection(categories)
    if found_cuisines:
        total_cuisined += 1
        restaurant['categories'] = list(found_cuisines)
    else:
        restaurant['categories'] = ['World']

In [16]:
# Restaurants that matched at least one known cuisine.
total_cuisined

23151

In [17]:
# Total number of restaurants, for comparison with total_cuisined.
len(restaurants)

50763

In [18]:
# Translate each restaurant's cuisines into de-duplicated region labels;
# anything the lookup does not know (including the 'World' placeholder) maps to 'World'.
for restaurant in restaurants:
    regions = {cuisine_region.get(cuisine, 'World') for cuisine in restaurant['categories']}
    restaurant['categories'] = list(regions)

In [19]:
# Number of restaurants that ended up with more than one region label.
len([r for r in restaurants if len(r['categories']) > 1])

2062

In [20]:
# Every non-restaurant business gets the single sentinel label 'Non_restaurant'
# (a fresh list per business, replacing the raw category string).
for business in others:
    business.update(categories=['Non_restaurant'])

In [21]:
# Rows (one dict per business) that will become the ambience DataFrame.
ambient_data = []

In [22]:
# Parse the raw 'ambience' string of each business into one row of flags.
# Values that do not parse to a mapping are skipped explicitly and counted,
# instead of relying on a silently swallowed ValueError from dict.update().
skipped_ambience = 0
for business in tqdm(businesses):
    if 'ambience' not in business:
        continue
    try:
        # SECURITY NOTE(review): yaml.load with FullLoader runs on external data.
        # yaml.safe_load is the more conservative choice — confirm it parses
        # these strings identically before switching.
        parsed = yaml.load(business['ambience'], Loader=yaml.FullLoader)
    except ValueError:
        # The original cell tolerated ValueError here too; keep that best-effort behavior.
        parsed = None
    if not isinstance(parsed, dict):
        # e.g. a bare scalar such as the string 'None' carries no flags.
        skipped_ambience += 1
        continue
    ambient_row = {'business_id': business['business_id']}
    ambient_row.update(parsed)
    ambient_data.append(ambient_row)

if skipped_ambience:
    print(f'Skipped {skipped_ambience} businesses whose ambience value is not a mapping')

100%|██████████| 160470/160470 [00:26<00:00, 5954.54it/s]


In [23]:
# One row per business with a parsed ambience mapping; keys missing from a row become NaN.
ambient_df = pd.DataFrame(ambient_data)

In [24]:
# Preview the first rows of the ambience table.
ambient_df.head()

Unnamed: 0,business_id,touristy,hipster,romantic,divey,intimate,trendy,upscale,classy,casual
0,6iYb2HFDywm3zjuRg0shjw,False,False,False,False,False,False,False,False,True
1,tCbdrRPZA0oiIYSmHG3J0w,False,False,False,False,False,False,False,False,True
2,D4JtQNTI4X3KcbzacDJsMw,False,False,False,False,False,False,False,False,True
3,HPA_qyMEddpAEtFof02ixg,False,False,False,False,False,False,False,False,True
4,dmbbf3AqeG61_QHRZi1M1w,False,False,False,False,False,False,False,False,False


In [25]:
# Persist the per-business ambience table (without the index column).
# NOTE(review): this writes under 'yelp_academic/data/' while the other exports in
# this notebook write under 'yelp_data/' — confirm the path is intended.
ambient_df.to_csv('yelp_academic/data/business_ambience.csv', index = False)

In [26]:
# The raw ambience string is no longer needed on the business records; remove it where present.
for business in businesses:
    business.pop('ambience', None)

In [27]:
# Lookup: business_id -> dict of boolean region flags.
id_category = {}

In [28]:
# One-hot encode each business's label list: start from an all-False flag dict
# over the fixed label set, then switch on every label the business carries.
# A label outside the fixed set would be added as an extra key.
for business in businesses:
    categories = dict.fromkeys(
        ('Africa', 'N_America', 'C_America', 'S_America', 'Caribbean',
         'E_Asia', 'S_Asia', 'SE_Asia', 'W_Asia',
         'C_Europe', 'E_Europe', 'S_Europe', 'W_Europe',
         'Oceania', 'World', 'Non_restaurant'),
        False,
    )
    for category in business['categories']:
        categories[category] = True
    id_category[business['business_id']] = categories

### Photos: Region

In [29]:
# Accumulator for one record per photo (photo_id + owning business_id).
photos_businesses_all = []

In [30]:
# Read the photo dump (one JSON object per line) and keep only the photo id
# and the id of the business it belongs to.
# UTF-8 is requested explicitly so decoding does not depend on the platform locale.
with open('yelp_photos/photos.json', 'r', encoding='utf-8') as p:
    for line in p:
        photo = json.loads(line)
        photos_businesses_all.append({'photo_id': photo['photo_id'],
                                      'business_id': photo['business_id']})

In [31]:
# Copy the owning business's region flags onto each photo record.
# Raises KeyError for a photo whose business_id is absent from id_category.
for photo in photos_businesses_all:
    region_flags = id_category[photo['business_id']]
    photo.update(region_flags)

In [32]:
# Turn the list of photo records into a DataFrame.
# NOTE: the same name is rebound from a list to a DataFrame here.
photos_businesses_all = pd.DataFrame(photos_businesses_all)

In [33]:
# Preview the photo table with its region flag columns.
photos_businesses_all.head()

Unnamed: 0,photo_id,business_id,Africa,N_America,C_America,S_America,Caribbean,E_Asia,S_Asia,SE_Asia,W_Asia,C_Europe,E_Europe,S_Europe,W_Europe,Oceania,World,Non_restaurant
0,Un_Og6jfhazVn7CxszkKEw,R1sIqGfKpF5f3HV3vyNsbg,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,BFE1AFOs27scnnfeBf99ZA,vdT7zlrLB2DL9pStDUs91A,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
2,7t-C0r1JRdoVD9FS7M-N7Q,c5GzrObEdj7eNVOdAxrppg,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,rLnw0d-YYZvT9kR4y7h7_Q,aQa7N5ZbPhCoKYGGB-gqfg,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False
4,Cv5M8MDw8a5NEWvw2AQ4nw,B-s6qOFD75syhwWjp518aA,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [34]:
# Persist the full photo table (all businesses), without the index column.
photos_businesses_all.to_csv('yelp_data/photos_businesses_all.csv', index = False)

In [35]:
# Keep only photos of restaurants, then drop the now-constant 'Non_restaurant' column.
is_restaurant_photo = photos_businesses_all['Non_restaurant'] == False
photos_restaurants_global = (
    photos_businesses_all
    .loc[is_restaurant_photo]
    .drop(columns=['Non_restaurant'])
)

In [36]:
# Preview the restaurant-only photo table.
photos_restaurants_global.head()

Unnamed: 0,photo_id,business_id,Africa,N_America,C_America,S_America,Caribbean,E_Asia,S_Asia,SE_Asia,W_Asia,C_Europe,E_Europe,S_Europe,W_Europe,Oceania,World
0,Un_Og6jfhazVn7CxszkKEw,R1sIqGfKpF5f3HV3vyNsbg,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,BFE1AFOs27scnnfeBf99ZA,vdT7zlrLB2DL9pStDUs91A,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,7t-C0r1JRdoVD9FS7M-N7Q,c5GzrObEdj7eNVOdAxrppg,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,rLnw0d-YYZvT9kR4y7h7_Q,aQa7N5ZbPhCoKYGGB-gqfg,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,Cv5M8MDw8a5NEWvw2AQ4nw,B-s6qOFD75syhwWjp518aA,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [37]:
# Persist the restaurant-only photo table (still including the 'World' flag).
photos_restaurants_global.to_csv('yelp_data/photos_restaurants_global.csv', index = False)

In [38]:
# Keep only restaurant photos not flagged 'World', then drop the 'World' column.
has_region = photos_restaurants_global['World'] == False
photos_restaurants_regional = (
    photos_restaurants_global
    .loc[has_region]
    .drop(columns=['World'])
)

In [39]:
# Preview the regional photo table; the index keeps the row labels of the unfiltered frame.
photos_restaurants_regional.head()

Unnamed: 0,photo_id,business_id,Africa,N_America,C_America,S_America,Caribbean,E_Asia,S_Asia,SE_Asia,W_Asia,C_Europe,E_Europe,S_Europe,W_Europe,Oceania
0,Un_Og6jfhazVn7CxszkKEw,R1sIqGfKpF5f3HV3vyNsbg,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2,7t-C0r1JRdoVD9FS7M-N7Q,c5GzrObEdj7eNVOdAxrppg,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,Cv5M8MDw8a5NEWvw2AQ4nw,B-s6qOFD75syhwWjp518aA,False,True,False,False,False,False,False,False,False,False,False,False,False,False
8,gPOXcGNQcB2V5pAKCncxOQ,L1nn5Cge3wBUHydmX8XwWA,False,False,False,False,False,True,False,False,False,False,False,False,False,False
9,R0aWKXSB5V1zlDe0frSXSw,ZdGTTzkwTb2bYI5CBQsKVw,False,False,False,False,False,True,False,False,False,False,False,False,False,False


In [40]:
# Persist the regional photo table, without the index column.
photos_restaurants_regional.to_csv('yelp_data/photos_restaurants_regional.csv', index = False)

In [41]:
# Row counts at each filtering stage: all photos, restaurant photos, regional restaurant photos.
len(photos_businesses_all), len(photos_restaurants_global), len(photos_restaurants_regional)

(200000, 170222, 85254)

In [42]:
# Number of restaurant photos flagged 'Oceania'.
len(photos_restaurants_global.loc[photos_restaurants_global['Oceania'] == True])

171

In [43]:
# Total number of photo rows.
len(photos_businesses_all)

200000

In [44]:
from sklearn.model_selection import train_test_split

In [45]:
# 80/10/10 train/dev/test split of the regional photo table: carve off 20%,
# then halve that hold-out. Both steps use random_state 42.
# NOTE(review): rows are photos, so photos of one business can land in different splits.
reg_train, reg_holdout = train_test_split(photos_restaurants_regional, test_size = 0.2, random_state = 42)
reg_dev, reg_test = train_test_split(reg_holdout, test_size = 0.5, random_state = 42)

In [46]:
def dataframe_regions(df):
    """Count, for each region column, the rows of ``df`` whose flag equals True.

    Args:
        df: DataFrame with one column per region name listed below.

    Returns:
        dict mapping each region name to its row count, in the order listed below.
    """
    region_names = (
        'Africa', 'N_America', 'C_America', 'S_America', 'Caribbean',
        'E_Asia', 'S_Asia', 'SE_Asia', 'W_Asia',
        'C_Europe', 'E_Europe', 'S_Europe', 'W_Europe', 'Oceania',
    )
    return {name: len(df.loc[df[name] == True]) for name in region_names}

In [52]:
# Size of the regional test split.
len(reg_test)

8526

In [54]:
# Per-region photo counts in the regional test split.
dataframe_regions(reg_test)

{'Africa': 164,
 'N_America': 2042,
 'C_America': 9,
 'S_America': 467,
 'Caribbean': 263,
 'E_Asia': 2682,
 'S_Asia': 337,
 'SE_Asia': 1068,
 'W_Asia': 380,
 'C_Europe': 112,
 'E_Europe': 40,
 'S_Europe': 1440,
 'W_Europe': 608,
 'Oceania': 23}

In [157]:
# Give each regional split a fresh 0..n-1 index, discarding the shuffled row labels.
reg_train, reg_dev, reg_test = (
    split.reset_index(drop=True) for split in (reg_train, reg_dev, reg_test)
)

In [158]:
# Persist the regional train/dev/test splits, without the index column.
# The frames are named reg_train / reg_dev / reg_test; the bare names
# train / dev / test are never defined in this notebook and raise NameError
# on a fresh kernel.
reg_train.to_csv('yelp_data/photos_restaurants_regional_train.csv', index = False)
reg_dev.to_csv('yelp_data/photos_restaurants_regional_dev.csv', index = False)
reg_test.to_csv('yelp_data/photos_restaurants_regional_test.csv', index = False)

In [56]:
# Display the ambience table (bare expression renders the frame).
# NOTE(review): .head() or .sample() would keep the rendered output short.
ambient_df

Unnamed: 0,business_id,touristy,hipster,romantic,divey,intimate,trendy,upscale,classy,casual
0,6iYb2HFDywm3zjuRg0shjw,False,False,False,False,False,False,False,False,True
1,tCbdrRPZA0oiIYSmHG3J0w,False,False,False,False,False,False,False,False,True
2,D4JtQNTI4X3KcbzacDJsMw,False,False,False,False,False,False,False,False,True
3,HPA_qyMEddpAEtFof02ixg,False,False,False,False,False,False,False,False,True
4,dmbbf3AqeG61_QHRZi1M1w,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
43426,r5Uag1JqYjr2nbxQCVqm8A,False,False,False,False,,,False,True,True
43427,Zl6SUy6x9jqjRu2HbtEO6A,False,False,False,False,False,False,False,False,False
43428,Q78fYV6B6P6GmX07YVgi4g,False,False,False,False,False,,False,False,True
43429,uXdQkuEtvLAzfc3MsO-sTQ,False,False,False,,False,False,False,False,True


### Ambience

In [57]:
# Accumulator for one record per photo (photo_id + owning business_id).
photos_ambience = []

In [59]:
# Read the photo dump (one JSON object per line) and keep only the photo id
# and the id of the business it belongs to.
# UTF-8 is requested explicitly so decoding does not depend on the platform locale.
with open('yelp_photos/photos.json', 'r', encoding='utf-8') as p:
    for line in p:
        photo = json.loads(line)
        photos_ambience.append({'photo_id': photo['photo_id'],
                                'business_id': photo['business_id']})

In [64]:
# Turn the list of photo records into a DataFrame (the name is rebound from list to DataFrame).
photos_ambience = pd.DataFrame(photos_ambience)

In [65]:
# Preview the photo table before ambience flags are joined in.
photos_ambience.head()

Unnamed: 0,photo_id,business_id
0,Un_Og6jfhazVn7CxszkKEw,R1sIqGfKpF5f3HV3vyNsbg
1,BFE1AFOs27scnnfeBf99ZA,vdT7zlrLB2DL9pStDUs91A
2,7t-C0r1JRdoVD9FS7M-N7Q,c5GzrObEdj7eNVOdAxrppg
3,rLnw0d-YYZvT9kR4y7h7_Q,aQa7N5ZbPhCoKYGGB-gqfg
4,Cv5M8MDw8a5NEWvw2AQ4nw,B-s6qOFD75syhwWjp518aA


In [67]:
# Attach each business's ambience flags to its photos. With no `how` argument
# the join is an inner join, so photos of businesses absent from ambient_df are dropped.
photos_ambience = photos_ambience.merge(ambient_df, on='business_id')

In [68]:
# Discard rows that have no photo id.
photos_ambience = photos_ambience.dropna(subset=['photo_id'])

In [69]:
# Discard rows that have no business id.
photos_ambience = photos_ambience.dropna(subset=['business_id'])

In [79]:
# In every object-dtype column, turn both Python None and the literal string
# 'None' into NaN so that missing values share a single representation.
obj_columns = list(photos_ambience.select_dtypes(include=['object']).columns.values)
photos_ambience[obj_columns] = photos_ambience[obj_columns].replace([None], np.nan).replace('None', np.nan)

In [81]:
# Treat every remaining missing value as False. The fill applies to all columns, not only the flags.
photos_ambience = photos_ambience.fillna(False)

In [84]:
# Keep only photos whose business has at least one truthy ambience flag.
ambience_columns = ['touristy', 'hipster', 'romantic', 'divey', 'intimate',
                    'trendy', 'upscale', 'classy', 'casual']
has_any_ambience = photos_ambience[ambience_columns].any(axis=1)
photos_ambience = photos_ambience[has_any_ambience]

In [85]:
# 80/10/10 train/dev/test split of the ambience photo table: carve off 20%,
# then halve that hold-out. Both steps use random_state 42.
amb_train, amb_holdout = train_test_split(photos_ambience, test_size = 0.2, random_state = 42)
amb_dev, amb_test = train_test_split(amb_holdout, test_size = 0.5, random_state = 42)

In [87]:
def df_ambience(df):
    """Summarise a frame of ambience flags.

    Args:
        df: DataFrame with one column per ambience name listed below.

    Returns:
        Tuple ``(row_count, counts)`` where ``counts`` maps each ambience name
        to the number of rows whose flag equals True, in the order listed below.
    """
    ambience_names = ('touristy', 'hipster', 'romantic', 'divey', 'intimate',
                      'trendy', 'upscale', 'classy', 'casual')
    counts = {name: len(df.loc[df[name] == True]) for name in ambience_names}
    return len(df), counts

In [91]:
# Row count and per-flag counts for the ambience test split.
df_ambience(amb_test)

(15074,
 {'touristy': 724,
  'hipster': 1293,
  'romantic': 463,
  'divey': 318,
  'intimate': 409,
  'trendy': 3135,
  'upscale': 620,
  'classy': 7342,
  'casual': 11363})

In [159]:
# Give each ambience split a fresh 0..n-1 index, discarding the shuffled row labels.
amb_train, amb_dev, amb_test = (
    split.reset_index(drop=True) for split in (amb_train, amb_dev, amb_test)
)

In [160]:
# Persist the ambience train/dev/test splits, without the index column.
amb_train.to_csv('yelp_data/business_ambience_train.csv', index = False)
amb_dev.to_csv('yelp_data/business_ambience_dev.csv', index = False)
amb_test.to_csv('yelp_data/business_ambience_test.csv', index = False)

### Restaurant or not

In [131]:
# Collapse the label list into one boolean: a business is a restaurant unless
# its labels are exactly ['Non_restaurant']. The 'categories' key is removed.
for business in businesses:
    business['restaurant'] = business.pop('categories') != ['Non_restaurant']

In [134]:
# Remove any leftover raw ambience string from the business records.
# NOTE(review): an identical cleanup loop runs earlier in this notebook, so this is likely a no-op.
for business in businesses:
    business.pop('ambience', None)

In [140]:
# Turn the business records into a DataFrame (the name is rebound from list to DataFrame).
businesses = pd.DataFrame(businesses)

In [137]:
# Fresh accumulator for photo records.
# NOTE: this rebinds a name that earlier in the notebook holds a DataFrame.
photos_businesses_all = []

In [138]:
# Read the photo dump (one JSON object per line) and keep only the photo id
# and the id of the business it belongs to.
# UTF-8 is requested explicitly so decoding does not depend on the platform locale.
with open('yelp_photos/photos.json', 'r', encoding='utf-8') as p:
    for line in p:
        photo = json.loads(line)
        photos_businesses_all.append({'photo_id': photo['photo_id'],
                                      'business_id': photo['business_id']})

In [139]:
# Photo records as a DataFrame, ready to be joined with the restaurant flag.
rest_or_not = pd.DataFrame(photos_businesses_all)

In [141]:
# Attach each business's columns to its photos. With no `how` argument the join
# is an inner join, so photos of businesses absent from `businesses` are dropped.
rest_or_not = rest_or_not.merge(businesses, on='business_id')

In [144]:
# Preview the joined photo / restaurant-flag table.
rest_or_not.head()

Unnamed: 0,photo_id,business_id,restaurant
0,Un_Og6jfhazVn7CxszkKEw,R1sIqGfKpF5f3HV3vyNsbg,True
1,vjjyXgKxc1Wu6v0aaHLFuQ,R1sIqGfKpF5f3HV3vyNsbg,True
2,VWWdSBLn7f2eaY16GruQOg,R1sIqGfKpF5f3HV3vyNsbg,True
3,fSJ2--legv4crDHIJWIOmA,R1sIqGfKpF5f3HV3vyNsbg,True
4,liRGQZfixy5anDcjCqGuPA,R1sIqGfKpF5f3HV3vyNsbg,True


In [145]:
# 80/10/10 train/dev/test split of the restaurant-or-not table: carve off 20%,
# then halve that hold-out. Both steps use random_state 42.
bus_train, bus_holdout = train_test_split(rest_or_not, test_size = 0.2, random_state = 42)
bus_dev, bus_test = train_test_split(bus_holdout, test_size = 0.5, random_state = 42)

In [163]:
# Display the training split (bare expression renders the frame).
# NOTE(review): .head() would keep the rendered output short.
bus_train

Unnamed: 0,photo_id,business_id,restaurant
0,WGp63RIuT6C4qxboG3FgpQ,J7Hs5_PGlZajNRxGptGRvw,True
1,eKa8I6RGgi7W_HOpqFOgtA,LnBBOksOrzUiiDQkjUeMdg,True
2,jx9rraUeZ3JvazMvFLw0yw,MkGPncZD8oqp-AHkLW740g,True
3,8E3MwgLc-ggdDfokoYPd9Q,Ur7o-5FviIC9YmJqsGKR6A,True
4,eLVN8Y2DSSU2Gt4haHxwzA,KbyrRWuEeK6uEbtpAUgIdg,True
...,...,...,...
159995,aMvsc1kffaZU-c2qCRibTg,vBKwHwBqkFdxw4VetbYSxQ,True
159996,Fl4je3JGtt2h_UwrKo3xMw,3u3Xllz8NezyMjH6zsGp9w,True
159997,mErOCowqwADIKpLf98Rgmg,Xge6ajeuR7CEio9vSdFsJA,True
159998,TjjzxVtqKbSwT0NGFERbPA,8tLzmoG8Dluy80_iULO-Xg,True


In [151]:
# Size of the restaurant-or-not dev split.
len(bus_dev)

20000

In [161]:
# Give each restaurant-or-not split a fresh 0..n-1 index, discarding the shuffled row labels.
bus_train, bus_dev, bus_test = (
    split.reset_index(drop=True) for split in (bus_train, bus_dev, bus_test)
)

In [162]:
# Persist the restaurant-or-not train/dev/test splits, without the index column.
bus_train.to_csv('yelp_data/business_restaurant_train.csv', index = False)
bus_dev.to_csv('yelp_data/business_restaurant_dev.csv', index = False)
bus_test.to_csv('yelp_data/business_restaurant_test.csv', index = False)