## Restaurant Graph Generator

In [1]:
# Run configuration. The whole dict is also written to "<filename>.json" at the
# end of the notebook so the settings used to build a graph are kept with it.
parameters = {
    "features": {
        # When True, the seven extracted attribute columns are added to each node's features.
        "attributes": True,
        # (enabled, top_n): `enabled` toggles the category features and `top_n`
        # is the count handed to get_top_categories.
        "categories": (True, 50)
    },
    "distance": 500, # Maximum edge distance (inclusive), in the units of the edges CSV `distance` column
    # Output path without extension; ".gpickle" and ".json" are appended on write.
    "filename": "../graphs/restaurants_basic",
}

In [2]:
import json
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import torch

### Read Data

In [3]:
# Restaurant table (business_id, location, ratings, attributes, ...) and the
# precomputed restaurant-pair table (id1, id2, distance).
restaurants_df = pd.read_csv("../datasets/2017-2018_restaurants.csv")
edges_df = pd.read_csv("../datasets/2017-2018_restaurant-edges.csv")

In [4]:
# Preview the first rows to check the restaurant columns.
restaurants_df.head(2)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True


In [5]:
# Preview the first rows to check the edge columns (id1, id2, distance).
edges_df.head(3)

Unnamed: 0,id1,id2,distance
0,Q2vefh0tGhtCGQDK1FI7cw,ssK5vKQ_eN0VyGoYKOmkeQ,441
1,Q2vefh0tGhtCGQDK1FI7cw,tSZTPA7uERhWkKq_jbl3Eg,209
2,Q2vefh0tGhtCGQDK1FI7cw,bSy6VVJIdYPza1Bj9_Eicw,450


In [6]:
# Keep only restaurants that actually have a star rating.
has_star_rating = restaurants_df['stars'].notna()
restaurants_df = restaurants_df[has_star_rating]

In [7]:
# Report dataset sizes after dropping restaurants without a star rating.
print(f"There are {len(restaurants_df)} restaurants and {len(edges_df)} edges.")

There are 29963 restaurants and 494203 edges.


### Calculate Popularity Category

In [8]:
from restaurant_utils import calculate_popularity

# Project helper (implementation not shown here). Judging by the later preview,
# the returned frame carries `popularity_value` and `popularity` columns —
# confirm in restaurant_utils. `popularity` is used below as each node's label.
restaurants_df = calculate_popularity(restaurants_df)

### Extract attributes into columns
**Attributes supported:** RestaurantsPriceRange2, OutdoorSeating, RestaurantsGoodForGroups, BusinessAcceptsCreditCards, GoodForKids, RestaurantsDelivery, Caters

In [9]:
from restaurant_utils import extract_attributes

# The CSV holds attributes as a stringified dict, so the attribute objects are
# re-read from the original line-delimited Yelp business JSON; only the join
# key and the attributes column are kept.
all_business_df = pd.read_json(
    '../yelp/yelp_academic_dataset_business.json', lines=True
)[["business_id", "attributes"]]

restaurants_df = extract_attributes(restaurants_df, all_business_df)

In [10]:
# Preview the frame with the popularity and extracted attribute columns added.
restaurants_df.head(3)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes_x,categories,hours,checkin_count,...,popularity_value,popularity,attributes_y,price_tier,outdoors,good_for_groups,has_credit_card,good_for_kids,has_delivery,caters
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,...,475.040397,2,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",2,1,1,1,0,1,1
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,...,737.162331,2,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",2,1,1,1,1,1,1
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,...,261.877324,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...",2,1,1,0,1,1,1


### Get top categories

In [11]:
from restaurant_utils import get_top_categories

# The second argument is the category count configured in `parameters`.
# Presumably the helper ranks categories by how often they occur — confirm in
# restaurant_utils.
top_categories = get_top_categories(restaurants_df, parameters["features"]["categories"][1])

In [12]:
# Display the selected categories on one line.
# NOTE(review): names are joined with a single space, so multi-word categories
# (e.g. "coffee & tea") run into their neighbours in the output.
" ".join(top_categories)

'restaurants food nightlife bars american (traditional) coffee & tea sandwiches breakfast & brunch american (new) pizza fast food burgers mexican chinese italian specialty food seafood event planning & services salad desserts bakeries japanese cafes grocery beer wine & spirits shopping sushi bars caterers ice cream & frozen yogurt chicken wings asian fusion food trucks cocktail bars pubs delis sports bars vegetarian juice bars & smoothies mediterranean barbeque thai diners steakhouses arts & entertainment wine bars soup tex-mex gluten-free vegan'

### Generate Graph

In [13]:
# Undirected graph. Nodes are keyed directly by business_id, so no separate
# business_id -> integer index mapping is maintained.
G = nx.Graph()

In [14]:
from restaurant_utils import get_categories

# The feature toggles are fixed for the whole run, so read them once up front.
use_attributes = parameters["features"]["attributes"]
use_categories = parameters["features"]["categories"][0]

# Attribute columns, in the order they appear in every node's feature vector.
attribute_columns = ["price_tier", "outdoors", "good_for_groups", "has_credit_card",
                     "good_for_kids", "has_delivery", "caters"]

# One node per restaurant, keyed by business_id. The label is the popularity
# class; the feature vector is the attribute values followed by whatever
# get_categories returns for the restaurant's categories.
for _, restaurant in restaurants_df.iterrows():
    node_features = []

    if use_attributes:
        node_features.extend(restaurant[column] for column in attribute_columns)

    if use_categories:
        node_features.extend(get_categories(restaurant["categories"], top_categories))

    G.add_node(
        restaurant["business_id"],
        node_label=restaurant["popularity"],
        node_feature=node_features,
    )

In [15]:
# Connect restaurants whose precomputed distance is within the configured
# maximum (inclusive). Rows that mention a business which is not a node in G
# (for example one dropped earlier for lacking a star rating) are skipped, so
# adding edges never creates attribute-less nodes implicitly.
#
# The filtering is vectorized: iterating ~500k rows with iterrows() and testing
# membership per row is far slower and yields exactly the same edges.
known_ids = set(G.nodes)
within_distance = edges_df["distance"] <= parameters["distance"]
endpoints_known = edges_df["id1"].isin(known_ids) & edges_df["id2"].isin(known_ids)

kept_edges = edges_df.loc[within_distance & endpoints_known, ["id1", "id2"]]

# Plain (id1, id2) tuples, in the original row order.
G.add_edges_from(kept_edges.itertuples(index=False, name=None))

### Summary

In [16]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Every node gets a feature vector built the same way, so the first node's
# vector gives the feature count. Read it lazily instead of materialising all
# node data, and fall back to an empty vector when the graph has no nodes so
# the summary prints 0 rather than raising IndexError.
first_features = next(
    (features for _, features in G.nodes(data="node_feature")),
    [],
)
print(f"Number of features: {len(first_features)}")

Number of nodes: 29963
Number of edges: 491371
Number of features: 57


### Convert features to Tensor

In [17]:
# Swap each node's feature list for a float tensor. Iterating with data=True
# yields the graph's own attribute dicts, so assigning into them updates G.
for _, node_attrs in G.nodes(data=True):
    node_attrs["node_feature"] = torch.FloatTensor(node_attrs["node_feature"])

### Write graph to disk

In [18]:
# Serialize the graph with the standard-library pickle module.
# nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0; it was a
# thin wrapper around pickle.dump with HIGHEST_PROTOCOL, so the file written
# here has the same format and loads with pickle.load (or nx.read_gpickle on
# older NetworkX versions).
with open(parameters["filename"] + ".gpickle", "wb") as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Save the run parameters next to the graph for provenance. json.dump writes
# the "categories" tuple as a JSON array, so it comes back as a list on load.
with open(parameters["filename"] + ".json", "w") as outfile: 
    json.dump(parameters, outfile)