## Restaurant Graph Generator

In [1]:
# Run configuration for the graph build. The same dict is written next to the
# graph as JSON at the end of the notebook.
parameters = {
    "features": {
        # Add the binary attribute columns to the node features.
        "attributes": True,
        # (enabled, [(combination size, top-k), ...]).
        # The first entry must describe single categories (size 1): its top-k
        # is the value handed to get_cat2idx when the category index is built.
        "categories": (
            True,
            [
                (1, 50),
                (2, 50),
            ],
        ),
        # One-hot encode the city / state indices into the node features.
        "city": True,
        "state": True,
    },
    # Edges whose `distance` value exceeds this limit are not added to the graph.
    "distance": 500,
    # Output path without extension; ".gpickle" and ".json" are appended on write.
    "filename": "../graphs/restaurants_category_combination",
}

In [2]:
import json
import pickle

import networkx as nx
import numpy as np
import pandas as pd
import torch

### Read Data

In [3]:
# Load the restaurant table and the restaurant-to-restaurant edge list.
restaurants_df, edges_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/2017-2018_restaurants.csv",
        "../datasets/2017-2018_restaurant-edges.csv",
    )
)

In [4]:
# Preview the first two restaurant rows to check the loaded columns.
restaurants_df.head(2)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True


In [5]:
# Preview the edge list: each row pairs two business ids (`id1`, `id2`) with a `distance`.
edges_df.head(3)

Unnamed: 0,id1,id2,distance
0,Q2vefh0tGhtCGQDK1FI7cw,ssK5vKQ_eN0VyGoYKOmkeQ,441
1,Q2vefh0tGhtCGQDK1FI7cw,tSZTPA7uERhWkKq_jbl3Eg,209
2,Q2vefh0tGhtCGQDK1FI7cw,bSy6VVJIdYPza1Bj9_Eicw,450


In [6]:
# Keep only restaurants that have a star rating; the original index labels
# of the surviving rows are left as they are.
restaurants_df = restaurants_df.dropna(subset=["stars"])

In [7]:
# Report dataset sizes after filtering.
restaurant_count, edge_count = len(restaurants_df), len(edges_df)
print(f"There are {restaurant_count} restaurants and {edge_count} edges.")

There are 29963 restaurants and 494203 edges.


### Calculate Popularity Category

In [8]:
from restaurant_utils import calculate_popularity

# Project helper. The frame preview further down shows `popularity_value` and
# `popularity` columns, presumably added by this call (TODO confirm); the
# `popularity` column is later used as the node label.
restaurants_df = calculate_popularity(restaurants_df)

### Extract attributes into columns
**Attributes supported:** RestaurantsPriceRange2, OutdoorSeating, RestaurantsGoodForGroups, BusinessAcceptsCreditCards, GoodForKids, RestaurantsDelivery, Caters

In [9]:
from restaurant_utils import extract_attributes

# The attributes are read from the raw Yelp business dump (JSON lines), where
# they are stored as JSON; only the id and attributes columns are needed.
business_json_path = '../yelp/yelp_academic_dataset_business.json'
all_business_df = pd.read_json(business_json_path, lines=True)[["business_id", "attributes"]]

# Project helper that turns the supported attributes into columns of restaurants_df.
restaurants_df = extract_attributes(restaurants_df, all_business_df)

In [10]:
# Preview the frame after the popularity and attribute steps.
restaurants_df.head(3)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,categories,hours,checkin_count,review_count,...,is_open_year_after,popularity_value,popularity,RestaurantsPriceRange2,OutdoorSeating,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,RestaurantsDelivery,Caters
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,...,True,475.040397,2,2,1,1,1,0,1,1
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,...,True,737.162331,2,2,1,1,1,1,1,1
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,28,...,True,261.877324,1,2,1,1,0,1,1,1


### Get top categories

In [11]:
from restaurant_utils import get_top_k_p_combinations, get_cat2idx, get_category_features
# Gather the top-k category combinations for every (combination size, top-k)
# pair in the configuration, in configuration order.
category_specs = parameters["features"]["categories"][1]
top_categories = []

for num_comb, topk in category_specs:
    top_categories.extend(get_top_k_p_combinations(restaurants_df, num_comb, topk))

In [12]:
# Inspect the selected category combinations (tuples of category names).
top_categories

[('Restaurants',),
 ('Food',),
 ('Nightlife',),
 ('Bars',),
 ('American (Traditional)',),
 ('Coffee & Tea',),
 ('Sandwiches',),
 ('Breakfast & Brunch',),
 ('American (New)',),
 ('Pizza',),
 ('Fast Food',),
 ('Burgers',),
 ('Mexican',),
 ('Chinese',),
 ('Italian',),
 ('Specialty Food',),
 ('Seafood',),
 ('Event Planning & Services',),
 ('Salad',),
 ('Desserts',),
 ('Bakeries',),
 ('Japanese',),
 ('Cafes',),
 ('Grocery',),
 ('Beer',),
 ('Wine & Spirits',),
 ('Shopping',),
 ('Sushi Bars',),
 ('Caterers',),
 ('Ice Cream & Frozen Yogurt',),
 ('Chicken Wings',),
 ('Asian Fusion',),
 ('Food Trucks',),
 ('Cocktail Bars',),
 ('Pubs',),
 ('Delis',),
 ('Sports Bars',),
 ('Vegetarian',),
 ('Juice Bars & Smoothies',),
 ('Mediterranean',),
 ('Barbeque',),
 ('Thai',),
 ('Diners',),
 ('Steakhouses',),
 ('Arts & Entertainment',),
 ('Wine Bars',),
 ('Soup',),
 ('Tex-Mex',),
 ('Gluten-Free',),
 ('Vegan',),
 ('Restaurants', 'Food'),
 ('Food', 'Restaurants'),
 ('Nightlife', 'Bars'),
 ('Restaurants', 'Night

In [15]:
# Build the category <-> index lookups using the top-k of the first
# (single-category) entry of the configuration.
first_category_spec = parameters["features"]["categories"][1][0]
cat2idx, idx2cat = get_cat2idx(restaurants_df, first_category_spec[1])

In [16]:
# Project helper. The next preview shows two new columns,
# `top_categories_vector` and `top_categories_combination_vector`, presumably
# added by this call (TODO confirm); both are stacked into the node features later.
restaurants_df = get_category_features(restaurants_df, cat2idx, top_categories)

In [17]:
# Preview the frame including the category vector columns.
restaurants_df.head(2)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,categories,hours,checkin_count,review_count,...,popularity,RestaurantsPriceRange2,OutdoorSeating,RestaurantsGoodForGroups,BusinessAcceptsCreditCards,GoodForKids,RestaurantsDelivery,Caters,top_categories_vector,top_categories_combination_vector
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,...,2,2,1,1,1,0,1,1,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,...,2,2,1,1,1,1,1,1,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


### Encode extra features: city and state

In [18]:
from restaurant_utils import get_encoder, encode_cities_states, to_categorical

In [19]:
# Build an encoder from the restaurant frame and apply it. The node-feature
# cell later reads `city_idx` and `state_idx`, presumably added by
# encode_cities_states (TODO confirm).
encoder = get_encoder(restaurants_df)
restaurants_df = encode_cities_states(restaurants_df, encoder)

### Generate Graph

In [20]:
# Empty undirected graph; restaurant nodes and edges between them are added in the next cells.
G = nx.Graph()

In [21]:
# nodes
# Assemble the node feature matrix from the feature groups enabled in
# `parameters`, then add one node per restaurant.
feature_flags = parameters["features"]
use_categories, category_specs = feature_flags["categories"]

# Collect the enabled groups first so that no group depends on another one
# having been enabled (previously `features` only existed when "attributes"
# was True, and every other configuration failed with a NameError).
feature_blocks = []

if feature_flags["attributes"]:
    feature_blocks.append(restaurants_df[[
        "OutdoorSeating", "RestaurantsGoodForGroups", "BusinessAcceptsCreditCards",
        "GoodForKids", "RestaurantsDelivery", "Caters"
    ]].values)

if use_categories:
    feature_blocks.append(np.stack(restaurants_df["top_categories_vector"].values))

# NOTE(review): combination vectors are added whenever more than one category
# spec is configured, independently of the "categories" on/off flag (unchanged
# from the original) — confirm this is intended.
if len(category_specs) > 1:  # category combinations exist
    feature_blocks.append(
        np.stack(restaurants_df["top_categories_combination_vector"].values))

if feature_flags["city"]:
    feature_blocks.append(
        to_categorical(restaurants_df["city_idx"],
                       num_classes=restaurants_df["city_idx"].max() + 1))

if feature_flags["state"]:
    feature_blocks.append(
        to_categorical(restaurants_df["state_idx"],
                       num_classes=restaurants_df["state_idx"].max() + 1))

if feature_blocks:
    features = np.concatenate(feature_blocks, axis=1)  # [num_nodes, num_features]
else:
    # No feature group enabled: every node gets an empty feature vector.
    features = np.zeros((len(restaurants_df), 0))

node_label = restaurants_df["popularity"].values  # [num_nodes,]
business_ids = restaurants_df["business_id"].values

assert features.shape[0] == len(business_ids) == len(node_label), \
    "feature matrix, labels and restaurant ids must be row-aligned"

# `features` and `node_label` are positional numpy arrays, so walk the rows by
# position. Indexing them with the DataFrame's index labels (as iterrows()
# yields) breaks as soon as the index is not 0..n-1, e.g. after rows were
# filtered out.
for position, business_id in enumerate(business_ids):
    G.add_node(business_id,
               node_label=torch.LongTensor([node_label[position]]),
               node_feature=torch.FloatTensor(features[position]),
               node_type="restaurant")

In [22]:
# edges
# Add an undirected "vv" edge for every edge-list row whose two endpoints are
# both nodes of G and whose distance does not exceed the configured maximum.
# The selection is vectorised instead of looping over the rows with
# iterrows(); the same rows are kept, in the same order.
known_nodes = set(G.nodes)
keep_edge = (
    edges_df["id1"].isin(known_nodes)
    & edges_df["id2"].isin(known_nodes)
    & (edges_df["distance"] <= parameters["distance"])
)
nearby_edges = edges_df.loc[keep_edge, ["id1", "id2"]]

G.add_edges_from(zip(nearby_edges["id1"], nearby_edges["id2"]), edge_type="vv")

### Summary

In [23]:
# Summarise the graph. The feature count is read from the first node only,
# without materialising the full node list, and is reported as 0 for an empty
# graph instead of raising IndexError.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

first_node = next(iter(G.nodes(data=True)), None)  # (node id, attribute dict) or None
num_features = 0 if first_node is None else len(first_node[1]['node_feature'])
print(f"Number of features: {num_features}")

Number of nodes: 29963
Number of edges: 491371
Number of features: 532


### Write graph to disk

In [24]:
# nx.write_gpickle was removed in NetworkX 3.0. Pickling the graph directly
# is the documented replacement and also works on older NetworkX versions.
with open(parameters["filename"] + ".gpickle", "wb") as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)

In [25]:
# Store the run configuration next to the graph so the build can be traced.
params_path = parameters["filename"] + ".json"
with open(params_path, "w") as params_file:
    json.dump(parameters, params_file)