## Generate Network Graph for Restaurants

### Read Data

In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
restaurants_df = pd.read_csv("./../datasets/2017-2018_restaurants.csv")

In [3]:
restaurants_df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,28,3.678571,3.5,3.284146,3,2010-09-26 04:03:35,2021-01-23 01:43:50,73,True
3,ufCxltuh56FF4-ZFZ6cVhg,Orlando,FL,28.513265,-81.374707,1,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",42,38,4.631579,4.5,3.75586,5,2012-08-19 21:08:57,2020-12-29 16:25:19,85,True
4,dmbbf3AqeG61_QHRZi1M1w,Pine Castle,FL,28.450303,-81.380587,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Automotive, American (Traditional), Gas Statio...",,4,3,3.0,3.0,1.789357,1,2015-06-11 16:44:12,2020-11-14 00:49:04,8,True


In [4]:
#edges_df = pd.read_csv("./drive/MyDrive/Colab Notebooks/2017-2018_restaurant-edges.csv")
edges_df = pd.read_csv("./../datasets/2017-2018_restaurant-edges.csv")

In [5]:
edges_df.head()

Unnamed: 0,id1,id2,distance
0,Q2vefh0tGhtCGQDK1FI7cw,ssK5vKQ_eN0VyGoYKOmkeQ,441
1,Q2vefh0tGhtCGQDK1FI7cw,tSZTPA7uERhWkKq_jbl3Eg,209
2,Q2vefh0tGhtCGQDK1FI7cw,bSy6VVJIdYPza1Bj9_Eicw,450
3,Q2vefh0tGhtCGQDK1FI7cw,Z5iQ4LXTbNbYZ5HkpJVWBQ,279
4,Q2vefh0tGhtCGQDK1FI7cw,dYiOhCuOWJRlwOZb1lCBsA,489


In [6]:
print(f"There are {len(restaurants_df)} restaurants and {len(edges_df)} edges.")

There are 30094 restaurants and 494203 edges.


In [7]:
restaurants_df["popularity_value"] = restaurants_df["raw_stars"] * restaurants_df["review_count"] + restaurants_df["raw_stars"].mean() * (restaurants_df["tip_count"] + restaurants_df["checkin_count"])

In [8]:
restaurants_df["popularity_value"].mean()

463.1416345671531

In [9]:
bottom = restaurants_df["popularity_value"].quantile(0.33)
median = restaurants_df["popularity_value"].describe()["50%"]
top = restaurants_df["popularity_value"].quantile(0.67)

print(f"bottom: {bottom}, top: {top}")

bottom: 116.55092953236272, top: 375.5915742046034


In [10]:
def classify_popular(score):
    if score >= top:
        return 2
    if score >= bottom:
        return 1
    return 0

In [11]:
restaurants_df["popularity"] = restaurants_df["popularity_value"].apply(classify_popular)

In [12]:
restaurants_df.head(5)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,...,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after,popularity_value,popularity
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,...,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True,475.040397,2
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,...,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True,737.162331,2
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,...,3.678571,3.5,3.284146,3,2010-09-26 04:03:35,2021-01-23 01:43:50,73,True,261.877324,1
3,ufCxltuh56FF4-ZFZ6cVhg,Orlando,FL,28.513265,-81.374707,1,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",42,...,4.631579,4.5,3.75586,5,2012-08-19 21:08:57,2020-12-29 16:25:19,85,True,341.938538,1
4,dmbbf3AqeG61_QHRZi1M1w,Pine Castle,FL,28.450303,-81.380587,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Automotive, American (Traditional), Gas Statio...",,4,...,3.0,3.0,1.789357,1,2015-06-11 16:44:12,2020-11-14 00:49:04,8,True,26.653036,0


In [13]:
restaurants_df["stars"].isna().sum()

131

In [14]:
restaurants_df = restaurants_df[restaurants_df['stars'].notna()]

### Extracting Attributes

Used node features: attributes, hours, category

In [15]:
all_business_df = pd.read_json('../yelp/yelp_academic_dataset_business.json', lines=True)
all_business_df = all_business_df[["business_id", "attributes", "hours"]]

In [16]:
all_business_df.head(3)

Unnamed: 0,business_id,attributes,hours
0,6iYb2HFDywm3zjuRg0shjw,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'..."
1,tCbdrRPZA0oiIYSmHG3J0w,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ..."
2,bvN78flM8NLprQ1a1y5dRg,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0..."


In [17]:
attributes = list(all_business_df["attributes"][:3])
attributes

[{'RestaurantsTableService': 'True',
  'WiFi': "u'free'",
  'BikeParking': 'True',
  'BusinessParking': "{'garage': False, 'street': True, 'validated': False, 'lot': False, 'valet': False}",
  'BusinessAcceptsCreditCards': 'True',
  'RestaurantsReservations': 'False',
  'WheelchairAccessible': 'True',
  'Caters': 'True',
  'OutdoorSeating': 'True',
  'RestaurantsGoodForGroups': 'True',
  'HappyHour': 'True',
  'BusinessAcceptsBitcoin': 'False',
  'RestaurantsPriceRange2': '2',
  'Ambience': "{'touristy': False, 'hipster': False, 'romantic': False, 'divey': False, 'intimate': False, 'trendy': False, 'upscale': False, 'classy': False, 'casual': True}",
  'HasTV': 'True',
  'Alcohol': "'beer_and_wine'",
  'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': False, 'dinner': False, 'brunch': False, 'breakfast': False}",
  'DogsAllowed': 'False',
  'RestaurantsTakeOut': 'True',
  'NoiseLevel': "u'average'",
  'RestaurantsAttire': "'casual'",
  'RestaurantsDelivery': 'None'},
 {'R

In [49]:
def get_price(row):
    d = row[1]
    try:
        if "RestaurantsPriceRange2" in d:
            return int(d["RestaurantsPriceRange2"])
        return 1
    except:
        return 1
    
def get_outdoor(row):
    d = row[1]
    try:
        if "OutdoorSeating" in d:
            if d["OutdoorSeating"]:
                return 1
        return 0
    except:
        return 0
    
def get_good4groups(row):
    d = row[1]
    try:
        if "RestaurantsGoodForGroups" in d:
            if d["RestaurantsGoodForGroups"]:
                return 1
        return 0
    except:
        return 0
    
def get_creditcard(row):
    d = row[1]
    try:
        if "BusinessAcceptsCreditCards" in d:
            if d["BusinessAcceptsCreditCards"]:
                return 1
        return 0
    except:
        return 0
    
def get_good4kids(row):
    d = row[1]
    try:
        if "GoodForKids" in d:
            if d["GoodForKids"]:
                return 1
        return 0
    except:
        return 0
    
def get_delivery(row):
    d = row[1]
    try:
        if "RestaurantsDelivery" in d:
            if d["RestaurantsDelivery"]:
                return 1
        return 0
    except:
        return 0
    
def get_caters(row):
    d = row[1]
    try:
        if "Caters" in d:
            if d["Caters"]:
                return 1
        return 0
    except:
        return 0

In [50]:
all_business_df["price_tier"] = all_business_df.apply(get_price, axis=1)
all_business_df["outdoors"] = all_business_df.apply(get_outdoor, axis=1)
all_business_df["good_for_groups"] = all_business_df.apply(get_good4groups, axis=1)
all_business_df["has_credit_card"] = all_business_df.apply(get_creditcard, axis=1)
all_business_df["good_for_kids"] = all_business_df.apply(get_good4kids, axis=1)
all_business_df["has_delivery"] = all_business_df.apply(get_delivery, axis=1)
all_business_df["caters"] = all_business_df.apply(get_caters, axis=1)

In [46]:
all_business_df.head()

Unnamed: 0,business_id,attributes,hours,price_tier,outdoors,good_for_groups,has_credit_card,good_for_kids,has_delivery,caters
0,6iYb2HFDywm3zjuRg0shjw,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",2,1,1,1,0,1,1
1,tCbdrRPZA0oiIYSmHG3J0w,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",2,1,1,1,1,1,1
2,bvN78flM8NLprQ1a1y5dRg,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","{'Thursday': '11:0-18:0', 'Friday': '11:0-18:0...",2,0,0,1,0,0,0
3,oaepsyvc0J17qwi8cfrOWg,"{'RestaurantsPriceRange2': '1', 'BusinessAccep...",,1,0,0,1,1,0,0
4,PE9uqAjdw0E4-8mjGl3wVA,"{'GoodForKids': 'False', 'BusinessParking': '{...","{'Monday': '16:0-19:0', 'Tuesday': '16:0-19:0'...",1,0,0,1,1,0,0


### Extracting Open Hours

In [20]:
# todo

### Extracting Categories

In [21]:
# Finding most popular categories
categories = {}

for index, row in restaurants_df.iterrows():
    cats = [cat.lower().strip() for cat in row["categories"].split(",")]
    for c in cats:
        if c in categories.keys():
            categories[c] += 1
        else:
            categories[c] = 1

In [22]:
categories_list = list(categories.items())

In [23]:
from operator import itemgetter
categories_list = sorted(categories_list, key=itemgetter(1), reverse=True)

In [24]:
top_categories = [name for (name, count) in categories_list[:50]]
top_categories

['restaurants',
 'food',
 'nightlife',
 'bars',
 'american (traditional)',
 'coffee & tea',
 'sandwiches',
 'breakfast & brunch',
 'american (new)',
 'pizza',
 'fast food',
 'burgers',
 'mexican',
 'chinese',
 'italian',
 'specialty food',
 'seafood',
 'event planning & services',
 'salad',
 'desserts',
 'bakeries',
 'japanese',
 'cafes',
 'grocery',
 'beer',
 'wine & spirits',
 'shopping',
 'sushi bars',
 'caterers',
 'ice cream & frozen yogurt',
 'chicken wings',
 'asian fusion',
 'food trucks',
 'cocktail bars',
 'pubs',
 'delis',
 'sports bars',
 'vegetarian',
 'juice bars & smoothies',
 'mediterranean',
 'barbeque',
 'thai',
 'diners',
 'steakhouses',
 'arts & entertainment',
 'wine bars',
 'soup',
 'tex-mex',
 'gluten-free',
 'vegan']

In [25]:
def get_categories_tensor(cats):
    categories = cats.lower()
    l = []
    for top_cat in top_categories:
        if top_cat in categories:
            l.append(1)
        else:
            l.append(0)
    return l

### Merging

In [26]:
restaurants_df = restaurants_df.merge(all_business_df, on="business_id")

In [27]:
print(f"There are {len(restaurants_df)} restaurants.")

There are 29963 restaurants.


In [28]:
restaurants_df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes_x,categories,hours_x,checkin_count,...,popularity,attributes_y,hours_y,price_tier,outdoors,good_for_groups,has_credit_card,good_for_kids,has_delivery,caters
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,...,2,"{'RestaurantsTableService': 'True', 'WiFi': 'u...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",2,1,1,1,0,1,1
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,...,2,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",2,1,1,1,1,1,1
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,...,1,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",2,1,1,0,1,1,1
3,ufCxltuh56FF4-ZFZ6cVhg,Orlando,FL,28.513265,-81.374707,1,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",42,...,1,"{'BusinessParking': '{'garage': False, 'street...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",1,0,0,1,0,1,1
4,dmbbf3AqeG61_QHRZi1M1w,Pine Castle,FL,28.450303,-81.380587,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Automotive, American (Traditional), Gas Statio...",,4,...,0,"{'BikeParking': 'False', 'RestaurantsPriceRang...",,1,1,1,1,1,1,0


### Get just one city

In [29]:
restaurants_df = restaurants_df[restaurants_df["state"] == "MA"]

### Create NetworkX Graph

In [30]:
import networkx as nx

In [31]:
G = nx.Graph()

In [32]:
rest = {}
current = 0

In [33]:
for index, row in restaurants_df.iterrows():
    rest[row["business_id"]] = current
    
    node_label = row["popularity"]
    features = [row["price_tier"], row["outdoors"], row["good_for_groups"], row["has_credit_card"], 
                row["good_for_kids"], row["has_delivery"], row["caters"]]
    
    features += get_categories_tensor(row["categories"])
    
    G.add_node(current, node_label=node_label, node_feature=features)
    
    current += 1

In [34]:
for index, row in edges_df.iterrows():
    try:
        node1 = rest[row["id1"]]
        node2 = rest[row["id2"]]
        dist = row["distance"]
        
        if dist < 500:
            G.add_edge(node1, node2)
    except KeyError:
        pass

In [35]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 6192
Number of edges: 130352


In [36]:
nx.info(G)

'Name: \nType: Graph\nNumber of nodes: 6192\nNumber of edges: 130352\nAverage degree:  42.1034'

### Node Augmentation

#### Node Degree

In [37]:
degrees_dict = {node:val for (node, val) in G.degree()}
degrees_list = [val for (node, val) in G.degree()]

In [38]:
max_degree = max(degrees_list)

for i in range(0, G.number_of_nodes()):
    deg = degrees_dict[i]
    one_hot_deg = [0] * (max_degree + 1)
    one_hot_deg[deg] = 1
    
    G.nodes[i]["node_feature"] + one_hot_deg

#### Clustering Coefficient

In [39]:
clustering_dict = nx.clustering(G)

In [40]:
clustering_list = [(node, val) for (node, val) in clustering_dict.items()]
clustering_list[:10]

[(0, 0.7973856209150327),
 (1, 0.9515669515669516),
 (2, 1.0),
 (3, 0.8393234672304439),
 (4, 1.0),
 (5, 1.0),
 (6, 0.8827586206896552),
 (7, 1.0),
 (8, 0),
 (9, 1.0)]

In [41]:
for i in range(0, G.number_of_nodes()):
    deg = degrees_dict[i]
    clust = round(clustering_dict[i], 3)
    G.nodes[i]["node_feature"].append(deg)
    G.nodes[i]["node_feature"].append(clust)

### Convert to tensors

In [42]:
for i in range(0, G.number_of_nodes()):
    G.nodes[i]["node_feature"] = torch.FloatTensor(G.nodes[i]["node_feature"])

In [43]:
 G.nodes[0]

{'node_label': 2,
 'node_feature': tensor([ 2.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  1.0000,  1.0000,  0.0000,  0.0000,  1.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,
          0.0000,  1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  1.0000,
          0.0000, 18.0000,  0.7970])}

In [44]:
nx.write_gpickle(G, "restaurants_MA_3.gpickle")

In [45]:
#from google.colab import files
#files.download("restaurants.gpickle")