## Generate Network Graph for Restaurants

### Read Data

In [1]:
!python --version

Python 3.7.0


In [43]:
import pandas as pd
import numpy as np
import torch

In [44]:
#from google.colab import drive
#drive.mount('/content/drive')

In [45]:
# Colab variant of the load below (reads from a mounted Google Drive path):
#restaurants_df = pd.read_csv("./drive/MyDrive/Colab Notebooks/2017-2018_restaurants.csv")
# Load the restaurant table (one row per business_id) from the local datasets directory.
restaurants_df = pd.read_csv("./../datasets/2017-2018_restaurants.csv")

In [46]:
restaurants_df.head()

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,49,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,24,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,49.251342,-123.101333,1,"{'GoodForKids': 'True', 'Alcohol': ""u'none'"", ...","Restaurants, Thai","{'Monday': '17:0-21:0', 'Tuesday': '17:0-21:0'...",42,28,3.678571,3.5,3.284146,3,2010-09-26 04:03:35,2021-01-23 01:43:50,73,True
3,ufCxltuh56FF4-ZFZ6cVhg,Orlando,FL,28.513265,-81.374707,1,"{'BusinessParking': ""{'garage': False, 'street...","Restaurants, American (New), Bakeries, Dessert...","{'Tuesday': '11:0-18:0', 'Wednesday': '11:0-18...",42,38,4.631579,4.5,3.75586,5,2012-08-19 21:08:57,2020-12-29 16:25:19,85,True
4,dmbbf3AqeG61_QHRZi1M1w,Pine Castle,FL,28.450303,-81.380587,1,"{'BikeParking': 'False', 'RestaurantsPriceRang...","Automotive, American (Traditional), Gas Statio...",,4,3,3.0,3.0,1.789357,1,2015-06-11 16:44:12,2020-11-14 00:49:04,8,True


In [47]:
# Colab variant of the load below (reads from a mounted Google Drive path):
#edges_df = pd.read_csv("./drive/MyDrive/Colab Notebooks/2017-2018_restaurant-edges.csv")
# Load the edge list: each row pairs two business ids (id1, id2) with a distance value.
edges_df = pd.read_csv("./../datasets/2017-2018_restaurant-edges.csv")

In [48]:
edges_df.head()

Unnamed: 0,id1,id2,distance
0,Q2vefh0tGhtCGQDK1FI7cw,ssK5vKQ_eN0VyGoYKOmkeQ,441
1,Q2vefh0tGhtCGQDK1FI7cw,tSZTPA7uERhWkKq_jbl3Eg,209
2,Q2vefh0tGhtCGQDK1FI7cw,bSy6VVJIdYPza1Bj9_Eicw,450
3,Q2vefh0tGhtCGQDK1FI7cw,Z5iQ4LXTbNbYZ5HkpJVWBQ,279
4,Q2vefh0tGhtCGQDK1FI7cw,dYiOhCuOWJRlwOZb1lCBsA,489


In [49]:
print(f"There are {len(restaurants_df)} restaurants and {len(edges_df)} edges.")

There are 30094 restaurants and 494203 edges.


In [50]:
# Popularity score per restaurant:
#   raw_stars * review_count + mean(raw_stars) * (tip_count + checkin_count)
# Reviews are weighted by the restaurant's own rating; tips and check-ins carry
# no rating, so they are weighted by the dataset-wide mean rating instead.
restaurants_df["popularity_value"] = (
    restaurants_df["raw_stars"]
    .mul(restaurants_df["review_count"])
    .add(
        restaurants_df["tip_count"]
        .add(restaurants_df["checkin_count"])
        .mul(restaurants_df["raw_stars"].mean())
    )
)

In [51]:
restaurants_df["popularity_value"].mean()

463.1416345671531

In [52]:
# Quartile cut-offs of the popularity score, used as class boundaries.
# describe() is evaluated once and the three quartiles are unpacked together,
# rather than recomputing the full summary for each threshold.
# `median` is kept for reference; only `bottom` and `top` are used by
# classify_popular.
bottom, median, top = (
    restaurants_df["popularity_value"]
    .describe()
    .loc[["25%", "50%", "75%"]]
    .to_numpy()
)

In [53]:
def classify_popular(score):
    """Map a popularity score to a class label using the quartile cut-offs.

    Reads the module-level thresholds ``bottom`` and ``top``.

    Args:
        score: popularity value to classify.

    Returns:
        2 if ``score >= top``; otherwise 1 if ``score >= bottom``; otherwise 0.
        A score for which both comparisons are false (e.g. NaN) yields 0.
    """
    label = 0
    if score >= bottom:
        label = 1
    # Checked last so that reaching the top threshold always wins.
    if score >= top:
        label = 2
    return label

In [54]:
# Derive the class label (0, 1 or 2) for every restaurant from its popularity score.
# Rows whose score is NaN fall through both comparisons in classify_popular and get 0.
restaurants_df["popularity"] = restaurants_df["popularity_value"].apply(classify_popular)

In [55]:
restaurants_df.head(2)

Unnamed: 0,business_id,city,state,latitude,longitude,is_open,attributes,categories,hours,checkin_count,...,raw_stars,stars,weighted_stars,tip_count,first_date,last_date,visit_count,is_open_year_after,popularity_value,popularity
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,40.017544,-105.283348,1,"{'RestaurantsTableService': 'True', 'WiFi': ""u...","Gastropubs, Food, Beer Gardens, Restaurants, B...","{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",79,...,3.714286,3.5,2.827977,4,2017-09-09 04:42:34,2021-01-22 05:20:38,132,True,475.040397,1
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,45.588906,-122.593331,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Salad, Soup, Sandwiches, Delis, Restaurants, C...","{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",181,...,3.5,3.5,2.734268,4,2010-03-09 16:02:04,2021-01-21 17:55:35,209,True,737.162331,2


In [65]:
restaurants_df["stars"].isna().sum()

131

In [66]:
# Keep only restaurants that have a star rating; "stars" is used as a node feature.
restaurants_df = restaurants_df.dropna(subset=["stars"])

### Create NetworkX Graph

In [67]:
import networkx as nx

In [68]:
G = nx.Graph()

In [69]:
# business_id -> integer node id used in the graph.
rest = {}
# Next integer node id to hand out.
current = 0

In [70]:
# Add one node per restaurant, keyed by a running integer id, and remember
# which id each business_id received so edges can be resolved afterwards.
for _idx, restaurant in restaurants_df.iterrows():
    node_id = restaurant["business_id"]
    node_label = restaurant["popularity"]
    features = [restaurant["stars"], restaurant["is_open"]]

    rest[node_id] = current
    G.add_node(
        current,
        node_label=node_label,
        node_feature=torch.FloatTensor(features),
    )
    current += 1

In [72]:
# Add an undirected edge for every row of the edge list whose two endpoints
# are both present in the graph.
#
# Restaurants may be missing from `rest` (rows without a star rating were
# dropped before the nodes were created). Such edges must be skipped outright:
# previously the KeyError was swallowed and add_edge still ran with node1/node2
# left over from an earlier iteration, inserting spurious edges (or raising
# NameError if the very first row had an unknown endpoint).
for id1, id2 in zip(edges_df["id1"], edges_df["id2"]):
    if id1 not in rest or id2 not in rest:
        continue

    G.add_edge(rest[id1], rest[id2])

In [73]:
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 29963
Number of edges: 491464


In [63]:
average_coeff = nx.average_clustering(G)
print(f"The average clustering coefficient of the whole graph is: {average_coeff}")

The average clustering coefficient of the whole graph is: 0.782549169417619


In [22]:
nx.info(G)

'Name: \nType: Graph\nNumber of nodes: 30094\nNumber of edges: 494203\nAverage degree:  32.8440'

In [74]:
# Serialise the graph to "restaurants.gpickle" in the current working directory.
# NOTE(review): nx.write_gpickle was deprecated in NetworkX 2.6 and removed in 3.0;
# confirm the installed NetworkX version, or switch to pickle.dump when upgrading.
nx.write_gpickle(G, "restaurants.gpickle")

In [31]:
#from google.colab import files
#files.download("restaurants.gpickle")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>