# Analysis of Node Degree of Restaurant Graph

### Preparing the Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the 2017-2018 restaurant table; the path is relative to this notebook's folder.
restaurants_df = pd.read_csv("./../datasets/2017-2018_restaurants.csv")

In [3]:
# Columns that the node-degree analysis never reads; dropping them keeps the
# previews narrow.
unused_columns = [
    "latitude", "longitude", "is_open", "attributes", "categories", "hours",
    "first_date", "last_date", "is_open_year_after",
]
restaurants_df = restaurants_df.drop(columns=unused_columns)
restaurants_df.head(2)

Unnamed: 0,business_id,city,state,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,visit_count
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,79,49,3.714286,3.5,2.827977,4,132
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,181,24,3.5,3.5,2.734268,4,209


In [4]:
# Keep only restaurants that have a star rating. The explicit .copy() makes the
# result an independent frame, so the columns added to it in later cells do not
# trigger pandas' SettingWithCopyWarning.
restaurants_df = restaurants_df[restaurants_df['stars'].notna()].copy()

In [5]:
# Load the restaurant-to-restaurant edge list (columns id1, id2, distance) and preview it.
edges_df = pd.read_csv("./../datasets/2017-2018_restaurant-edges.csv")
edges_df.head(3)

Unnamed: 0,id1,id2,distance
0,Q2vefh0tGhtCGQDK1FI7cw,ssK5vKQ_eN0VyGoYKOmkeQ,441
1,Q2vefh0tGhtCGQDK1FI7cw,tSZTPA7uERhWkKq_jbl3Eg,209
2,Q2vefh0tGhtCGQDK1FI7cw,bSy6VVJIdYPza1Bj9_Eicw,450


In [6]:
# Sanity check of both table sizes before the graph is built.
print(f"There are {len(restaurants_df)} restaurants and {len(edges_df)} edges.")

There are 29963 restaurants and 494203 edges.


In [7]:
# Popularity score per restaurant:
#   raw_stars * review_count                      (the restaurant's own reviews)
#   + mean(raw_stars) * (tip_count + checkin_count)
# Tips and check-ins carry no rating of their own, so each one is counted at
# the data-set-wide mean star value.
restaurants_df["popularity_value"] = (
    restaurants_df["raw_stars"] * restaurants_df["review_count"]
    + restaurants_df["raw_stars"].mean()
    * (restaurants_df["tip_count"] + restaurants_df["checkin_count"])
)

In [8]:
# Cut-offs of the popularity score at the 33rd, 50th and 67th percentiles.
# `bottom` and `top` are the class boundaries read by classify_popular().
bottom, median, top = restaurants_df["popularity_value"].quantile([0.33, 0.5, 0.67])

In [9]:
def classify_popular(score):
    """Map a popularity score to a class: 0 (low), 1 (middle) or 2 (high).

    The boundaries are the module-level thresholds `bottom` and `top`; a score
    equal to a threshold falls into the higher class. A score that compares
    false against both thresholds (for example NaN) is classed as 0.
    """
    return 2 if score >= top else (1 if score >= bottom else 0)

In [10]:
# Turn each popularity score into its 0/1/2 class.
restaurants_df["popularity"] = restaurants_df["popularity_value"].map(classify_popular)

### Creating the Graph

In [11]:
import networkx as nx

In [12]:
# Undirected graph; restaurants become nodes and rows of edges_df become edges.
G = nx.Graph()

In [13]:
# rest maps business_id -> integer node id; current is the next unused node id.
rest = {}
current = 0

In [14]:
# Register every restaurant as a graph node. Node ids are consecutive integers
# taken from `current`; `rest` records business_id -> node id so that edges can
# be resolved later, and the popularity class is stored on the node as
# `node_label`. Zipping the two columns avoids the per-row Series that
# iterrows() would build.
# NOTE: `current` and `rest` carry over between runs, so re-run the cell that
# initialises them (and G) before re-running this one, or every restaurant is
# added a second time under a new id.
for business_id, node_label in zip(restaurants_df["business_id"], restaurants_df["popularity"]):
    rest[business_id] = current
    G.add_node(current, node_label=node_label)
    current += 1

In [15]:
# Add one undirected edge per row of edges_df whose two endpoints are both
# known restaurants (present in `rest`).
skipped_edges = 0  # rows ignored because an endpoint has no node
for id1, id2 in zip(edges_df["id1"], edges_df["id2"]):
    if id1 not in rest or id2 not in rest:
        # Skip the row entirely. Falling through to add_edge here would reuse
        # node ids left over from an earlier row and insert a wrong edge.
        skipped_edges += 1
        continue
    G.add_edge(rest[id1], rest[id2])

In [16]:
# Graph size after loading; parallel rows in edges_df collapse into one edge in nx.Graph.
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 29963
Number of edges: 491464


In [17]:
# nx.info() was removed in NetworkX 3.0, so the same summary text is assembled
# by hand; this works on both old and new NetworkX versions.
n_nodes, n_edges = G.number_of_nodes(), G.number_of_edges()
graph_summary = (
    f"Name: {G.name}\n"
    f"Type: {type(G).__name__}\n"
    f"Number of nodes: {n_nodes}\n"
    f"Number of edges: {n_edges}"
)
if n_nodes:
    # Every edge contributes 2 to the degree sum, hence 2 * edges / nodes.
    graph_summary += f"\nAverage degree: {2 * n_edges / n_nodes:8.4f}"
graph_summary

'Name: \nType: Graph\nNumber of nodes: 29963\nNumber of edges: 491464\nAverage degree:  32.8047'

### Adding Node Degree

In [18]:
# Node id -> degree (number of incident edges) for every node in G.
degrees = dict(G.degree())

In [19]:
def get_node_degree(business_id):
    """Return the graph degree of the restaurant with the given business id.

    The id is first translated to its integer node id through the module-level
    `rest` mapping and then looked up in the module-level `degrees` mapping.
    Raises KeyError if the business id has no node.
    """
    return degrees[rest[business_id]]

In [20]:
# Attach each restaurant's graph degree as a column.
restaurants_df["node_degree"] = restaurants_df["business_id"].map(get_node_degree)

In [21]:
# Preview the table with the new popularity_value, popularity and node_degree columns.
restaurants_df.head()

Unnamed: 0,business_id,city,state,checkin_count,review_count,raw_stars,stars,weighted_stars,tip_count,visit_count,popularity_value,popularity,node_degree
0,6iYb2HFDywm3zjuRg0shjw,Boulder,CO,79,49,3.714286,3.5,2.827977,4,132,475.040397,2,73
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,OR,181,24,3.5,3.5,2.734268,4,209,737.162331,2,33
2,D4JtQNTI4X3KcbzacDJsMw,Vancouver,BC,42,28,3.678571,3.5,3.284146,3,73,261.877324,1,40
3,ufCxltuh56FF4-ZFZ6cVhg,Orlando,FL,42,38,4.631579,4.5,3.75586,5,85,341.938538,1,38
4,dmbbf3AqeG61_QHRZi1M1w,Pine Castle,FL,4,3,3.0,3.0,1.789357,1,8,26.653036,0,0


### Geographical Analysis

In [22]:
# One frame per metropolitan area, selected by state/province code.
# Portland spans two states (WA and OR).
state = restaurants_df["state"]
boston_df = restaurants_df[state.eq("MA")]
vancouver_df = restaurants_df[state.eq("BC")]
orlando_df = restaurants_df[state.eq("FL")]
austin_df = restaurants_df[state.eq("TX")]
portland_df = restaurants_df[state.eq("WA") | state.eq("OR")]
atlanta_df = restaurants_df[state.eq("GA")]
colombus_df = restaurants_df[state.eq("OH")]
boulder_df = restaurants_df[state.eq("CO")]

# Each label carries its own trailing tabs so the counts line up in one column.
area_sizes = [
    ("Boston (MA)\t\t", boston_df),
    ("Vancouver (BC)\t", vancouver_df),
    ("Orlando (FL)\t\t", orlando_df),
    ("Austin (TX)\t\t", austin_df),
    ("Portland (WA/OR)\t", portland_df),
    ("Atlanta (GA)\t\t", atlanta_df),
    ("Colombus (OH)\t", colombus_df),
    ("Boulder (CO)\t\t", boulder_df),
]
for position, (label, frame) in enumerate(area_sizes, start=1):
    print(f"{position}. {label}{len(frame)}")

1. Boston (MA)		6192
2. Vancouver (BC)	3953
3. Orlando (FL)		4781
4. Austin (TX)		3213
5. Portland (WA/OR)	5089
6. Atlanta (GA)		3670
7. Colombus (OH)	2548
8. Boulder (CO)		517


In [29]:
# Mean node degree of the restaurants in each area, in the fixed area order
# Boston, Vancouver, Orlando, Austin, Portland, Atlanta, Colombus, Boulder.
(boston_node_deg, vancouver_node_deg, orlando_node_deg, austin_node_deg,
 portland_node_deg, atlanta_node_deg, colombus_node_deg, boulder_node_deg) = (
    frame["node_degree"].mean()
    for frame in (boston_df, vancouver_df, orlando_df, austin_df,
                  portland_df, atlanta_df, colombus_df, boulder_df)
)

# Each label carries its own trailing tabs so the values line up in one column.
area_degree_rows = [
    ("Boston (MA) node degree:\t\t", boston_node_deg),
    ("Vancouver (BC) node degree:\t\t", vancouver_node_deg),
    ("Orlando (FL) node degree:\t\t", orlando_node_deg),
    ("Austin (TX) node degree:\t\t", austin_node_deg),
    ("Portland (WA/OR) node degree:\t", portland_node_deg),
    ("Atlanta (GA) node degree:\t\t", atlanta_node_deg),
    ("Colombus (OH) node degree:\t\t", colombus_node_deg),
    ("Boulder (CO) node degree:\t\t", boulder_node_deg),
]
for position, (label, mean_degree) in enumerate(area_degree_rows, start=1):
    print(f"{position}. {label}{round(mean_degree, 2)}")

1. Boston (MA) node degree:		42.16
2. Vancouver (BC) node degree:		55.99
3. Orlando (FL) node degree:		15.72
4. Austin (TX) node degree:		23.68
5. Portland (WA/OR) node degree:	40.96
6. Atlanta (GA) node degree:		23.56
7. Colombus (OH) node degree:		14.9
8. Boulder (CO) node degree:		31.74


In [33]:
# Mean popularity class (0/1/2) of the restaurants in each area, in the fixed
# area order Boston, Vancouver, Orlando, Austin, Portland, Atlanta, Colombus,
# Boulder.
(boston_pop, vancouver_pop, orlando_pop, austin_pop,
 portland_pop, atlanta_pop, colombus_pop, boulder_pop) = (
    frame["popularity"].mean()
    for frame in (boston_df, vancouver_df, orlando_df, austin_df,
                  portland_df, atlanta_df, colombus_df, boulder_df)
)

# Each label carries its own trailing tabs so the values line up in one column.
area_popularity_rows = [
    ("Boston (MA) popularity:\t\t", boston_pop),
    ("Vancouver (BC) popularity:\t\t", vancouver_pop),
    ("Orlando (FL) popularity:\t\t", orlando_pop),
    ("Austin (TX) popularity:\t\t", austin_pop),
    ("Portland (WA/OR) popularity:\t\t", portland_pop),
    ("Atlanta (GA) popularity:\t\t", atlanta_pop),
    ("Colombus (OH) popularity:\t\t", colombus_pop),
    ("Boulder (CO) popularity:\t\t", boulder_pop),
]
for position, (label, mean_popularity) in enumerate(area_popularity_rows, start=1):
    print(f"{position}. {label}{round(mean_popularity, 2)}")

1. Boston (MA) popularity:		0.93
2. Vancouver (BC) popularity:		0.76
3. Orlando (FL) popularity:		1.03
4. Austin (TX) popularity:		1.2
5. Portland (WA/OR) popularity:		1.15
6. Atlanta (GA) popularity:		1.09
7. Colombus (OH) popularity:		0.81
8. Boulder (CO) popularity:		1.04


In [45]:
def get_correlation(popularities, degrees):
    """Pair up two equally ordered sequences in a two-column DataFrame.

    Despite its name, this function does not compute a correlation; call
    `.corr()` on the returned frame to get one.

    Parameters
    ----------
    popularities : sequence
        Values for the 'popularity' column; its length sets the row count.
    degrees : sequence
        Values for the 'node_degree' column, indexed by the same positions.
        (Inside this function the name shadows the module-level `degrees`.)

    Returns
    -------
    pandas.DataFrame
        Columns 'popularity' and 'node_degree', one row per position.
    """
    rows = [[popularities[pos], degrees[pos]] for pos in range(len(popularities))]
    return pd.DataFrame(rows, columns=['popularity', 'node_degree'])

In [46]:
# Area-level table: one row per area, pairing its mean popularity class with
# its mean node degree. Both lists must list the areas in the same order.
pops = [boston_pop, vancouver_pop, orlando_pop, austin_pop, portland_pop, atlanta_pop, colombus_pop, boulder_pop]
degs = [boston_node_deg, vancouver_node_deg, orlando_node_deg, austin_node_deg, portland_node_deg, atlanta_node_deg,
       colombus_node_deg, boulder_node_deg]
df = get_correlation(pops, degs)

In [48]:
# Correlation matrix (pandas' default method) across the eight area-level rows.
df[["node_degree", "popularity"]].corr()

Unnamed: 0,node_degree,popularity
node_degree,1.0,-0.31575
popularity,-0.31575,1.0


In [49]:
# Same area-level comparison, but each area's mean node degree is divided by
# the number of restaurants in that area before it is paired with popularity.
pops = [boston_pop, vancouver_pop, orlando_pop, austin_pop,
        portland_pop, atlanta_pop, colombus_pop, boulder_pop]
area_mean_degrees = [boston_node_deg, vancouver_node_deg, orlando_node_deg, austin_node_deg,
                     portland_node_deg, atlanta_node_deg, colombus_node_deg, boulder_node_deg]
area_frames = [boston_df, vancouver_df, orlando_df, austin_df,
               portland_df, atlanta_df, colombus_df, boulder_df]
degs = [mean_degree / len(frame) for mean_degree, frame in zip(area_mean_degrees, area_frames)]
df = get_correlation(pops, degs)

In [50]:
# Correlation matrix (pandas' default method) for the size-normalised degrees.
df[["node_degree", "popularity"]].corr()

Unnamed: 0,node_degree,popularity
node_degree,1.0,0.025239
popularity,0.025239,1.0


### Popularity Analysis

In [52]:
# Split the restaurants by popularity class: 0 = bottom, 1 = middle, 2 = top.
bottom_df, middle_df, top_df = (
    restaurants_df[restaurants_df["popularity"] == level] for level in (0, 1, 2)
)

In [53]:
# Mean node degree within each popularity class.
bottom_node_deg, middle_node_deg, top_node_deg = (
    frame["node_degree"].mean() for frame in (bottom_df, middle_df, top_df)
)

class_degree_rows = [
    ("Bottom", bottom_node_deg),
    ("Middle", middle_node_deg),
    ("Top", top_node_deg),
]
for position, (label, mean_degree) in enumerate(class_degree_rows, start=1):
    print(f"{position}. {label} node degree:\t\t{round(mean_degree, 2)}")

1. Bottom node degree:		24.32
2. Middle node degree:		30.87
3. Top node degree:		43.28
