# Generating Restaurants Network
Program to generate CSV file with edges between restaurants who are less than DIST_MAX meters away from each other.

In [1]:
import json
import pandas as pd
import numpy as np
import math
import time

### Reading the data

In [2]:
business_df = pd.read_json('./yelp/yelp_academic_dataset_business.json', lines=True)

In [3]:
business_df.head(2)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,921 Pearl St,"{'RestaurantsTableService': 'True', 'WiFi': 'u...",6iYb2HFDywm3zjuRg0shjw,"Gastropubs, Food, Beer Gardens, Restaurants, B...",Boulder,"{'Monday': '11:0-23:0', 'Tuesday': '11:0-23:0'...",1,40.017544,-105.283348,Oskar Blues Taproom,80302,86,4.0,CO
1,7000 NE Airport Way,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",tCbdrRPZA0oiIYSmHG3J0w,"Salad, Soup, Sandwiches, Delis, Restaurants, C...",Portland,"{'Monday': '5:0-18:0', 'Tuesday': '5:0-17:0', ...",1,45.588906,-122.593331,Flying Elephants at PDX,97218,126,4.0,OR


### Cleaning the data

In [4]:
business_df = business_df.dropna()

In [5]:
restaurants_df = business_df.loc[business_df["categories"].str.contains("Restaurants")]

In [6]:
restaurants_df = restaurants_df.drop(columns=['address', 'categories', 'name', 'review_count', 'stars', 'attributes', 'hours'])

In [7]:
restaurants_df.head(10)

Unnamed: 0,business_id,city,is_open,latitude,longitude,postal_code,state
0,6iYb2HFDywm3zjuRg0shjw,Boulder,1,40.017544,-105.283348,80302,CO
1,tCbdrRPZA0oiIYSmHG3J0w,Portland,1,45.588906,-122.593331,97218,OR
5,D4JtQNTI4X3KcbzacDJsMw,Vancouver,1,49.251342,-123.101333,V5V,BC
12,HPA_qyMEddpAEtFof02ixg,Peabody,1,42.541155,-70.973438,01960,MA
13,ufCxltuh56FF4-ZFZ6cVhg,Orlando,1,28.513265,-81.374707,32806,FL
16,GfWJ19Js7wX9rwaHQ7KbGw,Orlando,1,28.350498,-81.542819,32830,FL
22,ynTjh_FdhbG5hY69HsEoaA,Orlando,0,28.381945,-81.510327,32836,FL
26,hcRxdDg7DYryCxCoI8ySQA,Boston,1,42.338544,-71.106842,02215,MA
29,jGennaZUr2MsJyRhijNBfA,Boston,1,42.363442,-71.025781,02128,MA
33,QciD6FbNklaJvUbLExD4Nw,Burnaby,1,49.254436,-123.02005,V5G 1G4,BC


In [8]:
# Remove outliers
restaurants_df = restaurants_df[~restaurants_df["state"].isin(["WY", "VA", "NH", "MN", "KY", "KS", "ABE"])]
restaurants_df = restaurants_df[restaurants_df["longitude"] < 0]
len(restaurants_df)

42638

In [9]:
cities = restaurants_df.groupby("city").size()
unique_cities = cities[cities < 20].keys()
restaurants_df = restaurants_df[~restaurants_df["city"].isin(unique_cities)]

In [10]:
len(restaurants_df)

41664

### Analyzing distribution of locations

In [11]:
restaurants_df.groupby("state").size().sort_values(ascending=False)

state
MA    8367
FL    6413
OR    6267
BC    6059
GA    5049
TX    4544
OH    3643
CO     685
WA     637
dtype: int64

In [12]:
# Can clearly observe the 8 metropolitan areas
restaurants_df.plot.scatter(x="latitude", y="longitude")

<matplotlib.axes._subplots.AxesSubplot at 0x1ac45ab7f60>

In [13]:
restaurants_df.groupby("city").size().sort_values(ascending=False)[:10]

city
Portland     4877
Austin       4211
Vancouver    4174
Atlanta      3479
Orlando      3215
Boston       2298
Columbus     2249
Richmond      694
Cambridge     664
Kissimmee     581
dtype: int64

### Get Dataframes for each Metropolitan Areas

In [14]:
boston_df = restaurants_df[restaurants_df["state"] == "MA"]
vancouver_df = restaurants_df[restaurants_df["state"] == "BC"]
orlando_df = restaurants_df[restaurants_df["state"] == "FL"]
austin_df = restaurants_df[restaurants_df["state"] == "TX"]
portland_df = restaurants_df[restaurants_df["state"].isin(["WA", "OR"])]
atlanta_df = restaurants_df[restaurants_df["state"] == "GA"]
colombus_df = restaurants_df[restaurants_df["state"] == "OH"]
boulder_df = restaurants_df[restaurants_df["state"] == "CO"]

print(f"1. Boston (MA)\t\t{len(boston_df)}")
print(f"2. Vancouver (BC)\t{len(vancouver_df)}")
print(f"3. Orlando (FL)\t\t{len(orlando_df)}")
print(f"4. Austin (TX)\t\t{len(austin_df)}")
print(f"5. Portland (WA/OR)\t{len(portland_df)}")
print(f"6. Atlanta (GA)\t\t{len(atlanta_df)}")
print(f"7. Colombus (OH)\t{len(colombus_df)}")
print(f"8. Boulder (CO)\t\t{len(boulder_df)}")

1. Boston (MA)		8367
2. Vancouver (BC)	6059
3. Orlando (FL)		6413
4. Austin (TX)		4544
5. Portland (WA/OR)	6904
6. Atlanta (GA)		5049
7. Colombus (OH)	3643
8. Boulder (CO)		685


In [15]:
boulder_df.head()

Unnamed: 0,business_id,city,is_open,latitude,longitude,postal_code,state
0,6iYb2HFDywm3zjuRg0shjw,Boulder,1,40.017544,-105.283348,80302,CO
52,Of6xu3pY3eHe2yhiyz2dvg,Boulder,1,40.033678,-105.259103,80301,CO
158,_8Hejg5Q-_izIhLvq2ocnw,Boulder,1,40.070694,-105.200376,80301,CO
339,hfoHYhD4uzqsHQDxKMFnQw,Boulder,0,40.008353,-105.2766,80302,CO
360,Ziix5uzW9hJu5nYDxCmm1Q,Boulder,0,40.017152,-105.255516,80301,CO


### Generate Edges

#### Idea:
- Calculate distance between every location within each metropolitan area (by state)
- Use haversine distance

In [16]:
DIST_MAX = 500

In [17]:
edges = [] # each edge is a tuple (id1, id2) representing the business IDs

In [18]:
metropolitan_areas = [boston_df, vancouver_df, orlando_df, austin_df, portland_df, atlanta_df, colombus_df, boulder_df]

In [19]:
def haversine_distance(lat1, lon1, lat2, lon2):
   r = 6371
   phi1 = np.radians(lat1)
   phi2 = np.radians(lat2)
   delta_phi = np.radians(lat2 - lat1)
   delta_lambda = np.radians(lon2 - lon1)
   a = np.sin(delta_phi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2)**2
   res = r * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
   return np.round(res*1000, 2)

In [20]:
for area_df in metropolitan_areas:
    start_time = time.time()
    print("Processing...")
    
    for index1, row1 in boulder_df.iterrows():
        for index2, row2 in boulder_df[boulder_df.index > index1].iterrows():
            #print(f"{row1[0]}  -  {row2[0]}: {haversine_distance(row1[3], row1[4], row2[3], row2[4])}")
            if haversine_distance(row1[3], row1[4], row2[3], row2[4]) < DIST_MAX:
                edges.append((row1[0], row2[0]))
    
    print(f"Finished area in {time.time() - start_time}")

Processing...
Finished area in 32.36449980735779
Processing...
Finished area in 31.834882020950317
Processing...
Finished area in 32.69323539733887
Processing...
Finished area in 32.51703977584839
Processing...
Finished area in 32.29214429855347
Processing...
Finished area in 33.65261626243591
Processing...
Finished area in 31.943434953689575
Processing...
Finished area in 32.33773374557495


In [21]:
len(edges)

128152

In [24]:
edges_df = pd.DataFrame(edges, columns=['id1', 'id2'])

In [25]:
edges_df.head()

Unnamed: 0,id1,id2
0,6iYb2HFDywm3zjuRg0shjw,Rbt9i4IDFiIBsau020X_xQ
1,6iYb2HFDywm3zjuRg0shjw,DX6G8Vdu9wUx95Tzh6gEwA
2,6iYb2HFDywm3zjuRg0shjw,8zehGz9jnxPqXtOc7KaJxA
3,6iYb2HFDywm3zjuRg0shjw,PYUuu9y8oJFweobxs0CUog
4,6iYb2HFDywm3zjuRg0shjw,EWOt6ZUkPC12D2E34VWnPQ


In [27]:
edges_df.to_csv("./restaurants_edges.csv", index=True)