In [1]:
import pandas as pd
from math import radians, cos, sin, asin, sqrt, log

In [2]:
# Load the cleaned Yelp business table.
bu = pd.read_csv("Data/yelp_business_clean.csv")

# The 'categories' column is read as text; the braces are stripped and the
# text is split on commas to get a list of category strings
# (presumably the CSV stores a set literal such as "{'Food', 'Grocery'}" - verify against the file).
# Each entry is stripped of surrounding whitespace: splitting "'a', 'b'" on ","
# otherwise yields "'a'" and " 'b'", so the same category would compare
# unequal depending on its position in the list.
bu['categories'] = bu['categories'].apply(
    lambda x: [cat.strip() for cat in str(x).strip('{}').split(",")]
)
bu.head()

Unnamed: 0,address,take_out,business_id,categories,latitude,longitude,name,review_count,stars,star_count,dates
0,1012 State St,True,-3AooxIkg38UyUdlz5oXdw,"['Italian', 'Nightlife', 'Event Planning & S...",34.421931,-119.702185,Chase Restaurant,436,3.0,"[115, 56, 67, 62, 144]","['2021-12-28 02:40:57', '2021-11-29 16:04:16',..."
1,2036 Cliff Dr,True,-6jvfSJGprbfBD2QrS9zQw,"['Food', 'Grocery']",34.402538,-119.724894,Mesa Produce,19,5.0,"[0, 0, 0, 0, 19]","[0, 0, 0, 0, '2021-10-24 16:08:48']"
2,1417 San Andres St,True,-ALqLSTzkGDMscHdxA1NgA,"['Mexican', 'Restaurants']",34.418221,-119.715795,Su Casa Fresh Mexican Grill,29,4.5,"[0, 2, 3, 7, 17]","[0, '2013-01-24 23:23:09', '2016-08-13 05:59:3..."
3,1905 Cliff Dr,True,-BdYhP-12elmFV7oB1iv4A,"['Gastropubs', 'Nightlife', 'American (New)'...",34.401382,-119.722472,Corner Tap Room,41,5.0,"[0, 0, 2, 5, 37]","[0, 0, '2021-10-21 03:56:27', '2021-10-30 15:4..."
4,31 W Carrillo St,True,-FM4CxOg4XXmX_Ebky_SiQ,"['Breakfast & Brunch', 'Nightlife', 'America...",34.420361,-119.702475,Finch & Fork,1405,4.0,"[63, 82, 132, 379, 778]","['2021-11-21 02:44:59', '2021-06-21 00:01:19',..."


In [3]:
def haversine(lat1, lon1, lat2, lon2, radius=3956):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : latitude and longitude of the second point, in decimal degrees.
    radius     : radius of the earth; its unit determines the unit of the
                 result. The default, 3956, is in miles; use 6371 for
                 kilometers.

    Returns
    -------
    The distance along the sphere, in the same unit as `radius`
    (miles by default).
    """
    # convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))

    # haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for (near-)
    # antipodal points, which would make asin() raise a domain error.
    # A plain comparison is used so that a NaN input still propagates as NaN.
    if a > 1.0:
        a = 1.0

    return 2 * asin(sqrt(a)) * radius

In [4]:
# Removes some basic categories from a row
def remove_cats(row, categories=['\'Food\'', '\'Restaurants\'']):
    """
    Return the set of category strings in row['categories'], minus the
    overly generic ones listed in `categories`.

    Entries are compared after stripping surrounding whitespace, because the
    category lists are produced by splitting on "," and therefore every entry
    but the first carries a leading space (" 'Food'" vs "'Food'"). Without
    the strip, such entries are never excluded and never intersect with the
    same category appearing first in another row. Empty entries are dropped.

    Parameters
    ----------
    row        : mapping (e.g. a DataFrame row) whose 'categories' value is an
                 iterable of strings.
    categories : category strings to exclude; read only, never mutated.

    Returns
    -------
    A set of whitespace-stripped category strings.
    """
    excluded = {cat.strip() for cat in categories}
    cleaned = (val.strip() for val in row['categories'])
    return {val for val in cleaned if val and val not in excluded}

In [5]:
def row_similarity(r1, r2):
    """
    Score how strongly restaurant `r1` should be linked to restaurant `r2`.

    The score is the sum of three parts:
      * the number of categories the two rows share (after remove_cats),
      * a proximity bonus of 0.1/distance, capped at 1,
      * a popularity bonus derived from r2's review count and star rating.

    The score is directional: the popularity bonus uses only `r2`, so
    row_similarity(a, b) and row_similarity(b, a) generally differ.

    Parameters
    ----------
    r1, r2 : rows providing 'latitude', 'longitude', 'categories',
             'review_count' and 'stars'.

    Returns
    -------
    The similarity score, or 0 when the two restaurants are more than
    10 miles apart (haversine's default unit is miles).
    """
    distance = haversine(r1['latitude'], r1['longitude'], r2['latitude'], r2['longitude'])

    # Cannot create an edge with a restaurant more than 10 miles away.
    # Written as `not <=` so that a NaN distance (missing coordinates) is
    # rejected as well, exactly like the out-of-range case.
    if not (distance <= 10):
        return 0

    # Adds to the similarity score if both restaurants have similar categories
    sim_score = len(remove_cats(r1).intersection(remove_cats(r2)))

    # Adds to the similarity score if both restaurants are geographically close.
    # The bonus is capped at 1, the value used for identical coordinates;
    # uncapped, 0.1/distance grows without bound as distance approaches 0, so
    # a neighbour a few feet away would outscore one at the very same location
    # and could clear any threshold regardless of categories.
    if distance == 0:
        sim_score += 1
    else:
        sim_score += min(1, 0.1 / distance)

    # Adds to similarity score if r2 has a high rating and/or a high review count.
    # log() is undefined for counts <= 0, so such rows (and rows with a
    # missing count) simply receive no popularity bonus instead of raising.
    review_count = r2['review_count']
    if review_count > 0:
        sim_score += (log(review_count) / 1.5 * r2['stars']) / 12

    return sim_score

In [6]:
def score_all(df, score_threshold=5):
    """
    Score every ordered pair of distinct rows in `df` with row_similarity.

    Both (a, b) and (b, a) are evaluated, because row_similarity is called
    with the arguments in each order and the results are stored separately.

    Parameters
    ----------
    df              : DataFrame of businesses; rows are passed to row_similarity.
    score_threshold : only pairs scoring strictly above this value are kept.

    Returns
    -------
    dict mapping (index_of_r1, index_of_r2) to the score rounded to 3 decimals.
    """
    # Materialise the rows once. Calling df.iterrows() inside the outer loop
    # rebuilds a Series for every row on every pass, which repeats the same
    # work once per outer row without changing any result.
    rows = list(df.iterrows())

    scores = {}

    # Iterates through every combination of rows
    for i1, r1 in rows:
        for i2, r2 in rows:
            # Ensures the rows are not the same
            if i1 == i2:
                continue

            score = row_similarity(r1, r2)
            if score > score_threshold:
                scores[(i1, i2)] = round(score, 3)

    return scores

In [7]:
# Score every ordered pair of businesses, keeping only pairs scoring above 5
scores = score_all(bu, score_threshold=5)

In [12]:
# Split the (n1, n2) key tuples of `scores` into two parallel lists
n1_list = [pair[0] for pair in scores]
n2_list = [pair[1] for pair in scores]

In [13]:
# Persist the edge list: one row per scored pair, with columns n1, n2, score
pd.DataFrame.from_dict(
    {
        "n1": n1_list,
        "n2": n2_list,
        "score": scores.values(),
    }
).to_csv("edges.csv")

In [92]:
# Preview the first ten businesses
bu.head(n=10)

Unnamed: 0,address,take_out,business_id,categories,latitude,longitude,name,review_count,stars,star_count,dates
0,1012 State St,True,-3AooxIkg38UyUdlz5oXdw,"['Italian', 'Nightlife', 'Event Planning & S...",34.421931,-119.702185,Chase Restaurant,436,3.0,"[115, 56, 67, 62, 144]","['2021-12-28 02:40:57', '2021-11-29 16:04:16',..."
1,2036 Cliff Dr,True,-6jvfSJGprbfBD2QrS9zQw,"['Food', 'Grocery']",34.402538,-119.724894,Mesa Produce,19,5.0,"[0, 0, 0, 0, 19]","[0, 0, 0, 0, '2021-10-24 16:08:48']"
2,1417 San Andres St,True,-ALqLSTzkGDMscHdxA1NgA,"['Mexican', 'Restaurants']",34.418221,-119.715795,Su Casa Fresh Mexican Grill,29,4.5,"[0, 2, 3, 7, 17]","[0, '2013-01-24 23:23:09', '2016-08-13 05:59:3..."
3,1905 Cliff Dr,True,-BdYhP-12elmFV7oB1iv4A,"['Gastropubs', 'Nightlife', 'American (New)'...",34.401382,-119.722472,Corner Tap Room,41,5.0,"[0, 0, 2, 5, 37]","[0, 0, '2021-10-21 03:56:27', '2021-10-30 15:4..."
4,31 W Carrillo St,True,-FM4CxOg4XXmX_Ebky_SiQ,"['Breakfast & Brunch', 'Nightlife', 'America...",34.420361,-119.702475,Finch & Fork,1405,4.0,"[63, 82, 132, 379, 778]","['2021-11-21 02:44:59', '2021-06-21 00:01:19',..."
5,2026 De La Vina St,True,-H-fZonTLitwHFY_PLAIjA,"['Pizza', 'Restaurants']",34.42955,-119.717605,Taffy's Pizza,192,4.0,"[26, 18, 12, 45, 97]","['2022-01-15 01:36:56', '2021-06-10 05:15:12',..."
6,718 State St,True,-I7M6l0h46VdiN09Y1NUZw,"['Gastropubs', 'Indian', 'Gluten-Free', 'Ve...",34.419239,-119.698387,Apna Indian Kitchen,246,4.5,"[10, 7, 8, 22, 208]","['2021-09-19 04:30:53', '2021-06-24 19:30:00',..."
7,3126 State St,True,-NXRuUsazxXZZ_OHwmtYtw,"['Sports Bars', 'Tacos', 'Beer', 'Pizza', ...",34.440606,-119.732181,Uptown Bar & Lounge,138,4.0,"[18, 9, 15, 30, 68]","['2021-06-11 03:03:08', '2021-12-28 23:11:32',..."
8,1201 State St,True,-TjT3y-Glfnbt6LbKETAYg,"['Breakfast & Brunch', 'Vegetarian', 'Seafoo...",34.423358,-119.704776,Benchmark Eatery,544,4.0,"[44, 51, 80, 164, 222]","['2021-12-20 22:29:32', '2021-09-27 19:36:07',..."
9,"137 Anacapa St, Ste C",True,-e8RwknT5szoLk9uBZjzcw,"['Wine Tasting Room', 'Beer', 'Arts & Entert...",34.414815,-119.690563,Riverbench Santa Barbara Tasting Room,121,4.0,"[10, 5, 13, 29, 65]","['2021-08-21 14:45:30', '2019-01-19 22:45:42',..."
