## CBO-Restaurant Matching Algorithm


This notebook contains the algorithm for CBO-Restaurant matching.

It uses generated data: a historical record of how CBOs and restaurants interacted, a list of all CBOs, and a list of all restaurants.

The algorithm takes the cuisine preferences and location of a selected CBO, compares them against every restaurant in the given data, and returns the best matches for that CBO.


## Data Loading & Preprocessing

In [None]:
#Importing libraries for project
import pandas as pd
import numpy as np
from collections import Counter
import math
import warnings
from functools import partial

In [None]:
# Read the three input tables from CSV files in the working directory
cbo_df, restaurant_df, meals_df = (
    pd.read_csv(csv_name) for csv_name in ('CBO.csv', 'Restaurant.csv', 'Meals.csv')
)

# Sanity check: report the row count of each table right after loading
print(
    "== Data loaded ==",
    "\nDataset sizes:",
    f"  CBOs: {len(cbo_df)} organizations",
    f"  Restaurants: {len(restaurant_df)} partners",
    f"  Total Records: {len(meals_df)} records",
    sep="\n",
)

== Data loaded ==

Dataset sizes:
  CBOs: 70 organizations
  Restaurants: 25 partners
  Total Records: 24647 records


## Data Cleaning

In [None]:
#importing needed libraries
import numpy as np
import pandas as pd
import re

#Coordinates of the center of each borough, stored as (latitude, longitude) in degrees.
#Used later for calculating the distance between CBOs and restaurants for the location score.
#Keys are looked up with the stripped borough name, so they must match the data's spelling and case.
borough_centroids = {
    'Manhattan': (40.7831, -73.9712),
    'Brooklyn': (40.6782, -73.9442),
    'Queens': (40.7282, -73.7949),
    'Bronx': (40.8448, -73.8648),
    'Staten Island': (40.5795, -74.1502)
}

#Great-circle distance, in miles, between two latitude/longitude points.
#The result is what the location score is built from.
def haversine_miles(lat1, lon1, lat2, lon2):
    """Return the haversine (great-circle) distance in miles.

    All four arguments are angles in degrees (they are converted with
    np.radians). They may be scalars or NumPy arrays; NumPy broadcasting
    applies, so one point can be compared against an array of points.
    """
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lon1, lat2, lon2))
    sin_half_dphi = np.sin((phi2 - phi1) / 2)
    sin_half_dlam = np.sin((lam2 - lam1) / 2)
    hav = sin_half_dphi ** 2 + np.cos(phi1) * np.cos(phi2) * sin_half_dlam ** 2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return 3958.7613 * central_angle  #Earth radius in miles times the central angle

#Normalizes free text for comparison: lowercase, '&' spelled out, punctuation stripped
def norm_text(val):
    """Return a normalized, lowercase version of ``val``.

    Missing values (anything pd.isna reports as missing) become ''.
    Otherwise the value is stringified, lowercased and trimmed, '&' is
    replaced by ' and ', every character other than a-z, 0-9, whitespace
    and the separators ; / , - is replaced by a space, and runs of
    whitespace are collapsed to a single space.
    """
    if pd.isna(val):
        return ''
    cleaned = str(val).lower().strip()
    #Order matters: expand '&' before it would be wiped out as a special character
    substitutions = (
        (r'&', ' and '),
        (r'[^a-z0-9\s;/,\-]', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

#Turns a raw cuisine field into a clean, de-duplicated list of cuisine tags
def split_tags(val):
    """Split a cuisine field into normalized tags.

    The value is normalized with norm_text, split on ';', '/' or ',',
    and each non-empty piece is trimmed. A few spelling variants are
    mapped onto one canonical tag. Returns the unique tags in sorted
    order, or [] when the normalized text is empty.
    """
    normalized = norm_text(val)
    if not normalized:
        return []
    #Spelling variants that should count as the same cuisine
    canonical = {
        'southeast asian': 'se asian',
        'south east asian': 'se asian',
        'afro latino': 'afro-latino'
    }
    tags = set()
    for piece in re.split(r'[;/,]', normalized):
        piece = piece.strip()
        if piece:
            tags.add(canonical.get(piece, piece))
    return sorted(tags)

#Converts a distance in miles into a location score between 0.0 and 1.0.
#The score falls linearly from 1.0 at 0 miles to 0.0 at the cutoff (12 miles by default).
def distance_score_miles(dist_mi, cutoff=12.0):
    """Return ``1 - dist_mi / cutoff`` clipped to the range [0.0, 1.0].

    ``dist_mi`` may be a scalar or a NumPy array; the result has the
    same shape.
    """
    fraction_of_cutoff = dist_mi / cutoff
    lowest, highest = 0.0, 1.0
    return np.clip(1.0 - fraction_of_cutoff, lowest, highest)

#Cuisine compatibility check between a CBO and a restaurant.
#The tag 'variable' on either side acts as a wildcard.
def cuisine_allowed(cbo_tags, rest_tags):
    """Return True when the restaurant's cuisine is acceptable to the CBO.

    That is the case when either tag collection contains 'variable', or
    when the two collections share at least one tag.
    """
    wanted, offered = set(cbo_tags), set(rest_tags)
    return (
        'variable' in wanted
        or 'variable' in offered
        or not wanted.isdisjoint(offered)
    )


## Matching Algorithm


In [None]:
#Function to find scores for each restaurant based on CBO location and cuisine preference
def find_best_restaurants(cbo_name, cbo_df, restaurant_df, top_n=5, w_distance=0.75, w_borough=0.25):
    """Return the top-scoring restaurants for one CBO.

    Cuisine is a hard filter (see cuisine_allowed); the restaurants that
    pass it are ranked by
    ``w_distance * location score + w_borough * same-borough flag``.
    Locations are approximated by the borough centroids, so every
    restaurant in the CBO's own borough has a distance of 0 miles.

    Parameters
    ----------
    cbo_name : str
        Case-insensitive, literal substring of the CBO name. If several
        CBOs match, the first matching row of ``cbo_df`` is used.
    cbo_df : pandas.DataFrame
        Must contain 'Borough' and 'Cuisine Preference'. Names are read
        from 'Partner/Site' when present, otherwise from the first column.
    restaurant_df : pandas.DataFrame
        Must contain 'Restaurant', 'Street Address', 'Borough/Region' and
        'Cuisine'.
    top_n : int
        Maximum number of rows returned.
    w_distance, w_borough : float
        Weights of the location score and of the same-borough flag.

    Returns
    -------
    pandas.DataFrame or None
        The restaurant columns plus 'Distance_mi', 'Location Score',
        'Borough_Score', 'Cuisine_OK' and 'Total Score', best match first.
        Ties keep the row order of ``restaurant_df``. None is returned,
        after printing a message, when no CBO name matches.

    Notes
    -----
    A borough that is not a key of borough_centroids yields NaN distance
    and NaN 'Total Score'; sort_values places such rows last. Neither
    input frame is modified.
    """
    #Strips surrounding whitespace from a borough name, leaving missing values untouched
    def _clean_borough(value):
        return value if pd.isna(value) else str(value).strip()

    #Looks up the (lat, lon) centroid of a borough; unknown boroughs give (NaN, NaN)
    def _centroid(borough):
        return borough_centroids.get(borough, (np.nan, np.nan))

    #input validation for the function
    cbo_name_col = 'Partner/Site' if 'Partner/Site' in cbo_df.columns else cbo_df.columns[0]
    #regex=False: the name is matched as a literal substring, so names containing
    #regex metacharacters such as '(' or '+' neither raise nor match unintended rows
    matches = cbo_df[cbo_name_col].astype(str).str.lower().str.contains(
        str(cbo_name).lower(), na=False, regex=False
    )
    if matches.sum() == 0:
        print('CBO not found: ' + str(cbo_name))
        return None

    #Only the first matching CBO row is needed; derive its borough, centroid and cuisine tags
    cbo_row = cbo_df.loc[matches].iloc[0]
    cbo_borough = _clean_borough(cbo_row['Borough'])
    cbo_lat, cbo_lon = _centroid(cbo_borough)
    cbo_tags = split_tags(cbo_row['Cuisine Preference'])

    #Normalized borough name and centroid coordinates for every restaurant
    rest_borough = restaurant_df['Borough/Region'].apply(_clean_borough)
    rest_coords = np.array([_centroid(b) for b in rest_borough], dtype=float).reshape(-1, 2)

    #calculates scores for location and borough
    dist_mi = haversine_miles(cbo_lat, cbo_lon, rest_coords[:, 0], rest_coords[:, 1])
    dist_sc = distance_score_miles(dist_mi, cutoff=12.0)
    borough_sc = (rest_borough.values == cbo_borough).astype(float)

    #Cuisine is a HARD gate; scoring is distance + borough only.
    allowed_mask = np.array(
        [cuisine_allowed(cbo_tags, split_tags(cuisine)) for cuisine in restaurant_df['Cuisine']],
        dtype=bool,
    )
    total = (w_distance * dist_sc) + (w_borough * borough_sc)

    #Output of function returned in out_df
    out_df = restaurant_df[['Restaurant','Street Address','Borough/Region','Cuisine']].copy()
    out_df['Distance_mi'] = dist_mi
    out_df['Location Score'] = dist_sc
    out_df['Borough_Score'] = borough_sc
    out_df['Cuisine_OK'] = allowed_mask
    out_df['Total Score'] = total
    out_df = out_df.loc[allowed_mask].copy()
    #Stable sort: restaurants with equal scores keep their input order, so the
    #rows that make the top_n cut are the same on every run
    out_df = out_df.sort_values('Total Score', ascending=False, kind='mergesort')
    return out_df.head(top_n).reset_index(drop=True)

## Matching Algorithm Demonstration

In [None]:
#Demo: rank restaurants for one sample CBO with a 0.75 distance / 0.25 borough weighting
sample_cbo = 'Plaza Del Sol'

best_matches_new_weights = find_best_restaurants(
    cbo_name=sample_cbo,
    cbo_df=cbo_df,
    restaurant_df=restaurant_df,
    top_n=5,
    w_distance=0.75,
    w_borough=0.25,
)

#find_best_restaurants returns None when the CBO name is not found, so only show real results
if best_matches_new_weights is not None:
    print("\nBest matches with new weights:")
    display(best_matches_new_weights)


Best matches with new weights:


Unnamed: 0,Restaurant,Street Address,Borough/Region,Cuisine,Distance_mi,Location Score,Borough_Score,Cuisine_OK,Total Score
0,Estrella Latina,39-07 104th Street,Queens,Caribbean,0.0,1.0,1.0,True,1.0
1,Atomic Wings,159-23 Hillside Avenue,Queens,American,0.0,1.0,1.0,True,1.0
2,Good Eats Bistro,100-15 Ditmars Boulevard,Queens,American; Variable,0.0,1.0,1.0,True,1.0
3,Flavors Corner,1018 41st Avenue,Queens,Global,0.0,1.0,1.0,True,1.0
4,Monkworx,59-38 69th Ave.,Queens,Variable; Latin Carribbean,0.0,1.0,1.0,True,1.0
