# This notebook tries to establish a matching from each foundation ingredient to a recipe ingredient
An exact match is preferred, but if none exists, a fuzzy match is also acceptable.

In [22]:
from rapidfuzz import process, fuzz

In [23]:
foundation_ingredient_filepath = "resources/clean/foundation_ingredient.csv"
recipe_ingredient_filepath = "resources/clean/recipe_ingredients.csv"


def _read_ingredient_lines(filepath):
    """Return the stripped, non-empty lines of the text file at ``filepath``.

    The file is decoded as UTF-8 explicitly, so the result does not depend on
    the platform's default text encoding.
    NOTE(review): assumes the cleaned files were written as UTF-8 -- confirm.

    Args:
        filepath: Path of the file to read; every line is treated as one entry.

    Returns:
        list of str, one per non-blank line, with surrounding whitespace removed.
    """
    with open(filepath, "r", encoding="utf-8") as ingredient_file:
        stripped_lines = (line.strip() for line in ingredient_file)
        # Blank lines would otherwise be carried along as an empty-string entry.
        return [line for line in stripped_lines if line]


foundation_ingredient_lines = _read_ingredient_lines(foundation_ingredient_filepath)
recipe_ingredient_lines = _read_ingredient_lines(recipe_ingredient_filepath)

In [24]:
# De-duplicate both inputs; the sets are what the matching step consumes.
foundation_ingredients = set(foundation_ingredient_lines)
recipe_ingredients = set(recipe_ingredient_lines)

# Report the raw line counts (before de-duplication) of each input file.
for label, lines in (
    ("Foundation Ingredient Lines: ", foundation_ingredient_lines),
    ("Recipe Ingredient Lines: ", recipe_ingredient_lines),
):
    print(label, len(lines))

Foundation Ingredient Lines:  122
Recipe Ingredient Lines:  234063


In [25]:
from rapidfuzz.process import cdist, extractOne, extract
import tqdm
def find_matches(small_nouns, large_nouns, threshold=80):
    """Match every entry of ``small_nouns`` to an entry of ``large_nouns``.

    Each word is lower-cased and first looked up verbatim in ``large_nouns``.
    If that fails, the best candidate according to ``fuzz.ratio`` is taken,
    provided its score reaches ``threshold``.

    Args:
        small_nouns: Iterable of strings to find matches for.
        large_nouns: Collection of candidate strings. Candidates are compared
            as given (they are NOT lower-cased here).
        threshold: Minimum ``fuzz.ratio`` score for a fuzzy match; forwarded
            to ``extractOne`` as ``score_cutoff``.

    Returns:
        dict keyed by the original (not lower-cased) word, whose values are one of
        ``("EXACT", match)``, ``("FUZZY", match, score)`` or ``("NOT FOUND", None)``.
    """
    # Build the lookup set once, so the exact-match test does not rescan a
    # list/tuple of candidates for every word. Other container types are used
    # unchanged to keep their own membership semantics.
    if isinstance(large_nouns, (list, tuple)):
        exact_candidates = set(large_nouns)
    else:
        exact_candidates = large_nouns

    results = {}
    for word in tqdm.tqdm(small_nouns):
        word_lower = word.lower()
        if word_lower in exact_candidates:
            results[word] = ("EXACT", word_lower)
            continue

        # Closest candidate scoring at least `threshold`; None when there is none.
        match = extractOne(word_lower, large_nouns, scorer=fuzz.ratio, score_cutoff=threshold)
        if match is not None:
            results[word] = ("FUZZY", match[0], match[1])  # (type, match, score)
        else:
            results[word] = ("NOT FOUND", None)
    return results

# Look up every foundation ingredient among the recipe ingredients.
result = find_matches(
    small_nouns=foundation_ingredients,
    large_nouns=recipe_ingredients,
    threshold=80,
)

100%|██████████| 122/122 [00:00<00:00, 2412.21it/s]


We make one small manual correction: "oat milk" is matched to "vanilla oat milk" instead of "goat milk".

In [31]:
# Hand-picked mapping for this one ingredient; it is tagged 'EXACT' even though
# the two names differ.
result.update({'oat milk': ('EXACT', 'vanilla oat milk')})

In [32]:
# Print one report line per ingredient, formatted according to its match type.
for key, value in result.items():
    match_type = value[0]
    if match_type == "NOT FOUND":
        report_line = f"NOT FOUND: {key}"
    elif match_type == "FUZZY":
        # Fuzzy entries also carry the similarity score as their third item.
        report_line = f"FUZZY: {key} -> {value[1]} ({value[2]})"
    else:
        # Every remaining entry is reported as an exact match.
        report_line = f"EXACT: {key} -> {value[1]}"
    print(report_line)

EXACT: rutabaga -> rutabaga
EXACT: blackberry -> blackberry
EXACT: mushroom -> mushroom
EXACT: celery -> celery
EXACT: lentil -> lentil
EXACT: sesame butter -> sesame butter
EXACT: milk -> milk
EXACT: einkorn -> einkorn
EXACT: pea -> pea
EXACT: sorghum grain -> sorghum grain
EXACT: cream -> cream
EXACT: arugula -> arugula
EXACT: eggplant -> eggplant
EXACT: nut -> nut
EXACT: turkey -> turkey
EXACT: garlic -> garlic
EXACT: tomato -> tomato
EXACT: green onion -> green onion
EXACT: cheese -> cheese
EXACT: ketchup -> ketchup
EXACT: hummus -> hummus
EXACT: apple -> apple
EXACT: banana -> banana
EXACT: broccoli -> broccoli
EXACT: pawpaw -> pawpaw
EXACT: ham -> ham
EXACT: orange -> orange
EXACT: peanut -> peanut
EXACT: sweet potatoes -> sweet potatoes
EXACT: bread -> bread
EXACT: olive -> olive
EXACT: chia seeds -> chia seeds
EXACT: almond milk -> almond milk
EXACT: oat -> oat
EXACT: cottage cheese -> cottage cheese
EXACT: cookie -> cookie
EXACT: apricot -> apricot
EXACT: sugar -> sugar
EXACT:

Save the matching to a JSON file. We are now certain that all matches are correct.

In [34]:
# Keep only the ingredients that received a match (fuzzy or exact) and map each
# one to its matched name; "NOT FOUND" entries are left out.
match_dict = {
    ingredient: outcome[1]
    for ingredient, outcome in result.items()
    if outcome[0] in ("FUZZY", "EXACT")
}
print(match_dict)

{'rutabaga': 'rutabaga', 'blackberry': 'blackberry', 'mushroom': 'mushroom', 'celery': 'celery', 'lentil': 'lentil', 'sesame butter': 'sesame butter', 'milk': 'milk', 'einkorn': 'einkorn', 'pea': 'pea', 'sorghum grain': 'sorghum grain', 'cream': 'cream', 'arugula': 'arugula', 'eggplant': 'eggplant', 'nut': 'nut', 'turkey': 'turkey', 'garlic': 'garlic', 'tomato': 'tomato', 'green onion': 'green onion', 'cheese': 'cheese', 'ketchup': 'ketchup', 'hummus': 'hummus', 'apple': 'apple', 'banana': 'banana', 'broccoli': 'broccoli', 'pawpaw': 'pawpaw', 'ham': 'ham', 'orange': 'orange', 'peanut': 'peanut', 'sweet potatoes': 'sweet potatoes', 'bread': 'bread', 'olive': 'olive', 'chia seeds': 'chia seeds', 'almond milk': 'almond milk', 'oat': 'oat', 'cottage cheese': 'cottage cheese', 'cookie': 'cookie', 'apricot': 'apricot', 'sugar': 'sugar', 'millet': 'millet', 'salt': 'salt', 'butter': 'butter', 'lettuce': 'lettuce', 'yogurt': 'yogurt', 'nectarine': 'nectarine', 'onion rings': 'onion rings', 'pe

In [35]:
import json
matching_result_filepath = "resources/clean/matching_result.json"

# Write the mapping as JSON indented by 4 spaces.
with open(matching_result_filepath, "w") as matching_result_file:
    matching_result_file.write(json.dumps(match_dict, indent=4))

Also save a separate CSV file containing all the matched ingredients, i.e. those that are present in the recipe dataset.

In [36]:
import csv
# Iterating a dict yields its keys, so this is the sorted list of matched names.
matched_ingredients = sorted(match_dict)
matched_ingredients_filepath = "resources/clean/matched_ingredients.csv"
# newline='' leaves line endings to csv.writer. The explicit UTF-8 encoding
# makes the file identical on every platform and avoids encode errors for
# non-ASCII names under a narrower locale encoding.
with open(matched_ingredients_filepath, "w", newline='', encoding="utf-8") as matched_ingredients_file:
    writer = csv.writer(matched_ingredients_file)
    # One single-column row per ingredient, in sorted order.
    writer.writerows([ingredient] for ingredient in matched_ingredients)
