In [1]:
import spacy
import pandas as pd
import regex as re

In [2]:
nlp = spacy.load("en_core_web_sm")

In [3]:
def standardize_ingredient_spacy(ingredient):
    """
    Standardize an ingredient by extracting its head noun using spaCy.
    Returns the lemma (base form) of the first noun found.
    """
    doc = nlp(ingredient)
    for token in doc:
        if token.pos_ == "NOUN":
            return token.lemma_.lower()
    return ingredient.lower()

def standardize_ingredients_list(ingredients):
    """
    Given a list of ingredient strings, standardize each one.
    """
    return [standardize_ingredient_spacy(ing) for ing in ingredients]

In [4]:
def parse_ingredients(cell):
    """
    Given a string that looks like a list, e.g.
    "['6 medium apples (3 pounds/1.5 kg)', 'Galette...']",
    extract and return a list of ingredient strings.
    """
    # Remove leading and trailing whitespace
    cell = cell.strip()
    
    # Optionally remove outer brackets if present
    if cell.startswith('[') and cell.endswith(']'):
        cell = cell[1:-1]
    
    # Use regex to extract substrings between single quotes
    # This pattern will extract text within single quotes.
    ingredients = re.findall(r"'([^']+)'", cell)
    return ingredients


In [5]:
def convert_and_standardize(cell):
    """
    Parse the string cell to extract ingredients and standardize each.
    """
    ingredients = parse_ingredients(cell)
    
    # Debug print to check the parsing result
    print("Original cell:", cell)
    print("Parsed ingredients:", ingredients)
    
    return standardize_ingredients_list(ingredients)


In [6]:
df = pd.read_csv("recipes_with_descriptions.csv")

In [7]:
df["Standardized_Cleaned_Ingredients"] = df["Cleaned_Ingredients"].apply(convert_and_standardize)


Original cell: ['6 medium apples (3 pounds/1.5 kg)', 'Galette dough (page 231)', 'Frangipane (page 234)', '2 tablespoons (1 ounce/30 g) unsalted or salted butter, melted', '4 tablespoons (60 g) granulated or coarse-crystal sugar']
Parsed ingredients: ['6 medium apples (3 pounds/1.5 kg)', 'Galette dough (page 231)', 'Frangipane (page 234)', '2 tablespoons (1 ounce/30 g) unsalted or salted butter, melted', '4 tablespoons (60 g) granulated or coarse-crystal sugar']
Original cell: ['2 large Asian eggplants (about 1/3 pound/155 g each)', '2 extra-large eggs', 'Kosher salt and freshly ground black pepper', '3 tablespoons vegetable oil', '2 tablespoons picked cooked crabmeat (optional)', 'Fish sauce, for serving']
Parsed ingredients: ['2 large Asian eggplants (about 1/3 pound/155 g each)', '2 extra-large eggs', 'Kosher salt and freshly ground black pepper', '3 tablespoons vegetable oil', '2 tablespoons picked cooked crabmeat (optional)', 'Fish sauce, for serving']
Original cell: ['Kosher salt

In [8]:
df.to_csv("ingredients_standardized.csv", index=False)

In [9]:
# cleaned_ingredients = [
#     "kosher salt", 
#     "coarse salt", 
#     "sea salt", 
#     "fresh basil", 
#     "green bell pepper", 
#     "chicken breast",
#     "dried oregano"
# ]

# standardized = [standardize_ingredient_spacy(ing) for ing in cleaned_ingredients]
# print(standardized)