In [15]:
import re
import os
import ast
import string
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from rapidfuzz import process, fuzz
from google.cloud import storage
from io import StringIO
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

from Ingredients_list_setup import download_ingredients_df

[nltk_data] Downloading package stopwords to /Users/pato/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pato/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/pato/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/pato/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/pato/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:

def parse_ingredient(ingredient):
    """
    Input: Ingredient cell of a recipe row from the recipe dataset
    Output: A tuple (quantity, grammage, unit, name) where:
        - 'quantity': How many portions
        - 'grammage': Grammage depending on the unit provided
        - 'unit': Unit of the grammage
        - 'name': Name of the ingredient
    Purpose: Output will be used to generate a proper DataFrame using pd.DataFrame.
    """
    VALID_UNITS = {'g', 'tbsp', 'tsp', 'tspn', 'cup', 'ml', 'l', 'kg', 'oz', 'fl oz'}
    # Preprocessing to remove "/xoz" patterns and fractions like "½oz" when g is already provided
    ingredient = re.sub(r'/\d+oz', '', ingredient)  # Remove patterns like "/9oz"
    ingredient = re.sub(r'/\d+fl oz', '', ingredient)  # Remove patterns like "/9fl oz"
    ingredient = re.sub(r'/\d+[½⅓¼¾]+oz', '', ingredient)  # Remove fractions before "oz"

    # Regex to capture quantity, unit, and name
    pattern = r'(?:(\d+)(?:\s*x\s*(\d+))?)?\s*([a-zA-Z%½⅓¼]+)?\s*(.*)'
    match = re.match(pattern, ingredient)

    if match:
        quantity, sub_quantity, unit, name = match.groups()

        # Default values
        grammage = None
        portion_quantity = 1  # Default quantity if not provided

        # Handle the case of "2 x 80g"
        if sub_quantity:
            portion_quantity = int(quantity)
            grammage = int(sub_quantity)
        elif quantity and unit:
            grammage = int(quantity)
        elif quantity:
            portion_quantity = int(quantity)

        # If no grammage or unit is provided
        if not unit and not grammage:
            name = ingredient.strip()  # Full ingredient name as name

        # Debugging exception : Handling cases where the detected unit is actually the first word of the ingredient name
        if unit and unit not in VALID_UNITS:
            # Move the incorrectly detected unit back into the beggining of the name
            name = f"{unit} {name}".strip()
            unit = None  # Clear the unit, since it's invalid

        # Exception when a fraction of quantity is provided
        # Output example before fixing : 1       NaN     unit  ½ leftover roast chicken, torn into pieces
        # Fix : Check if a fraction is at the beginning of the name and adjust quantity
        fraction_pattern = r'^([½⅓¼¾])\s*(.*)'
        fraction_match = re.match(fraction_pattern, name)
        if fraction_match and portion_quantity == 1:
            fraction, remaining_name = fraction_match.groups()
            try:
                # Fraction to decimal dictionary
                fraction_value = {
                    "½": 0.5,
                    "⅓": 0.33,
                    "¼": 0.25,
                    "¾": 0.75
                }[fraction]
                portion_quantity = fraction_value  # Replacing quantity with the decimal
                name = remaining_name.strip()  # Removing the fraction from the name
            except KeyError:
                pass  # Keep running the code if error

        return (float(portion_quantity), grammage, unit, name.strip())

    # If no pattern is recognized -> Default return
    return (1, None, None, ingredient.strip())



In [12]:
# Step 1: Download data
client = storage.Client()
bucket_name = "recipes-dataset"
bucket = client.bucket(bucket_name)
blob = bucket.blob(f"Recipes/recipes.csv")
content = blob.download_as_text()
recipes = pd.read_csv(StringIO(content))[0:10]
recipe = recipes.copy().drop_duplicates(subset=["title"], keep='first')
print("Recipes downloaded")

ingredients = download_ingredients_df()
print("Ingredients downloaded")

Recipes downloaded
Ingredients downloaded


In [26]:
# Step 2: Process ingredients
all_parsed_ingredients = [
    (title, *parse_ingredient(ingredient))
    for title, ingreds in zip(recipe["title"], recipe["ingredients"])
    for ingredient in ast.literal_eval(ingreds)
    ]
print("All ingredients of all recipes pulled")
all_parsed_ingredients

All ingredients of all recipes pulled


[('15 minute pasta', 1.0, 350, 'g', 'penne pasta'),
 ('15 minute pasta',
  2.0,
  80,
  'g',
  'packs Parma ham, snipped into small pieces'),
 ('15 minute pasta',
  1.0,
  250,
  'g',
  'small brown chestnut mushrooms, halved or quartered'),
 ('15 minute pasta', 1.0, 200, 'g', 'full-fat crème fraîche'),
 ('15 minute pasta', 1.0, 100, 'g', 'Parmesan, grated'),
 ('15 minute pasta', 1.0, 2, 'tbsp', 'chopped fresh parsley'),
 ('15 minute pasta',
  1.0,
  None,
  None,
  'salt and freshly ground black pepper, to taste'),
 ('15 minute pasta', 1.0, None, None, 'green salad'),
 ('15 minute pasta', 1.0, None, None, 'crunchy bread'),
 ('1970s-style chicken curry', 1.0, 30, 'g', 'unsalted butter'),
 ('1970s-style chicken curry', 1.0, 1, None, 'onion , finely chopped'),
 ('1970s-style chicken curry',
  1.0,
  1,
  None,
  'green eating apple, finely chopped'),
 ('1970s-style chicken curry', 2.0, None, None, '2–3 tsp curry powder'),
 ('1970s-style chicken curry', 1.0, 1, 'tbsp', 'plain flour'),
 ('

In [27]:
# Convert to DataFrame directly
flat_recipes_df = pd.DataFrame(
    all_parsed_ingredients,
    columns=["recipe", "quantity", "grammage", "unit", "ingredient"]
    )
print("Parsed ingredients combined into a DataFrame")
flat_recipes_df

Parsed ingredients combined into a DataFrame


Unnamed: 0,recipe,quantity,grammage,unit,ingredient
0,15 minute pasta,1.0,350.0,g,penne pasta
1,15 minute pasta,2.0,80.0,g,"packs Parma ham, snipped into small pieces"
2,15 minute pasta,1.0,250.0,g,"small brown chestnut mushrooms, halved or quar..."
3,15 minute pasta,1.0,200.0,g,full-fat crème fraîche
4,15 minute pasta,1.0,100.0,g,"Parmesan, grated"
...,...,...,...,...,...
122,3D biscuits,1.0,,,"food colouring, if using"
123,3D biscuits,1.0,,,"ready -to-roll icing, in colours of your choic..."
124,Lemon curd ice cream,1.0,290.0,ml,/½ pint double cream
125,Lemon curd ice cream,1.0,1.0,,jar (340g) lemon curd
