In [None]:
from pathlib import Path
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
from dataclasses import dataclass

from src.data import load_data, parse_json_columns

In [12]:
### 
data_path = Path("data/raw/recipes.csv")

df = load_data(path=data_path)

df = parse_json_columns(df=df)

# df.head()

print(len(df))

df = df.unique()

print(len(df))

for ingredients in df["ingredients"]:
    for ingredient in ingredients:
        if "tomato" in ingredient:
            print(ingredient)

62126
62126
1 ½ teaspoons tomato paste
1 (28 ounce) can diced tomatoes
1 large tomato, chopped
1 ripe tomato, chopped
2 (10.75 ounce) cans low sodium tomato soup
8 ounces cherry tomatoes, halved
1 (14.5 ounce) can stewed tomatoes
1 (10 ounce) can diced tomatoes and green chiles (such as RO*TEL®)
1 (6 ounce) can tomato paste
2 tablespoons tomato paste
1 slice tomato
3 roma (plum) tomatoes, diced
1 (14.5 ounce) can diced tomatoes
1 tomato
2 Roma tomatoes
1 (14.5 ounce) can diced tomatoes, drained
¼ cup cherry tomatoes for garnish
1 (14.5 ounce can) tomatoes
1 pint cherry tomatoes
6 Roma (plum) tomatoes, chopped
1 (16 ounce) can whole peeled tomatoes, chopped
1 small tomato, chopped
3 tablespoons tomato paste
4 roma (plum) tomatoes, diced
1 (10 ounce) can diced tomatoes with green chile peppers, with liquid
2 ripe fresh tomatoes, chopped
3 (6 ounce) cans tomato paste
1 (15 ounce) can tomato sauce
1 ½ tablespoons tomato paste
1 small tomato, diced
1 roma tomato, sliced
1 (14.5 ounce) cans 

In [None]:
@dataclass
class Recipe:
    category: str
    title: str
    ingredient_list: list[str]
    directions: list[str]

In [None]:
import re

MEASUREMENT_WORDS = {
    "can", "cup", "cups", "ounce", "ounces", "tablespoon", "tablespoons", "tsp",
    "tbsp", "teaspoon", "teaspoons", "tablespoon", "tablespoons", "oz",
    "pound", "pounds", "lb", "lbs", "gram", "grams", "kilogram", "kg",
    "ml", "liter", "liters", "pinch", "dash"
}

DESCRIPTIVE_WORDS = {
    "small", "medium", "large", "chopped", "minced", "frozen", "quart", "frying",
    "thin", "strips", "cut", "cleaned", "scaled", "peeled", "taste", "freshly", "ground",
    "seeded", "sliced", "fresh", "stripped", "cracked", "softened", "finely", "diced", "ripe",
    "pitted", "thinly", "crushed", "unbaked", "miniature", "julienned", "cooled", "drained", "rinsed",
    "divided", "stemmed", "bottled", "package", "shredded", "heated", "bite-sized", "pieces",
    "thawed", "uncooked", "jar", "caps", "slices", "dice", "matchsticksize", "lengths", "cubes", "thickly",
    "homemade", 
}


STOP_WORDS = {
    "of", "and", "for", "to", "into", "or", "more", "optional", "as", "needed", "per", "deveined",
    "inch", "such", "degrees", "f", "c", "container", "if", "enough", "cover", "removed", "bags", "the"
}

def clean_ingredient(ingredient: str):
    ingredient = ingredient.lower()

    # Remove punctuation and numbers
    ingredient = re.sub(r"[\'\-\(\)\/\d\.,⅛¼⅓½⅔¾™]+", "", ingredient)

    measurement_pattern = r"\b(" + "|".join(MEASUREMENT_WORDS) + r")\b"
    descriptive_pattern = r"\b(" + "|".join(DESCRIPTIVE_WORDS) + r")\b"
    stopword_pattern = r"\b(" + "|".join(STOP_WORDS) + r")\b"

    ingredient = re.sub(measurement_pattern, "", ingredient)
    ingredient = re.sub(descriptive_pattern, "", ingredient)
    ingredient = re.sub(stopword_pattern, "", ingredient)

    return ingredient.strip()


ingredients = df["ingredients"]
unique_ingredients = set()


for ingredient_list in ingredients:
    for ingredient in ingredient_list:
        ingredient = clean_ingredient(ingredient)
        unique_ingredients.add(ingredient)



In [42]:
unique_ingredients

{'',
 'shoestring fries',
 'onion green bell pepper  celery mix',
 'dry green lentils',
 'strawberry halves',
 'castelvetrano olives   dried',
 'fluid  bottle apple cider',
 'lean breakfast turkey sausage links',
 'homemade chicken broth  lowsodium canned broth',
 'spicy brown mustard',
 'green grapes   garnish',
 'loaf   white bread',
 'salmon fillets',
 'cooked green lentils',
 'boneless beef sirloin  beef top round steaks " thick',
 'unsweetened coconut milk wellshaken   thai kitchen',
 'whole bran cereal',
 'apple sauce  plum preserves  serving',
 'fireroasted  green chile peppers',
 'sugarfree vanilla extract',
 'chile pepper',
 'mashed avocado from mexico',
 'bunch  spinach   torn  bitesized',
 'red bell pepper   bitesize',
 'jars prepared pasta sauce',
 'green garlic garlic scapes',
 'extravirgin olive oil plus   garnish',
 'fluid  jigger vodka',
 'skinless boneless chicken breast halves pounded',
 'black forest ham',
 'white onions halved     rings',
 'microwavable white rice  