# Base Preprocessing

## ⬇️ Imports

In [46]:
%%bash
pip install -r ../../requirements.txt



In [1]:
import pandas as pd
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import os

ps = PorterStemmer()

## Downloading the dataset

In [3]:
data  = pd.read_csv('../data/RAW_recipes.csv')
data.set_index('id', inplace=True)
columns = ["tags", "steps", "ingredients", "nutrition"]

for i in columns:
    data[i] = data[i].apply(ast.literal_eval)

data.drop(columns=["contributor_id", "submitted"], inplace=True, errors="ignore")
data.dropna(subset=["name"], inplace=True)
data = data[data['minutes'] < 300]
data.head()

Unnamed: 0_level_0,name,minutes,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
137739,arriba baked winter squash mexican style,55,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
31490,a bit different breakfast pizza,30,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
112140,all in the kitchen chili,130,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
59389,alouette potatoes,45,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
44061,amish tomato ketchup for canning,190,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


### Stop-word removing

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
additional = {
    "minutes", "easiest", "ever", "aww", "i", "can", "t", "believe", "it", "s", "stole", "the", "idea", "from","mirj", "andrea", " s ", "andreas",
    "viestad", "andes", "andersen", "an", "ana", "amy", "2 ww points", "on demand", "anelia", "amazing",
    "ashley", "ashton", "amazing", "make", "house", "smell", "malcolm", "amazingly", "killer", "perfect",
    "addictive", "leave", "u", "licking", "ur", "finger", "clean", "th", "recipe", "special", "time", "favorite",
    "aunt", "jane", "soft", "and", "moist", "licking", "famous", "non fruitcake", "true", "later",
    "nonbeliever", "believer", "comfort", "ultimate", "lover", "love", "easy", "ugly", "cc", "uncle", "bill", "tyler",
    "unbelievably", "unbelievable", "healthy", "fat", "free", "un", "melt", "mouth", "ummmmm", "umm", "ummmy", "nummy", "ummmm", "unattended",
    "unbaked", "ultra", "ultimately", "yummy", "rich", "quick", "rachael", "ray", "fail", "party", "florence",
    "fast", "light", "low", "carb", "snack", "wedding", "anniversary", "anne", "marie", "annemarie", "annette", "funicello", "syms",
    "byrn", "mike", "willan", "summer", "autumn", "winter", "spring", "burrel", "anna", "tres", "sweet", "uber",
    "homemade", "ann","best","j", "anite", "anitas", "anman", "angie", "angry", "simple", "difficult", "andy", "andrew", "ancient", "still", "another", "best", "go",
    "grant", "grandma", "amusement", "park", "instruction", "kitchen", "test", "ww", "almost", "empty", "dressing", "instant", "like", "le", "virtually",
    "home", "made", "guilt", "guilty", "delicious", "parfait", "forgotten", "forget", "forevermama", "diet", "can", "real", "former",
    "miss", "fabulous", "forever", "authentic", "fortnum", "mason", "kid", "foolproof", "football", "season", "diabetic",
    "two", "small", "one", "three", "four", "five", "thanksgiving", "dream", "foothill", "paula", "deen", "food", "processor", "safari", "processor",
    "traditional", "forbidden", "flavorful", "grandmag", "grandmama", "grandmaman", "grandma", "grandmom", "lena", "alicia", "alisa", "alice", "ali", "bit", "different",
    "eat", "family", "global", "gourmet", "yam", "yam", "emotional", "balance", "tonight", "feel", "cooking", "got", "birthday", "air", "way", "mr", "never", "weep", "half",
    "anything", "pour", "put", "fork", "say", "stove", "top", "thought", "prize", "winning", "add", "ad", "good", "better", "da", "style", "even", "bran", "fake", "fire", "beautiful"
    "l", "game", "day", "hate", "world", "minute", "type", "starbucks", "biggest", "dressed", "summertime", "elmer", "johnny", "depp", "c", "p", "h", "clove", "er", "star", "week",
    "affair", "elegant", "student", "z", "whole", "lotta", "w", "z", "b", "aaron", "craze", "a", "abc", "absolute", "absolut", "absolutely", "perfection", "delightful", "lazy", "morning",
    "abuelo", "abuelito", "abuelita", "abuela", "acadia", "accidental", "adam", "little", "interest", "addicting", "addie", "adele", "adelaide", "adi", "adie", "adriana",
    "adult", "affordable", "alison", "holst", "purpose", "allegheny", "allegedly", "original", "allergic", "ex", "allergy", "allergen", "allen", "poorman", "backyard",
    "alton", "brown", "whatever", "anthony", "anytime", "april", "fool", "ya", "fooled", "sandra", "lee", "edna", "emma", "emy", "evy", "eva", 'evelyn', "fannie", "fanny", "flo", "gladys", "helen", "grace", "ira", "irma",
    "isse", "jean", "janet", "jenny", "juju", "judy", "kathy", "kathi", "kellie", "kelly", "laura", "lee", "kay", "kathleen", "laura", "lee", "lesley", "lil", "linda", "liz", "lois", "louisse",
    "mag", 'martguerite', "margie", "marge", "maggie", "martha", "marylin", "marion", "mary", "marthy", "melody", "michel", "meda", "millie", "muriel", "myrna", "nelda", "nancy", "paulie", "phillis", "rae", "rebecca",
    "rose", "sadie", "sarah", "sara", "sue", "susan", "teresa", "theresa", "auntie", "em", "barbara", "barb", "irene", "lolo", "lori", "lu", "maebelle",
    "aunty", "aussie", "aurora", "austin", "l", "q"
    
    }
stop_words.update(additional) 

# Function to clean recipe names
def clean_recipe_names(recipes):
    cleaned_recipes = []
    
    for recipe in recipes:

        recipe = recipe.lower()
        recipe = re.sub(r'[^a-z\s]', '', recipe)
        
        recipe_words = recipe.split()
        
        # Lemmatize first
        recipe_words = [lemmatizer.lemmatize(word) for word in recipe_words]
        
        # Then remove stopwords
        recipe_words = [word for word in recipe_words if word not in stop_words]
        
        cleaned_recipe = " ".join(recipe_words)
        cleaned_recipes.append(cleaned_recipe)
    
    return cleaned_recipes


data['name'] = clean_recipe_names(data['name'])

data.dropna(subset=['name', 'description'], inplace=True)
data.reset_index(inplace=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxboc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/maxboc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Columns Normalization

In [5]:
data["description"] = data["description"].apply(lambda x: x.lower())

columns = ["tags", "steps", "ingredients"]

for c in columns:
    data[c] = data[c].apply(lambda x : [s.lower() for s in x])

data.head()


Unnamed: 0,id,name,minutes,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,137739,arriba baked squash mexican,55,"[60-minutes-or-less, time-to-make, course, mai...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7
1,31490,breakfast pizza,30,"[30-minutes-or-less, time-to-make, course, mai...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6
2,112140,chili,130,"[time-to-make, course, preparation, main-dish,...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13
3,59389,alouette potato,45,"[60-minutes-or-less, time-to-make, course, mai...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11
4,44061,amish tomato ketchup canning,190,"[weeknight, time-to-make, course, main-ingredi...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8


In [5]:
""" No row to drop was found
print(len(data))
data.dropna(subset=columns, inplace=True, how="any")
len(data)
"""

' No row to drop was found\nprint(len(data))\ndata.dropna(subset=columns, inplace=True, how="any")\nlen(data)\n'

In [6]:
# create strings from string lists
data["steps_strings"] = data["steps"].apply(lambda x : ' '.join(x))

### Unit Management and Stemming

What we manage so far :
- Some of the mispelled terms (e.g. "minuet" instead of "minute") 
- Standardization for units (mL to L, °F to °C ...)
- Some weird notations like "2-3 kg", "2 to 3 cups" "2 1/2 hours" 

In [7]:
import re
import pandas as pd
from nltk.stem import PorterStemmer
from rapidfuzz import process, fuzz

# Define the stemmer
ps = PorterStemmer()
STANDARD_VOLUME = "liter"
STANDARD_TIME = "minute"
STANDARD_WEIGHT = "gram"
STANDARD_TEMP = "celsius"

# Unit conversion factors to standardized units
unit_conversions = {
    # Volume conversions to liter
    "cup": 0.24,         # 1 cup = 0.24 liters
    "quart": 0.95,       # 1 quart = 0.95 liters
    "mL": 0.001,         # 1 mL = 0.001 liters
    
    # Time conversions to minutes
    "second": 1/60,      # 1 second = 1/60 minutes
    "hours": 60,         # 1 hour = 60 minutes
    "week": 10080,       # 1 week = 10080 minutes
    
    # Weight conversions to grams
    "pound": 453.59,     # 1 pound = 453.59 grams
    "ounce": 28.35,      # 1 ounce = 28.35 grams
    "kg": 1000,          # 1 kg = 1000 grams
    "g": 1,              # 1 g = 1 gram

    # Inch to centimeter conversion
    "inch": 2.54,        # 1 inch = 2.54 cm
    "inches": 2.54,      # 1 inch = 2.54 cm
}

# Mapping of units to their standard category
unit_categories = {
    # Volume units
    "cup": STANDARD_VOLUME,
    "quart": STANDARD_VOLUME,
    "mL": STANDARD_VOLUME,
    "liter": STANDARD_VOLUME,
    
    # Time units
    "second": STANDARD_TIME,
    "minutes": STANDARD_TIME,
    "hours": STANDARD_TIME,
    "week": STANDARD_TIME,
    
    # Weight units
    "pound": STANDARD_WEIGHT,
    "ounce": STANDARD_WEIGHT,
    "kg": STANDARD_WEIGHT,
    "g": STANDARD_WEIGHT,
    "gram": STANDARD_WEIGHT,

    # Measurement units
    "inch": "cm",
    "inches": "cm",
    
    # Temperature units
    "°c": STANDARD_TEMP,
    "°f": STANDARD_TEMP,
    "celsius": STANDARD_TEMP,
    "fahrenheit": STANDARD_TEMP,
}

common_units = list(unit_categories.keys())

typo_corrections = {
    "gram": "g",
    "gm": "g",
    "lb": "pound",
    "oz": "ounce",
    "kilogram": "kg",
    "centimetr": "cm",
    "centimet": "cm",
    "mm": "mL",
    "millimet": "mL",
    "millilit": "mL",
    "centigrad": "°c",
    "litr": "liter",
    "cupof": "cup of",
    "talbespoon": "tablespoon",
    "tablespon": "tablespoon",
    "tablesppoon": "tablespoon",
    "tblpss": "tablespoon",
    "tbso": "tablespoon",
    "tbspn": "tablespoon",
    "tbslp": "tablespoon",
    "tsbp": "tablespoon",
    "tlbsp": "tablespoon",
    "tablestoon": "tablespoon",
    "tablepoon": "tablespoon",
    "teasppon": "teaspoon",
    "teapsoon": "teaspoon",
    "teaspon": "teaspoon",
    "teaspoom": "teaspoon",
    "cu": "cup",
    
    # Temperature corrections
    "f": "°f",
    "fahrenheit": "°f",
    "c": "°c",
    "celsius": "°c",
    "degrees f": "°f",
    "degrees c": "°c",
    "degree f": "°f",
    "degree c": "°c",

    # Times
    "min" : "minutes",
    "minuet": "minutes",
    "minutesthen": "minutes",
    "minutesor": "minutes",
    "minutesyour": "minutes",
    "minuet": "minutes",
    "miniut": "minutes",
    "mimut": "minutes",
    "mionut": "minutes",
    "mintur": "minutes",
    "mkinut": "minutes",
    "mminut": "minutes",
    "munut": "minutes",
    "minuest": "minutes",
    "minunet": "minutes",
    "mintes": "minutes",
    "mutes": "minutes",
    "mutesr": "minutes",
    "minutesr": "minutes",
    "minuteslong": "minutes long",
    "minutesbrush": "minutes brush",
    "minnut": "minutes",
    "minuteuntil": "minutes until",
    "minutesm": "minutes",
    "nminut": "minutes",
    "minit": "minutes",
    "minutu": "minutes",
    "mihnut": "minutes",
    "mintut": "minutes",
    "minutr": "minutes",
    "ninut": "minutes",
    "minutew": "minutes",
    "minutess": "minutes",
    "minutesssssssss": "minutes",
    "minuteswil": "minutes will",
    "seccond": "second",
    "secong": "second",
    "seceond": "second",
    "housr": "hours",
    "houir": "hours",
    "hoursin": "hours",
    "hoursovernight": "hours overnight",
    "secon": "second",
    "seccond": "second",
    "secong": "second",
    "seceond": "second",
    "wk": "week",
    "hr": "hours",
    "b": "lb",
    "z": "oz",
    "″": "inch",
    '"': "inch" 
}

def fahrenheit_to_celsius(f):
    """Convert Fahrenheit to Celsius and round to 2 decimal places"""
    return round((f - 32) * 5.0 / 9.0, 2)
def standardize_temperature(text):
    """
    Process temperature mentions in text, converting Fahrenheit to Celsius,
    but preserving temperatures that are already in Celsius.
    Also avoids confusing time durations with temperatures.
    """
    # First handle explicit Celsius (just standardize format without changing the value)
    # Handle this BEFORE Fahrenheit to avoid double conversion
    c_pattern = r'(\d+(?:\.\d+)?)\s*(?:degrees?\s*c|°c|\^c|celsius)'
    result_text = re.sub(c_pattern, 
                     lambda m: f"{float(m.group(1))} {STANDARD_TEMP}", 
                     text, 
                     flags=re.IGNORECASE)
    
    # Also handle direct C notation
    direct_c_pattern = r'(\d+(?:\.\d+)?)([°]?c\b)'
    result_text = re.sub(direct_c_pattern, 
                     lambda m: f"{float(m.group(1))} {STANDARD_TEMP}", 
                     result_text, 
                     flags=re.IGNORECASE)
    
    # Now handle explicit Fahrenheit temperatures, which we definitely want to convert
    # Handle explicit Fahrenheit with degree symbol or text
    f_pattern = r'(\d+(?:\.\d+)?)\s*(?:degrees?\s*f|°f|\^f|fahrenheit)'
    
    def process_temp(match):
        temp_value = float(match.group(1))
        return f"{fahrenheit_to_celsius(temp_value)} {STANDARD_TEMP}"
    
    result_text = re.sub(f_pattern, process_temp, result_text, flags=re.IGNORECASE)
    
    # Handle direct F notation
    direct_f_pattern = r'(\d+(?:\.\d+)?)([°]?f\b)'
    result_text = re.sub(direct_f_pattern, process_temp, result_text, flags=re.IGNORECASE)
    
    # For standalone "350 degrees" without explicit F/C, we should be careful
    # We'll only convert if we can strongly infer it's Fahrenheit (like high cooking temps)
    pattern_standalone_degrees = r'(\d+(?:\.\d+)?)\s*(?:degrees?|°)(?!\s*[fc]|\s*fahrenheit|\s*celsius)'
    
    def process_ambiguous_temp(match):
        value = float(match.group(1))
        # Only convert if very likely Fahrenheit (high cooking temps > 200)
        if value > 200:
            return f"{fahrenheit_to_celsius(value)} {STANDARD_TEMP}"
        # For temperatures that could be either F or C (e.g., 100 degrees),
        # preserve the original text to avoid incorrect conversions
        return match.group(0)
    
    result_text = re.sub(pattern_standalone_degrees, process_ambiguous_temp, result_text, flags=re.IGNORECASE)
    
    # Then handle cooking context temperatures (preheat, heat, bake, etc.) CAREFULLY
    # We need to avoid matching phrases like "bake 15 minutes"
    # This pattern specifically looks for temperatures WITHOUT explicit Celsius indication
    cooking_temp_pattern = r'((?:preheat|heat|oven|temperature|temp)(?:\s+to)?)\s+(\d+(?:\.\d+)?)(?:\s*(?:degrees?|°)|\b)(?!\s*(?:minute|min|hour|sec|day|week))(?!\s*(?:c|celsius|°c))'
    
    def process_cooking_temp(match):
        context = match.group(1)
        value = float(match.group(2))
        # For cooking temperatures:
        # - Values below 100: Could be C, don't convert
        # - Values 100-200: Ambiguous zone, examine more carefully
        # - Values above 200: Very likely F, convert to C
        if value > 200:
            return f"{context} {fahrenheit_to_celsius(value)} {STANDARD_TEMP}"
        # For ambiguous or likely Celsius values, preserve original
        return match.group(0)
    
    result_text = re.sub(cooking_temp_pattern, process_cooking_temp, result_text, flags=re.IGNORECASE)
    
    # Special case for "bake at X" or "cook at X" where X is a temperature
    # Only match cases NOT explicitly marked as Celsius
    bake_at_pattern = r'((?:bake|cook)(?:\s+at)?)\s+(\d+(?:\.\d+)?)(?:\s*(?:degrees?|°)|\b)(?!\s*(?:minute|min|hour|sec|day|week))(?!\s*(?:c|celsius|°c))'
    
    def process_bake_temp(match):
        context = match.group(1)
        value = float(match.group(2))
        # Similar logic as cooking temperatures
        # Only convert values that are very likely to be Fahrenheit
        if value > 200:
            return f"{context} {fahrenheit_to_celsius(value)} {STANDARD_TEMP}"
        # For ambiguous temperatures or temperatures likely in Celsius already, preserve original
        return match.group(0)
    
    result_text = re.sub(bake_at_pattern, process_bake_temp, result_text, flags=re.IGNORECASE)
    
    return result_text

def standardize_measurements(text):
    """
    Handle measurement-specific standardizations, especially for dimensions like 9x5"
    """
    # Keep the dimension format (NxM) but convert each number from inches to cm
    dimension_pattern = r'(\d+(?:\.\d+)?)x(\d+(?:\.\d+)?)(?:"|″|inch(?:es)?)?'
    result = re.sub(dimension_pattern, 
                    lambda m: f"{float(m.group(1)) * 2.54}x{float(m.group(2)) * 2.54} cm", 
                    text)
    
    # Handle single inch measurements
    inch_pattern = r'(\d+(?:\.\d+)?)(?:"|″|inch(?:es)?)'
    result = re.sub(inch_pattern, 
                    lambda m: f"{float(m.group(1)) * 2.54} cm", 
                    result)
    
    return result

def correct_term(word):
    """Apply fuzzy matching to correct typos in unit terms"""
    # If it is a number return the word
    if not any(c.isalpha() for c in word):
        return word

    # Check if in mapping
    if word in typo_corrections:
        return typo_corrections[word]
      
    # Fuzzy matching
    match, score, _ = process.extractOne(word, common_units, scorer=fuzz.ratio)
    if score > 80:
        return match
    return word

def parse_range(word, next_word=None):
    """
    Detects numeric ranges like "2-3" or "2 to 3" and returns their mean as a float.
    E.g. => 2-3 kgs becomes 2.5 kgs
    """
    if re.match(r"^\d+(\.\d+)?-\d+(\.\d+)?$", word):  # "2-3"
        start, end = map(float, word.split("-"))
        return (start + end) / 2
    if next_word and word.isdigit() and next_word == "to":
        return "to" 
    return None

def standardize_units(text):
    """
    Main function to standardize all units in text.
    This refactored approach processes different unit types in separate passes.
    """
    # Step 1: First convert temperatures (to avoid conflicts with other patterns)
    result = standardize_temperature(text)
    
    # Step 2: Handle measurement standardizations (inches, dimensions)
    result = standardize_measurements(result)
    
    # Step 3: Now process the remaining units
    words = result.lower().split()
    result_words = []
    i = 0
    
    while i < len(words):
        word = words[i]
        next_word = words[i+1] if i + 1 < len(words) else ""
        next2_word = words[i+2] if i + 2 < len(words) else ""
        next3_word = words[i+3] if i + 3 < len(words) else ""

        # Handle fractions like "1 / 2 inch"
        if (
            i + 2 < len(words)
            and re.match(r"^\d+(\.\d+)?$", word)
            and words[i+1] == "/"
            and re.match(r"^\d+(\.\d+)?$", words[i+2])
        ):
            numerator = float(word)
            denominator = float(words[i+2])
            fraction_value = numerator / denominator
            
            # Check if there's a unit after the fraction
            if i + 3 < len(words):
                corrected_unit = correct_term(words[i+3])
                if corrected_unit in unit_categories:
                    category = unit_categories[corrected_unit]
                    converted = fraction_value * unit_conversions.get(corrected_unit, 1)
                    result_words.append(f"{converted} {category}")
                    i += 4  # Move past the fraction and the unit
                    continue
            
            # If no unit or unrecognized unit, just keep the fraction as a decimal
            result_words.append(str(fraction_value))
            i += 3
            continue

        # Handle "2-3 kg"
        value = parse_range(word, next_word)
        if isinstance(value, float) and next_word:
            corrected_unit = correct_term(next_word)
            if corrected_unit in unit_categories:
                category = unit_categories[corrected_unit]
                converted = value * unit_conversions.get(corrected_unit, 1)
                result_words.append(f"{converted} {category}")
                i += 2
                continue

        # Handle "2 to 3 kg"
        if value == "to" and next2_word.replace('.', '', 1).isdigit() and next3_word:
            average = (float(word) + float(next2_word)) / 2
            corrected_unit = correct_term(next3_word)
            if corrected_unit in unit_categories:
                category = unit_categories[corrected_unit]
                converted = average * unit_conversions.get(corrected_unit, 1)
                result_words.append(f"{converted} {category}")
                i += 4
                continue

        # Handle regular value + unit
        if re.match(r"^\d+(\.\d+)?$", word) and next_word:
            corrected_unit = correct_term(next_word)
            if corrected_unit in unit_categories:
                category = unit_categories[corrected_unit]
                converted = float(word) * unit_conversions.get(corrected_unit, 1)
                result_words.append(f"{converted} {category}")
                i += 2
                continue

        # Default: stem and append
        result_words.append(ps.stem(correct_term(word)))
        i += 1

    return " ".join(result_words)

data["steps_string_standardize"] = data["steps_strings"].apply(standardize_units)

The following function is only here to compare what we had before, and what we have after the conversions

In [8]:
units = ["liter", "minute", "gram", "hours", "cup", "quart", "mL", "second", "week", 
         "pound", "ounce", "kg", "oz", "g", "°c", "°f", "fahrenheit", "degre", "celsius"]


# Function to find differences only on number + unit
def number_unit_diff(row):
    pattern = r'(\d+(?:\.\d+)?)\s*(' + '|'.join(units) + r')'

    matches1 = set(re.findall(pattern, row['steps_string_standardize']))
    matches2 = set(re.findall(pattern, row['steps_strings']))

    diff1 = matches1 - matches2
    diff2 = matches2 - matches1
    
    return {
        'only_in_standardize': list(diff1),
        'only_in_not_standardize': list(diff2)
    }

# Apply the function row-wise
diffs = data.apply(number_unit_diff, axis=1, result_type='expand')

# Concatenate
result = pd.concat([data, diffs], axis=1)

In [9]:
result[["id", "only_in_standardize", "only_in_not_standardize"]]

Unnamed: 0,id,only_in_standardize,only_in_not_standardize
0,137739,"[(40.0, minute), (176.67, celsius)]","[(350, degre), (40, minute)]"
1,31490,"[(17.5, minute), (5.0, minute), (103.33, celsi...","[(20, minute), (5, minute), (425, degre)]"
2,112140,"[(150.0, minute), (420.0, minute)]","[(3, hours), (8, hours)]"
3,59389,"[(120.0, minute), (15.0, minute), (176.67, cel...","[(15, minute), (2, hours)]"
4,44061,"[(30.0, minute)]","[(2, hours)]"
...,...,...,...
216681,486161,"[(5.0, minute), (30.0, minute)]","[(5, minute), (30, minute)]"
216682,493372,[],[]
216683,308080,[],[]
216684,298512,"[(95.56, celsius), (7.0, minute), (10.0, minute)]","[(10, minute), (8, minute)]"


Test case with 1/2 inch

In [10]:
print(data[data["id"] == 298512]["steps_strings"].iloc[0])
print(data[data["id"] == 298512]["steps_string_standardize"].iloc[0])

place melted butter in a large mixing bowl and add each remaining ingredient as listed , beating well after each addition to achieve a smooth dough let dough stand uncovered 10 minutes make golf-ball-sized shapes of the dough and insert a flat , wooden popsicle stick into one side of each ball carefully flatten each ball of dough to a 1 / 2 inch thick patty place 3 inches apart on greased baking sheets bake at 400f for 6-8 minutes or til just browned around the edges , but still white on tops of cookies remove from the oven and smear the top of each cookie generously with heated , canned , ready-to-spread frosting frostings may be tinted with few drops food coloring once it has been heated let cookies cool on baking sheet 10 minutes before carefully removing to paper towels
place melt butter in a larg mix bowl and add each remain ingredi as list , beat well after each addit to achiev a smooth dough let dough stand uncov 10.0 minute make golf-ball-s shape of the dough and insert a flat 

Test case with a mention of "350 degrees" but no unit

In [11]:
137739
print(data[data["id"] == 137739]["steps_strings"].iloc[0])
print(data[data["id"] == 137739]["steps_string_standardize"].iloc[0])

make a choice and proceed with recipe depending on size of squash , cut into half or fourths remove seeds for spicy squash , drizzle olive oil or melted butter over each cut squash piece season with mexican seasoning mix ii for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece season with sweet mexican spice mix bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin be careful not to burn the squash especially if you opt to use sugar or butter if you feel more comfortable , cover the squash with aluminum foil the first half hour , give or take , of baking if desired , season with salt
make a choic and proceed with recip depend on size of squash , cut into half or fourth remov seed for spici squash , drizzl oliv oil or melt butter over each cut squash piec season with mexican season mix ii for sweet squash , drizzl melt honey , butter , grate piloncillo over each cut squash piec sea

Test case with inch, degrees (^f)

In [12]:
print(data[data["id"] == 59389]["steps_strings"].iloc[0])
print(data[data["id"] == 59389]["steps_string_standardize"].iloc[0])

place potatoes in a large pot of lightly salted water and bring to a gentle boil cook until potatoes are just tender drain place potatoes in a large bowl and add all ingredients except the"alouette" mix well and transfer to a buttered 8x8 inch glass baking dish with 2 inch sides press the potatoes with a spatula to make top as flat as possible set aside for 2 hours at room temperature preheat oven to 350^f spread"alouette" evenly over potatoes and bake 15 minutes divide between plates garnish with finely diced red and yellow bell peppers
place potato in a larg pot of lightli salt water and bring to a gentl boil cook until potato are just tender drain place potato in a larg bowl and add all ingredi except the"alouette" mix well and transfer to a butter 20.32x20.32 cm inch glass bake dish with 5.08 cm side press the potato with a spatula to make top as flat as possibl set asid for 120.0 minute at room temperatur preheat oven to 176.67 celsius spread"alouette" evenli over potato and bake 

Test case with celsius and Fahrenheit

In [13]:
print(data[data["id"] == 417632]["steps_strings"].iloc[0])
print(data[data["id"] == 417632]["steps_string_standardize"].iloc[0])

preheat the oven to 200 celsius / 395 fahrenheit rinse the corn in cold water 2-3 times and drain puree corn , apple juice and 1 egg until smooth the puree will smell strongly of corn , but don't worry the cake won't taste like this in a blender , whisk egg whites until very stiff meanwhile , transfer the corn to a bowl and add oats add baking powder , vanilla and spices to the corn / oat mixture and stir very well the dough will have a thick oatmeal-like texture if you want , you can add 1 tsp of peanut butter for extra flavour cut the banana into coins and each coin into approximately 6 pieces chop the walnuts add banana and walnuts to the corn / oat mixture use a big flat spoon and scoop an amount equal of 4 tablespoons of the very stiff egg whites into the corn / oat mixture and stir until even add the corn / oat mixture to the remaining egg whites , but don't stir until even just mix lightly there will be puffs of egg white here and there add baking sheets to a 10x20 square baking

### Tokenization

In [14]:
data.columns

Index(['id', 'name', 'minutes', 'tags', 'nutrition', 'n_steps', 'steps',
       'description', 'ingredients', 'n_ingredients', 'steps_strings',
       'steps_string_standardize'],
      dtype='object')

In [15]:
""" seems to have no effect
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])
texts = data["steps_string_standardize"].astype(str).tolist()
num_cpus = os.cpu_count()
n_process = max(1, int(num_cpus * 0.7)) if num_cpus else 1
docs = list(nlp.pipe(texts, batch_size=50, n_process=n_process))
data["steps_tokens"] = [[token.text for token in doc] for doc in docs]
"""

' seems to have no effect\nnlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])\ntexts = data["steps_string_standardize"].astype(str).tolist()\nnum_cpus = os.cpu_count()\nn_process = max(1, int(num_cpus * 0.7)) if num_cpus else 1\ndocs = list(nlp.pipe(texts, batch_size=50, n_process=n_process))\ndata["steps_tokens"] = [[token.text for token in doc] for doc in docs]\n'

### Adding Additional features

We add here the following additional features :
- number of token
- average length per token
- We convert the list of strings for ingredients into a simple string
- We do the same for the tags

In [None]:
""" seems to have no effect
data["token_count"] = data["steps_tokens"].apply(len)
data["avg_token_length"] = data["steps_tokens"].apply(lambda tokens: np.mean([len(token) for token in tokens]) if tokens else 0)
data["ingredients_text"] = data["ingredients"].apply(lambda x: ' '.join(x))
data["ingredients_text"] = data["ingredients"].astype(str)
"""

data["tags"] = data["tags"].apply(
    lambda tags: [tag for tag in tags if not any(keyword in tag.lower() for keyword in ["minute", "time", "hours", "preparation"])]
)
data["tags_text"] = data["tags"].apply(lambda x: ' '.join(x))
data["tags_text"] = data["tags"].astype(str)

In [17]:
tags_with_minute = data["tags"].apply(lambda tags: [tag for tag in tags if "make" in tag.lower()])
tags_with_minute = tags_with_minute[tags_with_minute.apply(len) > 0]
tags_with_minute

0         [time-to-make]
1         [time-to-make]
2         [time-to-make]
3         [time-to-make]
4         [time-to-make]
               ...      
216681    [time-to-make]
216682    [time-to-make]
216683    [time-to-make]
216684    [time-to-make]
216685    [time-to-make]
Name: tags, Length: 214818, dtype: object

We convert the nutritional list into calores, total_fat, sugar, sodium, protein, saturated_fat and carbohydrates.

In [17]:
import ast

# Function to convert the nutrition string into separate columns
def expand_nutrition_column(data):
    data['nutrition'] = data['nutrition'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    if data['nutrition'].apply(lambda x: isinstance(x, list)).all():
        data[['calories', 'total_fat', 'sugar', 'sodium', 'protein', 'saturated_fat', 'carbohydrates']] = pd.DataFrame(data['nutrition'].to_list(), index=data.index)
    
        data.drop(columns=['nutrition'], inplace=True)
        
    return data
data = expand_nutrition_column(data)
data.drop(columns=['nutrition_score'], inplace=True, errors='ignore')

data.head()

Unnamed: 0,id,name,minutes,tags,n_steps,steps,description,ingredients,n_ingredients,steps_strings,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,137739,arriba baked squash mexican,55,"[60-minutes-or-less, time-to-make, course, mai...",11,"[make a choice and proceed with recipe, depend...",autumn is my favorite time of year to cook! th...,"[winter squash, mexican seasoning, mixed spice...",7,make a choice and proceed with recipe dependin...,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,breakfast pizza,30,"[30-minutes-or-less, time-to-make, course, mai...",9,"[preheat oven to 425 degrees f, press dough in...",this recipe calls for the crust to be prebaked...,"[prepared pizza crust, sausage patty, eggs, mi...",6,preheat oven to 425 degrees f press dough into...,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,chili,130,"[time-to-make, course, preparation, main-dish,...",6,"[brown ground beef in large pot, add chopped o...",this modified version of 'mom's' chili was a h...,"[ground beef, yellow onions, diced tomatoes, t...",13,brown ground beef in large pot add chopped oni...,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potato,45,"[60-minutes-or-less, time-to-make, course, mai...",11,[place potatoes in a large pot of lightly salt...,"this is a super easy, great tasting, make ahea...","[spreadable cheese with garlic and herbs, new ...",11,place potatoes in a large pot of lightly salte...,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup canning,190,"[weeknight, time-to-make, course, main-ingredi...",5,"[mix all ingredients& boil for 2 1 / 2 hours ,...",my dh's amish mother raised him on this recipe...,"[tomato juice, apple cider vinegar, sugar, sal...",8,"mix all ingredients& boil for 2 1 / 2 hours , ...","mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [18]:
data.drop(columns=['ingredients', 'steps', 'steps_strings', 'tags'], inplace=True, errors='ignore')

In [19]:
data.head()

Unnamed: 0,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,chili,130,6,this modified version of 'mom's' chili was a h...,13,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potato,45,11,"this is a super easy, great tasting, make ahea...",11,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [20]:
data = data[data['steps_string_standardize'].apply(lambda x: isinstance(x, str) and len(x.strip()) > 0)]

In [21]:
data.reset_index(inplace=True)

### Saving for reuse

In [22]:
data.to_csv('../data/preprocessed_recipe.csv', index=False)