In [1]:
import pandas as pd
import numpy as np
from typing import Self, List, Tuple, Any
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import re
import nltk
from nltk.stem import WordNetLemmatizer
import ast
from scipy.sparse import hstack
import os

# Fetch the NLTK resources used later in the notebook: the English stopword
# list and the WordNet data that WordNetLemmatizer relies on.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from nltk.corpus import stopwords
import nltk

# NOTE(review): 'stopwords' is already downloaded in the first cell, so this
# repeat call looks redundant — kept as-is.
nltk.download('stopwords')
# Hand-curated extra stopwords for recipe names (people's names, hype words,
# filler, single letters, ...). They are merged into STOPWORDS further down.
#
# Fix: a comma was missing after "beautiful", so Python's implicit string
# concatenation turned "beautiful" "l" into the single entry "beautifull" and
# "beautiful" itself was never filtered.
#
# NOTE(review): entries that contain spaces (" s ", "2 ww points", "on demand",
# "non fruitcake") can never match, because clean_recipe_names compares single
# whitespace-split tokens against the set. They are kept unchanged here.
additional = {
    "minutes", "easiest", "ever", "aww", "i", "can", "t", "believe", "it", "s", "stole", "the", "idea", "from","mirj", "andrea", " s ", "andreas",
    "viestad", "andes", "andersen", "an", "ana", "amy", "2 ww points", "on demand", "anelia", "amazing",
    "ashley", "ashton", "amazing", "make", "house", "smell", "malcolm", "amazingly", "killer", "perfect",
    "addictive", "leave", "u", "licking", "ur", "finger", "clean", "th", "recipe", "special", "time", "favorite",
    "aunt", "jane", "soft", "and", "moist", "licking", "famous", "non fruitcake", "true", "later",
    "nonbeliever", "believer", "comfort", "ultimate", "lover", "love", "easy", "ugly", "cc", "uncle", "bill", "tyler",
    "unbelievably", "unbelievable", "healthy", "fat", "free", "un", "melt", "mouth", "ummmmm", "umm", "ummmy", "nummy", "ummmm", "unattended",
    "unbaked", "ultra", "ultimately", "yummy", "rich", "quick", "rachael", "ray", "fail", "party", "florence",
    "fast", "light", "low", "carb", "snack", "wedding", "anniversary", "anne", "marie", "annemarie", "annette", "funicello", "syms",
    "byrn", "mike", "willan", "summer", "autumn", "winter", "spring", "burrel", "anna", "tres", "sweet", "uber",
    "homemade", "ann","best","j", "anite", "anitas", "anman", "angie", "angry", "simple", "difficult", "andy", "andrew", "ancient", "still", "another", "best", "go",
    "grant", "grandma", "amusement", "park", "instruction", "kitchen", "test", "ww", "almost", "empty", "dressing", "instant", "like", "le", "virtually",
    "home", "made", "guilt", "guilty", "delicious", "parfait", "forgotten", "forget", "forevermama", "diet", "can", "real", "former",
    "miss", "fabulous", "forever", "authentic", "fortnum", "mason", "kid", "foolproof", "football", "season", "diabetic",
    "two", "small", "one", "three", "four", "five", "thanksgiving", "dream", "foothill", "paula", "deen", "food", "processor", "safari", "processor",
    "traditional", "forbidden", "flavorful", "grandmag", "grandmama", "grandmaman", "grandma", "grandmom", "lena", "alicia", "alisa", "alice", "ali", "bit", "different",
    "eat", "family", "global", "gourmet", "yam", "yam", "emotional", "balance", "tonight", "feel", "cooking", "got", "birthday", "air", "way", "mr", "never", "weep", "half",
    "anything", "pour", "put", "fork", "say", "stove", "top", "thought", "prize", "winning", "add", "ad", "good", "better", "da", "style", "even", "bran", "fake", "fire", "beautiful",
    "l", "game", "day", "hate", "world", "minute", "type", "starbucks", "biggest", "dressed", "summertime", "elmer", "johnny", "depp", "c", "p", "h", "clove", "er", "star", "week",
    "affair", "elegant", "student", "z", "whole", "lotta", "w", "z", "b", "aaron", "craze", "a", "abc", "absolute", "absolut", "absolutely", "perfection", "delightful", "lazy", "morning",
    "abuelo", "abuelito", "abuelita", "abuela", "acadia", "accidental", "adam", "little", "interest", "addicting", "addie", "adele", "adelaide", "adi", "adie", "adriana",
    "adult", "affordable", "alison", "holst", "purpose", "allegheny", "allegedly", "original", "allergic", "ex", "allergy", "allergen", "allen", "poorman", "backyard",
    "alton", "brown", "whatever", "anthony", "anytime", "april", "fool", "ya", "fooled", "sandra", "lee", "edna", "emma", "emy", "evy", "eva", 'evelyn', "fannie", "fanny", "flo", "gladys", "helen", "grace", "ira", "irma",
    "isse", "jean", "janet", "jenny", "juju", "judy", "kathy", "kathi", "kellie", "kelly", "laura", "lee", "kay", "kathleen", "laura", "lee", "lesley", "lil", "linda", "liz", "lois", "louisse",
    "mag", 'martguerite', "margie", "marge", "maggie", "martha", "marylin", "marion", "mary", "marthy", "melody", "michel", "meda", "millie", "muriel", "myrna", "nelda", "nancy", "paulie", "phillis", "rae", "rebecca",
    "rose", "sadie", "sarah", "sara", "sue", "susan", "teresa", "theresa", "auntie", "em", "barbara", "barb", "irene", "lolo", "lori", "lu", "maebelle",
    "aunty", "aussie", "aurora", "austin", "l", "q",
}
# Final stopword set: NLTK's English stopwords united with the custom
# `additional` entries defined above.
STOPWORDS = set(stopwords.words('english')) | additional

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Baseline Pipeline (TFIDF)

In [20]:
def clean_recipe_names(recipes) -> List:
    """Normalize recipe names for use as text features.

    Each name is lowercased, stripped of every character that is not a
    lowercase ASCII letter or whitespace, split on whitespace, lemmatized
    with WordNet, and filtered against the module-level ``STOPWORDS`` set.

    Args:
        recipes: Iterable of recipe-name strings.

    Returns:
        A list with one cleaned, space-joined string per input name, in
        input order.
    """
    wnl = WordNetLemmatizer()

    def _clean(name):
        letters_only = re.sub(r'[^a-z\s]', '', name.lower())
        # Lemmatize first, then filter, so stopwords are matched against lemmas.
        lemmas = (wnl.lemmatize(token) for token in letters_only.split())
        return " ".join(lemma for lemma in lemmas if lemma not in STOPWORDS)

    return [_clean(name) for name in recipes]

def baseline_pipeline(data_path="../../data/RAW_recipes.csv", max_features=1000) -> Tuple[Any, pd.Series]:
    """Build the baseline feature matrix: TF-IDF text features plus two counts.

    The text for each recipe is the cleaned name, the lowercased description,
    the ingredients and the steps, joined with spaces. The numeric features
    are ``n_steps`` and ``n_ingredients``.

    Args:
        data_path: Path to the raw recipes CSV. Defaults to the location that
            was previously hard-coded.
        max_features: Vocabulary size passed to ``TfidfVectorizer``.

    Returns:
        ``(X_final, y)`` where ``X_final`` is the sparse matrix produced by
        ``scipy.sparse.hstack`` (TF-IDF columns followed by the two numeric
        columns) and ``y`` is the ``minutes`` column.
    """
    data = pd.read_csv(data_path)
    data.set_index('id', inplace=True)
    # List-like columns are stored in the CSV as Python-literal strings.
    for col in ["tags", "steps", "ingredients", "nutrition"]:
        data[col] = data[col].apply(ast.literal_eval)

    data.drop(columns=["contributor_id", "submitted"], inplace=True, errors="ignore")
    # clean_recipe_names calls .lower() on each name, so rows without one must go.
    data.dropna(subset=["name"], inplace=True)

    data['cleaned_name'] = clean_recipe_names(data['name'])
    # Non-string descriptions (e.g. NaN) become "", so no fillna is needed later.
    data["description"] = data["description"].apply(lambda x: x.lower() if isinstance(x, str) else "")

    for col in ["tags", "steps", "ingredients"]:
        data[col] = data[col].apply(lambda x: [s.lower() for s in x if isinstance(s, str)] if isinstance(x, list) else [])

    # Defensive: the coercion above always yields a list, so this should drop nothing.
    data.dropna(subset=["tags", "steps", "ingredients"], inplace=True, how="any")
    data.reset_index(inplace=True)

    data['combined_text'] = (
        data['cleaned_name'] + ' ' +
        data['description'] + ' ' +
        data['ingredients'].apply(lambda x: " ".join(x)) + ' ' +
        data['steps'].apply(lambda x: " ".join(x))
    )

    # TF-IDF vectorizer
    # NOTE(review): the vectorizer is fitted on every row before any
    # train/test split happens, so vocabulary and IDF weights see test data.
    print("Applying TF-IDF")
    tfidf = TfidfVectorizer(max_features=max_features)
    X_text = tfidf.fit_transform(data['combined_text'])

    # Basic numerical features
    X_numeric = data[["n_steps", "n_ingredients"]].fillna(0)

    X_final = hstack([X_text, X_numeric])

    y = data["minutes"]

    return X_final, y

### Simple Pipeline (Bag of Words)

In [3]:
def clean_recipe_names(recipes) -> List:
    """Lowercase, strip non-letters, lemmatize and de-stopword recipe names.

    Note: this redefines the ``clean_recipe_names`` from the baseline cell;
    whichever cell ran last is the definition in effect.

    Args:
        recipes: Iterable of recipe-name strings.

    Returns:
        List of cleaned names (tokens joined by single spaces), one per input.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    cleaned = []
    for raw_name in recipes:
        text = re.sub(r'[^a-z\s]', '', raw_name.lower())
        kept = []
        # split() already yields whitespace-free tokens, so no extra strip is needed.
        for token in text.split():
            lemma = lemmatize(token)
            if lemma not in STOPWORDS:
                kept.append(lemma)
        cleaned.append(" ".join(kept))
    return cleaned
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack

def simple_pipeline(data_path="../../data/RAW_recipes.csv", n_samples=1000, random_state=42, max_features=1000) -> Tuple[Any, pd.Series]:
    """Build a small bag-of-words feature matrix from a sample of recipes.

    Features are ``CountVectorizer`` counts over the space-joined ingredient
    list, followed by ``n_steps`` and ``n_ingredients``.

    Args:
        data_path: Path to the raw recipes CSV. Defaults to the location that
            was previously hard-coded.
        n_samples: Number of rows to sample. It is capped at the number of
            rows available instead of raising. Pass ``None`` to use every row.
        random_state: Seed for the row sample, so repeated runs return the
            same rows. Pass ``None`` for the old unseeded behavior.
        max_features: Vocabulary size passed to ``CountVectorizer``.

    Returns:
        ``(X_final, y)`` where ``X_final`` is the sparse matrix produced by
        ``scipy.sparse.hstack`` and ``y`` is the ``minutes`` column.
    """
    data = pd.read_csv(data_path)
    if n_samples is not None:
        # Seeded and size-capped: the unseeded .sample(1000) was not
        # reproducible and failed on files with fewer than 1000 rows.
        data = data.sample(min(n_samples, len(data)), random_state=random_state)
    data.set_index('id', inplace=True)
    # List-like columns are stored in the CSV as Python-literal strings.
    for col in ["tags", "steps", "ingredients", "nutrition"]:
        data[col] = data[col].apply(ast.literal_eval)

    data.drop(columns=["contributor_id", "submitted"], inplace=True, errors="ignore")
    data.dropna(subset=["name"], inplace=True)

    # NOTE(review): the cleaned name is computed but not used in the features below.
    data['name'] = clean_recipe_names(data['name'])
    for col in ["tags", "steps", "ingredients"]:
        data[col] = data[col].apply(lambda x: [s.lower() for s in x if isinstance(s, str)] if isinstance(x, list) else [])

    # Defensive: the coercion above always yields a list, so this should drop nothing.
    data.dropna(subset=["tags", "steps", "ingredients"], inplace=True, how="any")
    data.reset_index(inplace=True)

    # Join ingredients into one string per recipe
    data['ingredients'] = data['ingredients'].apply(lambda x: " ".join(x))

    print("Applying Bag of Words (CountVectorizer)")
    bow = CountVectorizer(max_features=max_features)
    X_text = bow.fit_transform(data['ingredients'])

    X_numeric = data[["n_steps", "n_ingredients"]].fillna(0)

    X_final = hstack([X_text, X_numeric])
    y = data["minutes"]

    return X_final, y


In [4]:
# Build the bag-of-words feature matrix and the `minutes` target from a
# sample of the recipes.
X, y = simple_pipeline()

Applying Bag of Words (CountVectorizer)


<COOrdinate sparse matrix of dtype 'int64'
	with 17245 stored elements and shape (1000, 1002)>

### Analyze some edge-case recipes

In [29]:
# Reload the full, unprocessed recipe table for manual inspection.
data = pd.read_csv("../../data/RAW_recipes.csv")

In [31]:
# Index the table by recipe id. The call is guarded and non-mutating so the
# cell can be re-run without a KeyError once 'id' is already the index.
if data.index.name != 'id':
    data = data.set_index('id')

In [37]:
# Inspect the row at positional index 11; its recorded `minutes` value is 1460.
data.iloc[11]

name                                  better than sex  strawberries
minutes                                                        1460
contributor_id                                                41531
submitted                                                2002-10-03
tags              ['weeknight', 'time-to-make', 'course', 'main-...
nutrition             [734.1, 66.0, 199.0, 10.0, 10.0, 117.0, 28.0]
n_steps                                                           8
steps             ['crush vanilla wafers into fine crumbs and li...
description       simple but sexy. this was in my local newspape...
ingredients       ['vanilla wafers', 'butter', 'powdered sugar',...
n_ingredients                                                     7
Name: 42198, dtype: object

In [36]:
# List the targets above 250 minutes.
# NOTE(review): the recorded output has 12048 rows, so `y` here cannot be the
# 1000-row sample returned by simple_pipeline() — presumably it came from a
# full-data run; confirm which cell defined it.
y[(y > 250)]

11        1460
12        2970
13         525
15         500
27         495
          ... 
231231     255
231354    2900
231548     540
231572    2895
231602     290
Name: minutes, Length: 12048, dtype: int64