# Classification Preprocessing

## ⬇️ Imports

In [None]:
import pandas as pd
import ast
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.ensemble import RandomForestClassifier
import spacy
import os

## 🛠️ Preprocessing
### Loading the dataset

In [2]:
# Load the raw recipes table and index it by recipe id.
data = pd.read_csv('../../data/RAW_recipes.csv').set_index('id')

# These columns are stored in the CSV as stringified Python lists;
# parse them back into real list objects.
columns = ["tags", "steps", "ingredients", "nutrition"]
for column in columns:
    data[column] = data[column].apply(ast.literal_eval)

# Drop bookkeeping columns (ignored if already absent) and rows with no name.
data = data.drop(columns=["contributor_id", "submitted"], errors="ignore")
data = data.dropna(subset=["name"])

# Keep only recipes under 300 minutes. Take an explicit copy because later
# cells assign new columns on this frame; without it pandas treats the
# filtered result as a view-like slice of the unfiltered frame.
data = data[data['minutes'] < 300].copy()

### Stop Words Removal & Tokenization on recipe Name

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the lemmatizer and stopwords list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Extra words to strip from recipe names, on top of NLTK's English stop words.
# Entries are compared against single lemmatized tokens in clean_recipe_names,
# so the multi-word entries below (" s ", "2 ww points", "on demand",
# "non fruitcake") can never match a token; they are kept as-is.
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# silently concatenated by Python ("beautiful" "l" -> "beautifull"), which is
# what previously kept "beautiful" out of the set.
additional = {
    "minutes", "easiest", "ever", "aww", "i", "can", "t", "believe", "it", "s", "stole", "the", "idea", "from","mirj", "andrea", " s ", "andreas",
    "viestad", "andes", "andersen", "an", "ana", "amy", "2 ww points", "on demand", "anelia", "amazing",
    "ashley", "ashton", "amazing", "make", "house", "smell", "malcolm", "amazingly", "killer", "perfect",
    "addictive", "leave", "u", "licking", "ur", "finger", "clean", "th", "recipe", "special", "time", "favorite",
    "aunt", "jane", "soft", "and", "moist", "licking", "famous", "non fruitcake", "true", "later",
    "nonbeliever", "believer", "comfort", "ultimate", "lover", "love", "easy", "ugly", "cc", "uncle", "bill", "tyler",
    "unbelievably", "unbelievable", "healthy", "fat", "free", "un", "melt", "mouth", "ummmmm", "umm", "ummmy", "nummy", "ummmm", "unattended",
    "unbaked", "ultra", "ultimately", "yummy", "rich", "quick", "rachael", "ray", "fail", "party", "florence",
    "fast", "light", "low", "carb", "snack", "wedding", "anniversary", "anne", "marie", "annemarie", "annette", "funicello", "syms",
    "byrn", "mike", "willan", "summer", "autumn", "winter", "spring", "burrel", "anna", "tres", "sweet", "uber",
    "homemade", "ann","best","j", "anite", "anitas", "anman", "angie", "angry", "simple", "difficult", "andy", "andrew", "ancient", "still", "another", "best", "go",
    "grant", "grandma", "amusement", "park", "instruction", "kitchen", "test", "ww", "almost", "empty", "dressing", "instant", "like", "le", "virtually",
    "home", "made", "guilt", "guilty", "delicious", "parfait", "forgotten", "forget", "forevermama", "diet", "can", "real", "former",
    "miss", "fabulous", "forever", "authentic", "fortnum", "mason", "kid", "foolproof", "football", "season", "diabetic",
    "two", "small", "one", "three", "four", "five", "thanksgiving", "dream", "foothill", "paula", "deen", "food", "processor", "safari", "processor",
    "traditional", "forbidden", "flavorful", "grandmag", "grandmama", "grandmaman", "grandma", "grandmom", "lena", "alicia", "alisa", "alice", "ali", "bit", "different",
    "eat", "family", "global", "gourmet", "yam", "yam", "emotional", "balance", "tonight", "feel", "cooking", "got", "birthday", "air", "way", "mr", "never", "weep", "half",
    "anything", "pour", "put", "fork", "say", "stove", "top", "thought", "prize", "winning", "add", "ad", "good", "better", "da", "style", "even", "bran", "fake", "fire", "beautiful",
    "l", "game", "day", "hate", "world", "minute", "type", "starbucks", "biggest", "dressed", "summertime", "elmer", "johnny", "depp", "c", "p", "h", "clove", "er", "star", "week",
    "affair", "elegant", "student", "z", "whole", "lotta", "w", "z", "b", "aaron", "craze", "a", "abc", "absolute", "absolut", "absolutely", "perfection", "delightful", "lazy", "morning",
    "abuelo", "abuelito", "abuelita", "abuela", "acadia", "accidental", "adam", "little", "interest", "addicting", "addie", "adele", "adelaide", "adi", "adie", "adriana",
    "adult", "affordable", "alison", "holst", "purpose", "allegheny", "allegedly", "original", "allergic", "ex", "allergy", "allergen", "allen", "poorman", "backyard",
    "alton", "brown", "whatever", "anthony", "anytime", "april", "fool", "ya", "fooled", "sandra", "lee", "edna", "emma", "emy", "evy", "eva", 'evelyn', "fannie", "fanny", "flo", "gladys", "helen", "grace", "ira", "irma",
    "isse", "jean", "janet", "jenny", "juju", "judy", "kathy", "kathi", "kellie", "kelly", "laura", "lee", "kay", "kathleen", "laura", "lee", "lesley", "lil", "linda", "liz", "lois", "louisse",
    "mag", 'martguerite', "margie", "marge", "maggie", "martha", "marylin", "marion", "mary", "marthy", "melody", "michel", "meda", "millie", "muriel", "myrna", "nelda", "nancy", "paulie", "phillis", "rae", "rebecca",
    "rose", "sadie", "sarah", "sara", "sue", "susan", "teresa", "theresa", "auntie", "em", "barbara", "barb", "irene", "lolo", "lori", "lu", "maebelle",
    "aunty", "aussie", "aurora", "austin", "l", "q"
    
    }
stop_words.update(additional) 

# Function to clean recipe names
def clean_recipe_names(recipes):
    """Normalize an iterable of recipe-name strings.

    Each name is lowercased, stripped of every character that is not
    ``a``-``z`` or whitespace, split on whitespace, lemmatized token by token
    with the module-level ``lemmatizer``, filtered against the module-level
    ``stop_words`` set (checked *after* lemmatization), and re-joined with
    single spaces.

    Returns a list of cleaned strings, one per input name, in input order.
    A name whose tokens are all removed becomes the empty string.
    """

    def _normalize(raw_name):
        letters_only = re.sub(r'[^a-z\s]', '', raw_name.lower())
        # Lemmatize before the stop-word check so plural/inflected forms are
        # compared in their base form.
        lemmas = (lemmatizer.lemmatize(token) for token in letters_only.split())
        return " ".join(lemma for lemma in lemmas if lemma not in stop_words)

    return [_normalize(recipe) for recipe in recipes]


# Replace the raw names with their cleaned form.
data['name'] = clean_recipe_names(data['name'])

# Discard rows missing a name or description, then move the recipe id out of
# the index into a regular `id` column (the index becomes 0..n-1).
data = data.dropna(subset=['name', 'description']).reset_index()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/maxboc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/maxboc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
# Peek at the first few recipe descriptions.
data["description"].head()

0    autumn is my favorite time of year to cook! th...
1    this recipe calls for the crust to be prebaked...
2    this modified version of 'mom's' chili was a h...
3    this is a super easy, great tasting, make ahea...
4    my dh's amish mother raised him on this recipe...
Name: description, dtype: object

### Cleaning description, tags, steps and ingredients features

In [4]:
# Lowercase the free-text description.
data["description"] = data["description"].apply(str.lower)

# Lowercase every string inside the list-valued columns.
columns = ["tags", "steps", "ingredients"]
for list_column in columns:
    data[list_column] = data[list_column].apply(
        lambda items: list(map(str.lower, items))
    )

# Flatten each list column into one space-separated string per recipe.
space_join = ' '.join
data["steps_strings"] = data["steps"].apply(space_join)
data["ingredients_text"] = data["ingredients"].apply(space_join)
data["tags_text"] = data["tags"].apply(space_join)

In [5]:
# Lookup table from a lowercase recipe tag to the list of cuisine class
# labels it implies. Most tags map to a single regional label; some map to
# two labels (for example a national cuisine plus its region, as with
# "mexican"). map_tags_to_cuisines matches the keys both against a recipe's
# tag list and as plain substrings of the cleaned recipe name.
TAG2CLASS = {
    # North America – United States
    "american": ["North America – United States"],
    "north-american": ["North America – United States"],
    "northeastern-united-states": ["North America – United States"],
    "californian": ["North America – United States"],
    "native-american": ["North America – United States"],
    "pennsylvania-dutch": ["North America – United States"],
    "hawaiian": ["North America – United States"],

    # North America – Canada
    "canadian": ["North America – Canada"],
    "british-columbian": ["North America – Canada"],
    "quebec": ["North America – Canada"],

    # Central America & Caribbean
    "mexican": ["Mexican", "Central America & Caribbean"],
    "costa-rican": ["Central America & Caribbean"],
    "guatemalan": ["Central America & Caribbean"],
    "caribbean": ["Central America & Caribbean"],
    "cuban": ["Central America & Caribbean"],
    "puerto-rican": ["Central America & Caribbean"],
    "creole": ["Central America & Caribbean"],

    # South America
    "argentine": ["South America"],
    "brazilian": ["South America"],
    "peruvian": ["South America"],
    "chilean": ["South America"],
    "colombian": ["South America"],
    "venezuelan": ["South America"],
    "ecuadorean": ["South America"],

    # Europe – Western
    "french": ["French", "Europe – Western"],
    "english": ["Europe – Western"],
    "scottish": ["Europe – Western"],
    "irish": ["Europe – Western"],
    "welsh": ["Europe – Western"],
    "dutch": ["Europe – Western"],
    "belgian": ["Europe – Western"],
    "austrian": ["Europe – Western"],
    "german": ["Europe – Western"],
    "italian": ["Italian", "Europe – Western"],
    "portuguese": ["Europe – Western"],
    "spanish": ["Spanish", "Europe – Western"],
    "greek": ["Greek", "Europe – Western"],

    # Europe – Northern
    "swedish": ["Europe – Northern"],
    "norwegian": ["Europe – Northern"],
    "finnish": ["Europe – Northern"],
    "icelandic": ["Europe – Northern"],
    "danish": ["Europe – Northern"],

    # Europe – Eastern/Central
    "russian": ["Europe – Eastern"],
    "hungarian": ["Europe – Eastern"],
    "czech": ["Europe – Eastern"],
    "georgian": ["Europe – Eastern"],

    # Middle East & North Africa (MENA)
    "middle-eastern": ["Middle East & North Africa"],
    "turkish": ["Middle East & North Africa"],
    "lebanese": ["Middle East & North Africa"],
    "iranian-persian": ["Middle East & North Africa"],
    "iraqi": ["Middle East & North Africa"],
    "palestinian": ["Middle East & North Africa"],
    "saudi-arabian": ["Middle East & North Africa"],
    "egyptian": ["Middle East & North Africa"],
    "moroccan": ["Middle East & North Africa"],
    "libyan": ["Middle East & North Africa"],
    "algerian": ["Middle East & North Africa"],
    "tunisian": ["Middle East & North Africa"],

    # Asia – East
    "chinese": ["Chinese", "Asia – East"],
    "beijing": ["Chinese", "Asia – East"],
    "chinese-new-year": ["Chinese", "Asia – East"],
    "japanese": ["Japanese", "Asia – East"],
    "korean": ["Korean", "Asia – East"],
    "mongolian": ["Asia – East"],

    # Asia – Southeast
    "vietnamese": ["Asia – Southeast"],
    "indonesian": ["Asia – Southeast"],
    "malaysian": ["Asia – Southeast"],
    "cambodian": ["Asia – Southeast"],
    "laotian": ["Asia – Southeast"],
    "thai": ["Thai", "Asia – Southeast"],
    "polynesian": ["Asia – Southeast"],

    # Asia – South
    "pakistani": ["Asia – South"],
    "nepalese": ["Asia – South"],
    "indian": ["Indian", "Asia – South"],

    # Jewish Diaspora
    "jewish-ashkenazi": ["Jewish Diaspora", "Middle East & North Africa"],
    "jewish-sephardi": ["Jewish Diaspora", "Middle East & North Africa"],

    # Catch-all / generic
    "asian": ["Asia – General"],
    "european": ["Europe – General"],
}
# Tags in descending priority: a tag's position in this list is its rank, and
# the cuisine backed by the lowest-ranked (earliest) tag becomes a recipe's
# primary cuisine. Tags missing from this list rank after everything in it,
# including the generic catch-alls at the very end. Every specific tag that
# can produce a label must therefore be listed, or the generic "asian" /
# "european" label would win over it. "spanish", "greek", "algerian",
# "tunisian", "thai" and "indian" were previously missing and have been added
# to their regional groups; the relative order of the existing tags is
# unchanged.
PRIORITIZED_TAGS = [
    # Europe – Western
    "french", "english", "scottish", "irish", "welsh",
    "dutch", "belgian", "austrian", "german", "italian",
    # North America – United States
    "american", "north-american", "northeastern-united-states",
    "californian", "native-american", "pennsylvania-dutch",
    "hawaiian",
    # North America – Canada
    "canadian", "british-columbian", "quebec",
    # Central America & Caribbean
    "mexican", "costa-rican", "guatemalan",
    "caribbean", "cuban", "puerto-rican", "creole",
    # South America
    "argentine", "brazilian", "peruvian",
    "chilean", "colombian", "venezuelan", "ecuadorean",
    # Europe – Northern
    "swedish", "norwegian", "finnish", "icelandic", "danish",
    # Europe – Eastern/Central
    "russian", "hungarian", "czech", "georgian",
    # Europe – Southern
    "portuguese", "spanish", "greek",
    # Middle East & North Africa (MENA)
    "middle-eastern", "turkish", "lebanese",
    "iranian-persian", "iraqi", "palestinian",
    "saudi-arabian", "egyptian", "moroccan", "libyan",
    "algerian", "tunisian",
    # Sub-Saharan Africa
    "south-african", "ethiopian", "nigerian",
    "angolan", "sudanese",
    # Asia – East
    "chinese", "beijing", "chinese-new-year",
    "japanese", "korean", "mongolian",
    # Asia – Southeast
    "vietnamese", "indonesian", "malaysian",
    "cambodian", "laotian", "thai",
    "polynesian",
    # Asia – South
    "pakistani", "nepalese", "indian",
    # Jewish Diaspora
    "jewish-ashkenazi", "jewish-sephardi",
    # Finally, the generic catch-alls
    "asian", "european",
]

# Position of each tag in PRIORITIZED_TAGS (lower value = higher priority).
_tag_rank = {tag: idx for idx, tag in enumerate(PRIORITIZED_TAGS)}

def map_tags_to_cuisines(tags, cleaned_name):
    """Return the cuisine labels for one recipe, highest priority first.

    A TAG2CLASS key counts as matched when it appears in ``tags`` or when it
    occurs as a substring of ``cleaned_name`` (case-insensitive).
    NOTE(review): the name check is a plain substring test, not a whole-word
    match, so a key can match inside a longer word.

    Parameters
    ----------
    tags : iterable of str
        The recipe's tags.
    cleaned_name : str
        The recipe name.

    Returns
    -------
    list of str
        De-duplicated cuisine labels, ordered by the best (lowest)
        ``_tag_rank`` among the matched tags that produce each label. Tags
        without a rank sort last. Ties are broken by TAG2CLASS definition
        order and then by label order within a tag's list, so the result is
        deterministic. Returns ``[]`` when nothing matches.
    """
    tag_lookup = set(tags)
    name_lower = cleaned_name.lower()

    # Walk TAG2CLASS in definition order rather than collecting matches in a
    # set: iteration order of a set of strings varies between interpreter
    # runs, which previously made the order of tied labels (and therefore the
    # first/primary label) irreproducible.
    known = [
        tag for tag in TAG2CLASS
        if tag in tag_lookup or tag.lower() in name_lower
    ]

    if not known:
        return []

    # Stable sort: best-ranked tags first, definition order preserved on ties.
    known.sort(key=lambda tag: _tag_rank.get(tag, float('inf')))

    # Because tags are visited best-rank-first, each label is first seen at
    # its best rank, so first-seen order is already priority order.
    sorted_cuisines = []
    for tag in known:
        for cuisine in TAG2CLASS[tag]:
            if cuisine not in sorted_cuisines:
                sorted_cuisines.append(cuisine)

    return sorted_cuisines


# Compute the ordered cuisine labels for every recipe from its tags and name.
data['cuisines'] = [
    map_tags_to_cuisines(recipe_tags, recipe_name)
    for recipe_tags, recipe_name in zip(data['tags'], data['name'])
]

# Keep only recipes that received at least one cuisine label.
has_cuisine = data['cuisines'].map(len) > 0
data = data[has_cuisine]

### Filtering rare cuisines

In [6]:
# Minimum number of recipes a cuisine combination needs in order to be kept.
min_samples = 50

# `cuisines` holds Python lists, which are unhashable and therefore cannot be
# counted by value_counts() or matched by isin(). Build a hashable string key
# per combination instead ("|" does not occur in any cuisine label).
cuisine_key = data['cuisines'].apply(' | '.join)

cuisine_counts = cuisine_key.value_counts()
valid_cuisines = cuisine_counts[cuisine_counts >= min_samples].index.tolist()

# Keep recipes whose exact cuisine combination is frequent enough; copy so
# later cells can add columns to an independent frame.
data = data[cuisine_key.isin(valid_cuisines)].copy()

### Expand nutrition features

In [7]:
import ast  
def expand_nutrition_column(data):
    """Split the list-valued ``nutrition`` column into one column per nutrient.

    String entries in ``nutrition`` are first parsed with ``ast.literal_eval``;
    other values are left untouched. If every entry is then a list, the lists
    are spread over the columns ``calories``, ``total_fat``, ``sugar``,
    ``sodium``, ``protein``, ``saturated_fat`` and ``carbohydrates`` (in list
    order) and ``nutrition`` is dropped. Otherwise only the parsing step is
    applied and ``nutrition`` is kept.

    The frame is modified in place and also returned.
    """
    nutrient_columns = [
        'calories', 'total_fat', 'sugar', 'sodium',
        'protein', 'saturated_fat', 'carbohydrates',
    ]

    def _parse(entry):
        if isinstance(entry, str):
            return ast.literal_eval(entry)
        return entry

    data['nutrition'] = data['nutrition'].apply(_parse)

    every_entry_is_list = data['nutrition'].apply(
        lambda entry: isinstance(entry, list)
    ).all()
    if not every_entry_is_list:
        return data

    expanded = pd.DataFrame(data['nutrition'].to_list(), index=data.index)
    data[nutrient_columns] = expanded
    data.drop(columns=['nutrition'], inplace=True)
    return data

data = expand_nutrition_column(data)

# Show the frame as the cell's last expression so the notebook renders it as
# a table; print() falls back to plain text that wraps and truncates columns.
data.head()


       id                          name  minutes  \
0  137739   arriba baked squash mexican       55   
1   31490               breakfast pizza       30   
4   44061  amish tomato ketchup canning      190   
5   25274               marinated olive       15   
6   67888                 barbecued rib      120   

                                                tags  n_steps  \
0  [60-minutes-or-less, time-to-make, course, mai...       11   
1  [30-minutes-or-less, time-to-make, course, mai...        9   
4  [weeknight, time-to-make, course, main-ingredi...        5   
5  [15-minutes-or-less, time-to-make, course, mai...        4   
6  [weeknight, time-to-make, course, main-ingredi...       10   

                                               steps  \
0  [make a choice and proceed with recipe, depend...   
1  [preheat oven to 425 degrees f, press dough in...   
4  [mix all ingredients& boil for 2 1 / 2 hours ,...   
5  [toast the fennel seeds and lightly crush them...   
6  [in a medium 

### TF-IDF on steps & ingredients

In [8]:
# Both text fields use the same TF-IDF configuration: vocabulary capped at
# 500 terms, scikit-learn's built-in English stop-word list removed.
tfidf_settings = {"max_features": 500, "stop_words": 'english'}

# One independently fitted vectorizer per text field.
steps_vectorizer = TfidfVectorizer(**tfidf_settings)
steps_features = steps_vectorizer.fit_transform(data["steps_strings"])

ingredients_vectorizer = TfidfVectorizer(**tfidf_settings)
ingredients_features = ingredients_vectorizer.fit_transform(data["ingredients_text"])

# Plain numeric recipe descriptors, taken as-is.
numerical_features = data[['n_steps', 'n_ingredients']].values

# Densify the sparse TF-IDF matrices so they can be stacked with the
# numeric block.
feature_matrices = [
    sparse_block.toarray()
    for sparse_block in (steps_features, ingredients_features)
]
feature_matrices.append(numerical_features)

### Combining Features

In [1]:
# Place the feature blocks side by side: one row per recipe.
X = np.hstack(feature_matrices)

# The first entry of each ordered cuisine list is the recipe's target class
# (None if the list is empty).
data['primary_cuisine'] = [
    cuisines[0] if len(cuisines) > 0 else None
    for cuisines in data['cuisines']
]

# Encode the class names as integer labels.
label_encoder = LabelEncoder()
label_encoder.fit(data['primary_cuisine'])
y = label_encoder.transform(data['primary_cuisine'])

NameError: name 'np' is not defined

In [None]:
# X is a NumPy array (the result of np.hstack), which has no .head();
# slice the first five rows instead.
X[:5]