In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input
# directory, then print them one per line.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/recipenlg/RecipeNLG_paper.pdf
/kaggle/input/recipenlg/RecipeNLG_license.png
/kaggle/input/recipenlg/RecipeNLG_dataset.csv
/kaggle/input/recipenlg/RecipeNLG_code/README.md
/kaggle/input/recipenlg/RecipeNLG_code/requirements.txt
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/manual-checkup.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/05-advanced-duplicates-removal.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/extra-02-join-ner-url.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/crawling_prep.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/01-single_dataset.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/recipes1M-transform.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/.gitignore
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/README.md
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scripts/06-comparisons_plots.ipynb
/kaggle/input/recipenlg/RecipeNLG_code/scraping-scr

In [2]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is on PATH.
%pip install tqdm



In [1]:
import pandas as pd
import re
import random
import ast
from tqdm import tqdm

# --- CONFIGURATION ---
# Input dataset (read-only Kaggle path) and the name of the CSV written at the end.
file_path = "/kaggle/input/recipenlg/RecipeNLG_dataset.csv"
output_filename = "strict_filtered_balanced_sample_large_6_ingredients_no_bucket.csv"

# Columns intended for keyword searching (flavor/action detection).
search_cols = ["ingredients", "directions", "NER"]

# Upper bound on the number of CSV rows to scan; use None to scan the full file.
limit_rows = 2231142

# The per-keyword sample cap (formerly `samples_per_keyword = 150`) is retired:
# every recipe that passes the filter is kept, so no limit applies.

# --- 1. SAMPLING KEYWORDS ---
# Seed terms per category from the earlier bucketed-sampling approach.
# The active code in this cell no longer reads this table; it is kept only
# so bucketing can be reinstated later.
keywords = dict(
    sweet=["sugar", "honey", "maple syrup"],
    sour=["vinegar", "lemon"],
    umami=["soy sauce", "parmesan", "mushrooms"],
    bitter=["cocoa", "kale"],
    salty=["salt", "olives", "feta", "bacon"],
    fat=["butter", "oil", "cream", "cheese"],
    spicy=["chili", "black pepper", "hot sauce"],
    crunchy=["nuts", "breadcrumbs"],
    creamy=["cream", "cheese"],
    starchy=["potato", "rice", "pasta"],
    heat_action=[
        "saute", "pan-fry", "roast", "bake", "boil",
        "simmer", "braise", "grill", "broil",
    ],
    mech_action=[
        "chop", "dice", "mince", "slice", "mix",
        "stir", "whisk", "blend", "marinate", "season",
    ],
)

# --- 2. STRICT VALIDATION FILTER ---
# Whitelist of terms, grouped by flavor / texture / technique category.
# The pre-processing step flattens every list below into one lowercase set
# (`allowed_ingredients`), and a recipe is kept only when each of its NER
# entities, lowercased, is exactly equal to one of these terms. Consequences:
#   * The category keys only organise the table; they do not affect filtering.
#   * Terms repeated within or across categories (e.g. 'grill', 'mayonnaise',
#     'grits', 'soy sauce') collapse to a single set entry.
#   * Matching is whole-string, so an entry such as 'onions (caramelized)'
#     only matches an entity spelled exactly that way.
filter_dict = {
    'sweet': [
        'sugar', 'honey', 'maple syrup', 'dates', 'agave', 'molasses', 'brown sugar', 'confectioners\' sugar', 'cane syrup',
        'stevia', 'monk fruit', 'coconut sugar', 'jaggery', 'corn syrup', 'rice syrup', 'barley malt', 'sucanat',
        'caramel', 'butterscotch', 'frosting', 'icing', 'fondant', 'jam', 'jelly', 'marmalade', 'preserves',
        'fruit compote', 'dulce de leche', 'sweetened condensed milk', 'apple sauce', 'mashed banana', 'ripe mango',
        'pineapple', 'berries', 'grapes', 'melon', 'peaches', 'plums', 'cherries', 'figs', 'raisins', 'dried apricots',
        'prunes', 'sweet potato', 'carrots', 'beets', 'corn', 'peas', 'bell peppers', 'onions (caramelized)',
        'balsamic glaze', 'hoisin sauce', 'teriyaki sauce', 'ketchup', 'barbecue sauce', 'sweet chili sauce',
        'vanilla', 'almond extract', 'coconut milk', 'sweetened yogurt', 'sweet cream', 'marzipan', 'nougat'
    ],
    'sour': [
        'vinegar', 'white vinegar', 'apple cider vinegar', 'red wine vinegar', 'balsamic vinegar', 'rice vinegar',
        'champagne vinegar', 'sherry vinegar', 'malt vinegar', 'distilled vinegar', 'lemon', 'lime', 'yogurt',
        'buttermilk', 'sour cream', 'crème fraîche', 'kefir', 'sauerkraut', 'kimchi', 'pickles', 'pickled vegetables',
        'tamarind', 'sumac', 'verjus', 'sorrel', 'rhubarb', 'green apples', 'green grapes', 'gooseberries',
        'cranberries', 'currants', 'passion fruit', 'tart cherries', 'unripe mango', 'green papaya', 'citrus zest',
        'fermented foods', 'kombucha', 'sourdough', 'sour candies', 'citric acid', 'tartaric acid', 'malic acid'
    ],
    'umami': [
        'soy sauce', 'tamari', 'coconut aminos', 'fish sauce', 'oyster sauce', 'worcestershire sauce', 'hoisin sauce',
        'miso', 'doubanjiang', 'gochujang', 'parmesan cheese', 'pecorino', 'aged gouda', 'blue cheese', 'nutritional yeast',
        'anchovy', 'sardines', 'mackerel', 'bonito flakes', 'dashi', 'kombu', 'nori', 'seaweed', 'mushrooms',
        'shiitake', 'porcini', 'morel', 'chanterelle', 'enoki', 'maitake', 'truffle', 'truffle oil', 'tomato paste',
        'sun-dried tomatoes', 'roasted tomatoes', 'caramelized onions', 'roasted garlic', 'marmite', 'vegemite',
        'bovril', 'beef stock', 'chicken stock', 'vegetable stock', 'bone broth', 'demiglace', 'gravy', 'browned meat',
        'dry-aged beef', 'cured meats', 'fermented beans', 'black garlic', 'soybean paste', 'msg', 'autolyzed yeast extract'
    ],
    'bitter': [
        'cocoa', 'dark chocolate', 'cacao nibs', 'coffee', 'espresso', 'espresso powder', 'matcha', 'green tea',
        'black tea', 'tonic water', 'quinine', 'angostura bitters', 'campari', 'aperol', 'vermouth', 'absinthe',
        'grapefruit', 'bitter melon', 'endive', 'escarole', 'radicchio', 'kale', 'collard greens', 'mustard greens',
        'arugula', 'watercress', 'dandelion greens', 'brussels sprouts', 'broccoli rabe', 'artichokes', 'asparagus',
        'eggplant', 'saffron', 'turmeric', 'fenugreek', 'szechuan peppercorn', 'citrus pith', 'almond skins',
        'walnuts', 'hazelnuts', 'pistachios', 'sesame seeds', 'burnt sugar', 'charred vegetables', 'smoked ingredients',
        'neem', 'gentian root', 'wormwood', 'cascara', 'hops'
    ],
    'salty': [
        'salt', 'sea salt', 'kosher salt', 'himalayan salt', 'flaky salt', 'smoked salt', 'garlic salt', 'celery salt',
        'soy sauce', 'tamari', 'liquid aminos', 'fish sauce', 'anchovy paste', 'capers', 'olives', 'green olives',
        'kalamata olives', 'castelvetrano olives', 'feta cheese', 'goat cheese', 'halloumi', 'queso fresco', 'cotija',
        'blue cheese', 'gorgonzola', 'pecorino', 'parmesan', 'aged cheddar', 'cured meats', 'bacon', 'pancetta',
        'guanciale', 'prosciutto', 'serrano ham', 'speck', 'salami', 'pepperoni', 'soppressata', 'chorizo', 'sausage',
        'salted butter', 'salted nuts', 'pretzels', 'crackers', 'potato chips', 'popcorn', 'pickles', 'kimchi',
        'sauerkraut', 'fermented vegetables', 'bouillon', 'stock cubes', 'miso paste', 'oyster sauce', 'worcestershire sauce',
        'teriyaki sauce', 'hoisin sauce', 'salted egg yolk', 'salt-packed sardines', 'salt cod', 'biltong', 'jerky'
    ],
    'fat': [
        'butter', 'unsalted butter', 'clarified butter', 'ghee', 'brown butter', 'compound butter',
        'cream', 'heavy cream', 'whipping cream', 'light cream', 'half-and-half', 'sour cream', 'crème fraîche',
        'mascarpone', 'cream cheese', 'neufchâtel', 'oil', 'olive oil', 'extra virgin olive oil', 'vegetable oil',
        'canola oil', 'grapeseed oil', 'avocado oil', 'coconut oil', 'peanut oil', 'sesame oil', 'truffle oil',
        'walnut oil', 'almond oil', 'sunflower oil', 'corn oil', 'lard', 'schmaltz', 'duck fat', 'goose fat',
        'beef tallow', 'suet', 'bacon fat', 'rendered fat', 'cheese', 'hard cheeses', 'soft cheeses', 'aged cheeses',
        'fresh cheeses', 'avocado', 'nuts', 'nut butters', 'seeds', 'seed butters', 'egg yolk', 'foie gras',
        'fatty fish', 'marbled meat', 'shortening', 'margarine', 'copha', 'mayonnaise', 'aioli', 'ranch dressing',
        'caesar dressing', 'tahini', 'chocolate', 'cocoa butter'
    ],
    'spicy': [
        'chili', 'chili pepper', 'jalapeño', 'serrano', 'habanero', 'scotch bonnet', 'ghost pepper', 'carolina reaper',
        'cayenne pepper', 'red pepper flakes', 'crushed red pepper', 'chili powder', 'ancho chili', 'chipotle',
        'guajillo', 'pasilla', 'arbol', 'piri piri', 'bird\'s eye chili', 'thai chili', 'szechuan pepper',
        'black pepper', 'white pepper', 'green pepper', 'pink pepper', 'long pepper', 'tellicherry pepper',
        'wasabi', 'horseradish', 'mustard', 'dijon mustard', 'english mustard', 'whole grain mustard',
        'hot mustard', 'mustard seeds', 'ginger', 'fresh ginger', 'pickled ginger', 'galangal', 'turmeric',
        'horseradish root', 'radish', 'daikon', 'watercress', 'arugula', 'garlic', 'raw garlic', 'fermented garlic',
        'onion', 'raw onion', 'shallot', 'leek', 'chives', 'hot sauce', 'tabasco', 'sriracha', 'sambal',
        'harissa', 'gochujang', 'zhoug', 'chermoula', 'peri peri sauce', 'buffalo sauce', 'nashville hot sauce',
        'cajun seasoning', 'creole seasoning', 'berbere', 'ras el hanout', 'curry powder', 'curry paste'
    ],
    # Cooking techniques and equipment rather than ingredients. They are added
    # to the allowed set like every other term, so an NER entity equal to one
    # of them (e.g. 'casserole') would be accepted.
    'heat_action': [
        'saute', 'sauté', 'pan-fry', 'shallow fry', 'deep fry', 'air fry', 'stir-fry', 'flash fry',
        'roast', 'bake', 'broil', 'grill', 'griddle', 'sear', 'blacken', 'char', 'torch',
        'boil', 'parboil', 'blanch', 'shock', 'simmer', 'poach', 'steam', 'pressure cook',
        'slow cook', 'braise', 'stew', 'confit', 'sous-vide', 'temper', 'toast', 'warm',
        'reheat', 'reduce', 'reduce sauce', 'glaze', 'caramelize', 'candy', 'crystallize',
        'smoke', 'hot smoke', 'cold smoke', 'grill', 'barbecue', 'rotisserie', 'spit-roast',
        'pan roast', 'oven roast', 'roast whole', 'roast pieces', 'bake blind', 'bake covered',
        'bake uncovered', 'broil high', 'broil low', 'grill marks', 'grill pan', 'plancha',
        'teppanyaki', 'hibachi', 'tandoor', 'clay oven', 'wood-fired', 'coal-fired', 'gas grill',
        'electric grill', 'induction cook', 'microwave', 'solar cook', 'fire pit', 'campfire cook',
        'dutch oven', 'tagine', 'casserole', 'hot pot', 'fondue', 'raclette', 'stone grill'
    ],
    'crunchy': [
        'nuts', 'almonds', 'walnuts', 'pecans', 'hazelnuts', 'peanuts', 'cashews', 'pistachios', 'macadamia nuts',
        'breadcrumbs', 'panko', 'croutons', 'crispy onions', 'fried shallots', 'tempura flakes',
        'seeds', 'pumpkin seeds', 'sunflower seeds', 'sesame seeds', 'flax seeds', 'chia seeds',
        'granola', 'muesli', 'cereal', 'cornflakes', 'rice crispies',
        'raw vegetables', 'celery', 'carrot sticks', 'bell peppers', 'radishes', 'jicama', 'water chestnuts',
        'apple slices', 'pear slices', 'fried items', 'fried chicken skin', 'cracklings', 'pork rinds',
        'chips', 'potato chips', 'tortilla chips', 'plantain chips', 'kale chips', 'parsnip chips',
        'crackers', 'water crackers', 'breadsticks', 'pretzels', 'biscotti',
        'toasted elements', 'toasted coconut', 'candied nuts', 'praline', 'brittle',
        'sugar glass', 'hard candy', 'crispy bacon', 'prosciutto chips', 'crispy sage', 'fried herbs',
        'puffed grains', 'popcorn', 'rice cakes', 'lavash crackers', 'phyllo pastry', 'wonton strips'
    ],
    'creamy': [
        'cream', 'heavy cream', 'whipping cream', 'double cream', 'clotted cream', 'creme fraiche',
        'sour cream', 'mascarpone', 'cream cheese', 'ricotta', 'cottage cheese', 'quark',
        'soft cheeses', 'brie', 'camembert', 'burrata', 'mozzarella', 'fresh cheese',
        'custard', 'creme anglaise', 'pastry cream', 'creme patissiere', 'flan', 'panna cotta',
        'avocado', 'guacamole', 'mashed avocado',
        'purees', 'mashed potatoes', 'sweet potato puree', 'butternut squash puree', 'cauliflower puree',
        'hummus', 'baba ganoush', 'tahini sauce', 'bean dip', 'refried beans',
        'mayonnaise', 'aioli', 'remoulade', 'tartar sauce', 'ranch dressing', 'caesar dressing',
        'creamy soups', 'bisque', 'chowder', 'veloute', 'cream of mushroom', 'cream of tomato',
        'yogurt', 'greek yogurt', 'labneh', 'skyr', 'kefir',
        'coconut milk', 'coconut cream', 'cashew cream', 'almond cream',
        'butter', 'compound butter', 'beurre blanc', 'beurre monte',
        'ganache', 'chocolate mousse', 'cremeux', 'pot de creme',
        'ice cream', 'gelato', 'sorbet', 'frozen yogurt',
        'condensed milk', 'evaporated milk', 'dulce de leche',
        'egg-based sauces', 'hollandaise', 'bearnaise', 'mayonnaise',
        'pureed legumes', 'lentil puree', 'white bean puree', 'black bean dip'
    ],
    'starchy': [
        'potato', 'russet potato', 'yukon gold', 'red potato', 'fingerling', 'sweet potato', 'yam',
        'rice', 'white rice', 'brown rice', 'jasmine rice', 'basmati rice', 'arborio rice', 'sushi rice',
        'pasta', 'spaghetti', 'penne', 'fettuccine', 'macaroni', 'lasagna', 'orzo', 'couscous',
        'bread', 'white bread', 'whole wheat', 'sourdough', 'baguette', 'ciabatta', 'naan', 'pita',
        'flour', 'all-purpose flour', 'bread flour', 'whole wheat flour', 'cornmeal', 'semolina',
        'grains', 'quinoa', 'barley', 'farro', 'freekeh', 'millet', 'oats', 'oatmeal',
        'corn', 'corn kernels', 'polenta', 'grits', 'cornbread', 'tortillas', 'corn tortillas',
        'legumes', 'lentils', 'chickpeas', 'black beans', 'kidney beans', 'pinto beans',
        'root vegetables', 'parsnips', 'turnips', 'rutabaga', 'celeriac', 'taro', 'cassava',
        'winter squash', 'butternut squash', 'acorn squash', 'pumpkin', 'kabocha',
        'processed starches', 'tapioca', 'arrowroot', 'potato starch', 'cornstarch',
        'dumplings', 'gnocchi', 'spaetzle', 'pierogi', 'potato dumplings',
        'breakfast cereals', 'cream of wheat', 'malt-o-meal', 'grits',
        'plantains', 'green plantains', 'ripe plantains',
        'bread products', 'stuffing', 'bread pudding', 'french toast',
        'pastry', 'pie crust', 'puff pastry', 'shortcrust', 'phyllo dough'
    ]
}

# --- PRE-PROCESSING ---

# 1. Collapse every category list of filter_dict into a single set of
#    lowercase terms; membership in this set is the strict acceptance test.
allowed_ingredients = {
    term.lower()
    for terms in filter_dict.values()
    for term in terms
}

# --- COMMENTED OUT: We don't need regex matching for sampling anymore ---
# 2. Build the Sampling Regex (From the keywords dict)
# term_map = {}
# for category, terms in keywords.items():
#     for term in terms:
#         term_map[term.lower()] = category
# regex_pattern = re.compile(r'\b(' + '|'.join(map(re.escape, term_map.keys())) + r')\b', re.IGNORECASE)

# --- EXECUTION ---
# Stream the CSV in chunks, keep every recipe whose NER entities all belong to
# `allowed_ingredients`, then de-duplicate and write the survivors to disk.

# Recipes listing fewer NER entities than this are rejected.
min_ingredients = 6


def _is_strictly_valid(raw_ner, allowed, min_count):
    """Decide whether one raw ``NER`` cell describes an acceptable recipe.

    Args:
        raw_ner: Value of the ``NER`` column for a single row; normally the
            string form of a Python list such as "['onion', 'garlic']".
        allowed: Set of lowercase ingredient names that may appear.
        min_count: Minimum number of entities the recipe must list.

    Returns:
        True only when ``raw_ner`` parses to a list/tuple holding at least
        ``min_count`` strings and each one, lowercased, is in ``allowed``.
        Values that cannot be parsed, or that parse to something else,
        yield False so a single odd row never aborts the scan.
    """
    try:
        entities = ast.literal_eval(raw_ner)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions literal_eval documents for malformed input;
        # non-string cells (e.g. missing values) are rejected the same way.
        return False
    if not isinstance(entities, (list, tuple)) or len(entities) < min_count:
        return False
    return all(
        isinstance(entity, str) and entity.lower() in allowed
        for entity in entities
    )


all_valid_recipes = []   # one row Series per accepted recipe, in file order
source_columns = None    # CSV column layout, captured from the first chunk

print(f"Scanning file with STRICT filtering (Limit: {limit_rows})...")

chunk_size = 50000
# Opening the chunked reader as a context manager releases the file handle
# even if the scan is interrupted part-way through.
with pd.read_csv(file_path, chunksize=chunk_size, on_bad_lines='skip', nrows=limit_rows) as reader:
    with tqdm(total=limit_rows, unit='rows') as pbar:
        for chunk in reader:
            if source_columns is None:
                source_columns = chunk.columns

            # Validate using the NER column alone, so full row objects are
            # only materialised for the (few) rows that pass the filter.
            keep = pd.Series(
                [
                    _is_strictly_valid(raw, allowed_ingredients, min_ingredients)
                    for raw in chunk['NER']
                ],
                index=chunk.index,
                dtype=bool,
            )
            all_valid_recipes.extend(row for _, row in chunk[keep].iterrows())

            pbar.update(len(chunk))

# --- CONSOLIDATION ---
print("\nConsolidating results...")

# Passing the source columns keeps the frame well-formed (header only) when no
# recipe passed the filter; without them an empty result has no 'title'/'link'
# columns and de-duplication would raise KeyError.
final_df = pd.DataFrame(all_valid_recipes, columns=source_columns)
if not final_df.empty:
    final_df = final_df.drop_duplicates(subset=['title', 'link'])

print(f"Finished! Found {len(final_df)} unique valid recipes.")
final_df.to_csv(output_filename, index=False)

Scanning file with STRICT filtering (Limit: 2231142)...


100%|██████████| 2231142/2231142 [03:17<00:00, 11287.98rows/s]


Consolidating results...
Finished! Found 2879 unique valid recipes.



