In [36]:
from datasets import load_dataset
import pandas as pd #version 2.3.3
import numpy as np #version 2.3.1
import seaborn as sns #version 0.13.2
import matplotlib #version 3.10.0
import matplotlib.pyplot as plt
import statsmodels.api as sm #version 0.14.5

In [64]:
data_stream = load_dataset(
    "openfoodfacts/product-database",
    split="food",
    streaming=True # Use streaming for massive datasets
    )
df_initial = pd.DataFrame(data_stream.take(1000))

Starting data load


In [116]:
targeted_keywords = ["name", "Quantity", "Brands", "Categories", 
                    "Manufacturing", "Stores", "Country", 
                     "Ingredients","Origin", "nutriments"]

In [117]:
def filter_df_by_keywords(df, keywords):
    """
    Filters a DataFrame to include only columns whose names contain any of the given keywords.
    """
    # Create a list of columns where the column name (lowercased) contains any of the keywords
    relevant_cols = []
    for col in df.columns: 
        for keyword in keywords:
            if keyword.lower() in col.lower():
                relevant_cols.append(col)
    return relevant_cols

target_col = filter_df_by_keywords(df_initial, targeted_keywords)
target_col

['brands_tags',
 'brands',
 'categories',
 'categories_tags',
 'categories_properties',
 'ciqual_food_name_tags',
 'generic_name',
 'ingredients_analysis_tags',
 'ingredients_from_palm_oil_n',
 'ingredients_n',
 'ingredients_original_tags',
 'ingredients_original_tags',
 'ingredients_percent_analysis',
 'ingredients_tags',
 'ingredients_text',
 'ingredients_with_specified_percent_n',
 'ingredients_with_unspecified_percent_n',
 'ingredients_without_ciqual_codes_n',
 'ingredients_without_ciqual_codes',
 'ingredients',
 'known_ingredients_n',
 'manufacturing_places_tags',
 'manufacturing_places',
 'nutriments',
 'origins_tags',
 'origins',
 'product_name',
 'product_quantity_unit',
 'product_quantity',
 'quantity',
 'serving_quantity',
 'stores_tags',
 'stores',
 'unknown_ingredients_n']

In [119]:
items_to_remove = ["categories", "categories_properties	","ingredients_analysis_tags", 
                   "ingredients_from_palm_oil_n", "quantity", "ingredients_text",
                  "ingredients_with_specified_percent_n", "ingredients_with_unspecified_percent_n",
                  "ingredients_without_ciqual_codes_n", "ingredients_without_ciqual_codes",
                 " known_ingredients_n"]
removal_set = set(items_to_remove)
cleaned_list = [
        item for item in target_col
        if item not in removal_set
    ]

print(cleaned_list)
df_initial[cleaned_list].shape

['brands_tags', 'brands', 'categories_tags', 'categories_properties', 'ciqual_food_name_tags', 'generic_name', 'ingredients_n', 'ingredients_original_tags', 'ingredients_original_tags', 'ingredients_percent_analysis', 'ingredients_tags', 'ingredients', 'known_ingredients_n', 'manufacturing_places_tags', 'manufacturing_places', 'nutriments', 'origins_tags', 'origins', 'product_name', 'product_quantity_unit', 'product_quantity', 'serving_quantity', 'stores_tags', 'stores', 'unknown_ingredients_n']


(1000, 25)

In [14]:
def filter_target_products(example):
    """
    Filters records to find products classified as 'chocolate' AND
    containing 'cocoa' in the ingredients list, safely handling lists, strings,
    and lists of dictionaries for both fields.
    """
    
    # Process Categories (Must contain 'chocolate') ---
    # Try 'categories_tags', fall back to 'categories' if needed
    categories_data = example.get('categories_tags', example.get('categories', ''))
    
    searchable_categories = ''
    if isinstance(categories_data, list):
        # Check if the list contains dictionaries 
        if categories_data and isinstance(categories_data[0], dict):
            # Extract the 'name' or 'id' from each dictionary item
            string_parts = [item.get('name', item.get('id', '')) for item in categories_data if isinstance(item, dict)]
            searchable_categories = ' '.join(string_parts).lower()
        else:
            # Assume it's a list of strings and join them
            searchable_categories = ' '.join(categories_data).lower()
    else:
        # It's a string, NaN, or other single value
        searchable_categories = str(categories_data).lower()
        
    has_chocolate_category = 'chocolate' in searchable_categories
    
    
    # --- 2. Process Ingredients (Must contain 'cocoa') ---
    # Prioritize 'ingredients_text' (the clean string), fall back to 'ingredients'
    ingredients_data = example.get('ingredients_text', example.get('ingredients', ''))
    
    searchable_ingredients = ''
    if isinstance(ingredients_data, list):
        # Check if the list contains dictionaries (the cause of the TypeError)
        if ingredients_data and isinstance(ingredients_data[0], dict):
            # Critical fix: Extract the 'text' key from the dictionary objects
            string_parts = [item.get('text', '') for item in ingredients_data if isinstance(item, dict)]
            searchable_ingredients = ' '.join(string_parts).lower()
        else:
            # Assume it's a list of strings
            searchable_ingredients = ' '.join(ingredients_data).lower()
    else:
        # It's a string (like the pre-joined ingredients_text), NaN, or other single value
        searchable_ingredients = str(ingredients_data).lower()
    
    has_cocoa_ingredient = 'cocoa' in searchable_ingredients
    
    return has_chocolate_category and has_cocoa_ingredient



In [15]:
print("Starting data load and targeted filtering...")

data_stream = load_dataset(
    "openfoodfacts/product-database",
    split="food",
    streaming=True # Use streaming for massive datasets
)

filtered_data_stream = data_stream.filter(filter_target_products)
df_initial = pd.DataFrame(filtered_data_stream.take(5000))

df_keywords_filtered = filter_df_by_keywords(df_initial, keywords)

Starting data load and targeted filtering...


KeyboardInterrupt: 