In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found anywhere beneath /kaggle/input
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, fname) for fname in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the recipes CSV inside the attached Kaggle dataset
csv_path = "/kaggle/input/foodcom-recipes-and-reviews/recipes.csv"

# Read the whole file into memory; later cells take their subsets from `df`
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
import re
import html

# Normalise the headers first: stray whitespace around a column name would
# make the membership checks below miss it
df.columns = df.columns.str.strip()

# Columns we would like to keep, in output order
selected_columns = [
    "RecipeId",
    "Name",
    "RecipeIngredientQuantities",
    "RecipeIngredientParts",
    "Calories",
    "FatContent",
    "CarbohydrateContent",
    "FiberContent",
    "ProteinContent",
    "RecipeInstructions",
]

# Keep only the wanted columns that the loaded frame actually contains
existing_columns = []
for wanted in selected_columns:
    if wanted in df.columns:
        existing_columns.append(wanted)

# Work on an independent copy so edits to df_subset never write through to df
df_subset = df.loc[:, existing_columns].copy()

print("Before applying function:", df_subset.columns.tolist())

# Function to clean 'c(...)' format
def clean_c_format(text):
    """Flatten an R-style ``c("a", "b")`` vector literal into one plain string.

    Parameters
    ----------
    text : Any
        Raw cell value. Anything that is not a non-blank ``str`` (NaN, None,
        numbers, whitespace-only strings) yields ``""``.

    Returns
    -------
    str
        The double-quoted elements joined by single spaces, with ``\\"`` and
        ``\\\\`` escapes undone and HTML entities decoded. If the value holds
        no quoted elements, the text itself (minus an enclosing ``c(...)``,
        if present) is returned, also HTML-decoded.
    """
    if not isinstance(text, str):
        return ""  # Always return a string
    stripped = text.strip()
    if not stripped:
        return ""

    # DOTALL: a long vector literal may be wrapped over several lines.
    wrapper = re.match(r'^c\((.*)\)$', stripped, flags=re.DOTALL)
    inner = wrapper.group(1) if wrapper else stripped

    # A quoted element is a run of ordinary characters or backslash-escaped
    # characters, so an escaped quote (\") inside an element does not end it.
    parts = re.findall(r'"((?:[^"\\]|\\.)*)"', inner, flags=re.DOTALL)
    if parts:
        # Undo the \" and \\ escapes now that the element boundaries are known.
        parts = [re.sub(r'\\(["\\])', r'\1', part) for part in parts]
        inner = " ".join(parts)

    return html.unescape(inner)

# Flatten the c(...) literals in whichever of the three text columns are present
list_like_columns = (
    "RecipeIngredientQuantities",
    "RecipeIngredientParts",
    "RecipeInstructions",
)
for column in list_like_columns:
    if column not in df_subset.columns:
        continue  # column was absent from the source frame
    cleaned_column = df_subset[column].apply(clean_c_format)
    df_subset[column] = cleaned_column

print("After applying function:", df_subset.columns.tolist())
print(df_subset.head())

In [None]:
# Turn the first 500 cleaned rows into a list of {column: value} dicts
first_rows = df_subset.iloc[:500]
data = first_rows.to_dict(orient="records")
print(data[0])

In [None]:
# Identifier and title columns for the first 500 recipes
columns = ["RecipeId", "Name"]
df_units = df.loc[:, columns].iloc[:500]

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

# Popular units pattern
POPULAR_UNITS = ["cups", "cup", "tablespoons", "tablespoon", "tbsp", "tsp", "teaspoons", "teaspoon", 
                 "grams", "gram", "kilograms", "kilogram", "kg", "ounces", "ounce", "oz",
                 "pounds", "pound", "lb", "milliliters", "milliliter", "ml", "liters", "liter", "l"]
# Anchored at the start of the text, with one capture group holding the unit.
#  - Alternatives are tried longest first so a plural is captured in full.
#  - `s?` accepts pluralised abbreviations such as "lbs" (captured as "lb").
#  - `\b` requires the unit to end at a word boundary, so words that merely
#    begin with a unit ("large", "lemon", "cupcake", "pounded") do not match.
UNITS_PATTERN = (
    r"^("
    + "|".join(sorted(map(re.escape, POPULAR_UNITS), key=len, reverse=True))
    + r")s?\b"
)

# Function to modify name for URL
def modify_data(rid, name):
    """Return ``(rid, slug)`` where ``slug`` is ``name`` made safe for a URL path.

    Parameters
    ----------
    rid : Any
        Recipe identifier; returned unchanged.
    name : Any
        Recipe title. A value that is not a ``str`` (for example NaN from a
        missing cell) produces an empty slug instead of raising.

    Returns
    -------
    tuple
        ``(rid, slug)``. The slug is the lower-cased name with every run of
        non-word characters (spaces, punctuation, ``/``, ``?``, ``#`` ...)
        or underscores collapsed to a single hyphen, and no leading or
        trailing hyphen. Names made of letters, digits, hyphens and single
        spaces give the same slug as a plain space-to-hyphen replacement.
    """
    if not isinstance(name, str):
        return rid, ""
    # Characters such as "/", "?" and "#" would otherwise be read as path,
    # query or fragment separators once the slug is placed in a URL.
    slug = re.sub(r"[\W_]+", "-", name.lower()).strip("-")
    return rid, slug

# Function to extract units
def extract_unit(text):
    """Return the unit that UNITS_PATTERN finds in ``text``, or ``None``.

    ``text`` is lower-cased and stripped before matching. Values that are not
    strings, and strings that are empty or whitespace-only, give ``None``.
    When the pattern yields several hits, the longest one is returned.
    """
    if isinstance(text, str) and text.strip() != "":
        normalised = text.lower().strip()
        found = re.findall(UNITS_PATTERN, normalised)
        if found:
            return max(found, key=len)
    return None

# Function to scrape and clean units
def scrape_units(df_unit, timeout=10):
    """Fetch each recipe page and collect the units found in its ingredient list.

    Parameters
    ----------
    df_unit : pandas.DataFrame
        Must provide ``"RecipeId"`` and ``"Name"`` columns; one request is
        made per row.
    timeout : float or tuple, optional
        Passed to ``requests`` as the per-request timeout (seconds) so that a
        stalled connection cannot block the whole run. Defaults to 10.

    Returns
    -------
    pandas.DataFrame
        One row per successfully fetched recipe with columns ``"RecipeId"``
        and ``"Units"`` (list of unit strings, possibly empty). Recipes whose
        request fails are reported with ``print`` and left out.
    """
    unit_data = []

    # A single session lets consecutive requests reuse the same connection.
    with requests.Session() as session:
        for _, row in df_unit.iterrows():
            rid, name = modify_data(row["RecipeId"], row["Name"])
            url = f'https://www.food.com/recipe/{name}-{rid}'

            try:
                response = session.get(url, timeout=timeout)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to fetch {url}: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract units
            cleaned_units = []
            for li in soup.find_all('li', style="display: contents"):
                ingredient_text = li.find('span', class_='ingredient-text')
                if not ingredient_text:
                    continue
                words = ingredient_text.get_text(strip=True).split(maxsplit=1)
                if not words:
                    continue  # empty span: there is no first word to inspect
                cleaned_unit = extract_unit(words[0])  # Clean using regex
                if cleaned_unit:
                    cleaned_units.append(cleaned_unit)

            unit_data.append({"RecipeId": rid, "Units": cleaned_units})

    return pd.DataFrame(unit_data)

# Scrape the units for the selected recipes.
# NOTE: df_units is rebound here from the scraper's input frame (RecipeId,
# Name) to its result frame (RecipeId, Units). Re-running this cell on its own
# therefore fails on the missing "Name" column; rebuild the input frame first.
df_units = scrape_units(df_unit=df_units)

# Show the first five result rows
print(df_units.head(n=5))


In [None]:
# One dict per row of the scraped frame, keyed by column name
units = df_units.to_dict("records")

first_unit_record = units[0]
print(first_unit_record)

In [None]:
def _lowercase_keys(rows):
    """Return new row dicts whose keys are lower-cased; values are kept as-is."""
    return [dict((key.lower(), value) for key, value in row.items()) for row in rows]


data = _lowercase_keys(data)
units = _lowercase_keys(units)
print(units[0])

In [None]:
!pip install supabase


from supabase import create_client
from kaggle_secrets import UserSecretsClient

url = UserSecretsClient().get_secret("SUPABASE_URL")
api_key = UserSecretsClient().get_secret("SUPABASE_KEY")
supabase = create_client(url, api_key)

response = (
    supabase.table("foodcom")
    .insert(data)
    .execute()
)

response = (
    supabase.table("units")
    .insert(units)
    .execute()
)