<a href="https://colab.research.google.com/github/NoraHK3/DataSciProject/blob/main/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **1- CSV Translation**

The original dataset contained a mix of **Arabic and English** text — some dish names and ingredients were already in English, while others were still in Arabic.
To make the data consistent and ready for analysis, this step translates **all Arabic parts** into **English**, while keeping the existing English text as it is.

It uses **Google Translate** through the `deep-translator` library, with extra tools to keep translations accurate:

* **Overrides (`overrides_expanded.json`)** – custom manual translations that replace Google’s output for specific words (e.g., “ليمون أسود” → “black lime”).
* **Cache (`translation_cache.csv`)** – stores previous translations to keep results consistent and faster.
* **Post-fixes** – fixes common translation mistakes automatically (e.g., “black lemon” → “black lime”).
* **Classification handling** – splits multi-part text like “رز | دجاج” and translates each piece separately (“rice | chicken”).

**Why we did this:**
Mixing two languages made the data inconsistent. Translating everything to English first ensures that the later cleaning and ingredient normalization steps work correctly and uniformly.

**Output:**
 `SaudiFoodFile_english_FIXED.csv` — fully English, consistent version of the dataset


In [None]:
# Default name of the manual-overrides JSON file. The translator cell below
# assigns OVERRIDES_JSON again (expanded file if it exists on disk, otherwise
# "overrides.json"), so this value only matters until that cell runs.
OVERRIDES_JSON = "overrides_expanded.json"

In [None]:
# ============================================
# Translator (Deep) with Expanded Overrides + Cache Purge + Debug
# ============================================

!pip install -q pandas deep-translator

import os, re, json, pandas as pd
from deep_translator import GoogleTranslator

# File locations used by the translation step.
INPUT_CSV = "SaudiFoodFile.csv"
OUTPUT_CSV = "SaudiFoodFile_english_FIXED.csv"
CACHE_CSV = "translation_cache.csv"

# Manual overrides: pick the expanded file when it exists on disk,
# otherwise fall back to the base file.
OVR_EXP = "overrides_expanded.json"
OVR_BASE = "overrides.json"
if os.path.exists(OVR_EXP):
    OVERRIDES_JSON = OVR_EXP
else:
    OVERRIDES_JSON = OVR_BASE

# When True, the 'classifications' column is split on '|' and each piece is
# translated separately.
HANDLE_CLASSIFICATIONS = True
# Columns to translate; None selects every object-dtype column.
# NOTE(review): None presumably also sweeps in non-text object columns such as
# file names or dates -- confirm that is intended.
TRANSLATE_COLS = None

# ---------- helpers ----------
AR_DIAC = re.compile(r"[\u0610-\u061A\u064B-\u065F\u06D6-\u06ED]")

# (variant, replacement) pairs applied in order after diacritics are removed:
# tatweel is deleted, alef/yeh/waw variants are folded onto their base letter.
_AR_LETTER_FOLDS = (
    ("\u0640", ""),
    ("أ", "ا"), ("إ", "ا"), ("آ", "ا"),
    ("ى", "ي"), ("ئ", "ي"), ("ؤ", "و"), ("ٱ", "ا"),
)

def norm_ar(s: str) -> str:
    """Return *s* with Arabic diacritics and tatweel removed and letter variants folded.

    Used to make spelling variants of the same Arabic word compare equal.
    """
    folded = AR_DIAC.sub("", s)
    for variant, base in _AR_LETTER_FOLDS:
        folded = folded.replace(variant, base)
    return folded

def key_norm(x: str) -> str:
    """Build the normalized lookup key for *x*: stringified, trimmed, lowercased, Arabic-folded."""
    lowered = str(x).strip().lower()
    return norm_ar(lowered)

# Whole-string corrections for known machine-translation mistakes.
# Keys are the lowercased, trimmed translation; values are the corrected text.
POST_FIX = {
    "black lemon": "black lime",
    "nail": "cloves",
    "cardamon": "cardamom",
    "yougurt": "yogurt",
    "youghurt": "yogurt",
}

def apply_postfix(en: str) -> str:
    """Return the corrected form of a translation, or the trimmed input if no fix applies.

    Only an exact (case-insensitive) match of the whole string is corrected.
    """
    text = str(en).strip()
    lookup_key = text.lower()
    if lookup_key in POST_FIX:
        return POST_FIX[lookup_key]
    return text

# ---------- load data ----------
# Source CSV: try UTF-8 first, fall back to the Windows Arabic code page.
try:
    df = pd.read_csv(INPUT_CSV, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(INPUT_CSV, encoding="cp1256")

# Manual overrides (empty when the file is absent).
OV = {}
if os.path.exists(OVERRIDES_JSON):
    with open(OVERRIDES_JSON, "r", encoding="utf-8") as f:
        OV = json.load(f)

# Second view of the overrides keyed by normalized Arabic text, so spelling
# variants of an Arabic key still hit the same override.
OV_NORM = {}
for k, v in OV.items():
    if re.search(r"[\u0600-\u06FF]", k):
        OV_NORM[key_norm(k)] = v

print(f"🔧 Using overrides file: {OVERRIDES_JSON}")
print(f"   Loaded overrides: {len(OV)} (normalized Arabic keys: {len(OV_NORM)})")
# Print up to eight entries as a sanity check.
for k, v in list(OV.items())[:8]:
    print(f"   • {k}  ->  {v}")

# Cache (load then purge entries that now have overrides)
# Read every cell as text: with default type inference pandas would turn raw
# keys such as "123" into numbers and "NA"/"null" into NaN, so the string
# lookups against CACHE would silently miss (or return NaN as a translation).
CACHE = {}
if os.path.exists(CACHE_CSV):
    cache_df = pd.read_csv(CACHE_CSV, dtype=str, keep_default_na=False, encoding="utf-8")
    # First column is the raw text, second the English text; entries with an
    # empty side carry no usable translation and are dropped.
    CACHE = {
        raw: english
        for raw, english in zip(cache_df.iloc[:, 0], cache_df.iloc[:, 1])
        if raw and english
    }  # {raw: english}

def override_lookup(text: str):
    """Return the manual override for *text*, or None when there is none.

    An exact key match in OV wins; otherwise the normalized form of *text*
    is looked up in OV_NORM.
    """
    if text not in OV:
        return OV_NORM.get(key_norm(text))
    return OV[text]

# Drop cached translations for any text that now has a manual override, so the
# override is what gets used from here on.
to_del = [raw for raw in list(CACHE.keys()) if override_lookup(raw)]
for raw in to_del:
    CACHE.pop(raw, None)
purged = len(to_del)
print(f"🧹 Purged {purged} cache entries that now have overrides")

# Shared translator instance (source language auto-detected, target English).
translator = GoogleTranslator(source="auto", target="en")

def translate_text(text: str) -> str:
    """Translate one cell value to English.

    Resolution order: manual override (exact or normalized), then the cache,
    then machine translation followed by the post-fix table.

    Missing or blank values are returned unchanged. If machine translation
    raises, the trimmed original text is returned and is NOT cached, so a
    transient failure is retried on the next run instead of being stored as
    the "translation".
    """
    if pd.isna(text) or str(text).strip() == "":
        return text
    s = str(text).strip()

    # 1) override wins (exact or normalized)
    ov = override_lookup(s)
    if ov:
        return ov

    # 2) cache
    if s in CACHE:
        return CACHE[s]

    # 3) machine translation (best effort)
    try:
        en = translator.translate(s) or s
    except Exception as exc:
        # Keep the original text for this run, but leave the cache untouched.
        print(f"⚠️ Translation failed for {s!r}: {exc}")
        return s

    en = apply_postfix(en)
    CACHE[s] = en
    return en

def translate_classifications_cell(cell: str) -> str:
    """Translate a '|'-separated classifications cell piece by piece.

    Each non-empty piece is translated (override first, then translate_text),
    lowercased, and the pieces are re-joined with ' | '.
    """
    translated = []
    for piece in (chunk.strip() for chunk in str(cell).split("|")):
        if piece:
            english = override_lookup(piece) or translate_text(piece)
            translated.append(str(english).lower())
    return " | ".join(translated)

# ---------- choose columns ----------
obj_cols = [c for c in df.columns if df[c].dtype == "object"]
cols = obj_cols if TRANSLATE_COLS is None else [c for c in TRANSLATE_COLS if c in df.columns]
print(f"📝 Translating columns: {cols}")

# ---------- translate ----------
try:
    for col in cols:
        print(f"➡️  Translating: {col}")
        if HANDLE_CLASSIFICATIONS and col.lower() == "classifications":
            # Leave missing cells missing: casting the column to str first
            # would turn NaN into the literal text "nan" and translate that.
            df[col] = df[col].apply(
                lambda cell: cell if pd.isna(cell) else translate_classifications_cell(cell)
            )
        else:
            df[col] = df[col].apply(translate_text)
finally:
    # Persist the cache even if translation is interrupted, so translations
    # already obtained are not requested again on the next run.
    pd.DataFrame(list(CACHE.items()), columns=["raw","english"]).to_csv(CACHE_CSV, index=False)

# ---------- save ----------
df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"✅ Done: {OUTPUT_CSV}")
print(f"💾 Cache: {CACHE_CSV}")
print(f"✍️ Overrides file in use: {OVERRIDES_JSON}")

# **2- Data Cleaning**

## **Data Cleaning - Changing dish name**

**Clean dish names by removing irrelevant extra descriptions**

This code cleans and standardizes dish names in the Saudi food dataset.
 1. It removes unnecessary words and descriptions (like "for Saudi National Day",
   "how to make", "traditional", etc.) from the dish names.
2. It then standardizes different spellings or variations of the same dish
  (e.g., "kabsah", "kbsa" → "Kabsa", "shaksoka" → "Shakshuka").
  
 3. Finally, it shows before/after examples, reports the most common dish names,
 and saves the cleaned dataset as 'SaudiFoodFile_cleaned.csv' for later use.

In [None]:
import pandas as pd
import numpy as np
import re

# Read the fully-English dataset produced by the translation step.
df = pd.read_csv('SaudiFoodFile_english_FIXED.csv')

# Quick look at what was loaded.
print("Initial data shape:", df.shape)
print("\nFirst few rows:", df.head(), sep="\n")

# Task 1: Clean dish names - remove extra descriptions

# Parenthesised asides; removed first, while the brackets are still present.
_PARENTHESIZED = re.compile(r'\([^)]*\)')

# Descriptive phrases stripped from a (lowercased) dish name, applied in order.
# Every pattern is anchored on word boundaries so it cannot fire inside a
# longer word ("for" in "california", "easy" in "greasy").
_DISH_NOISE_PATTERNS = [re.compile(pattern) for pattern in (
    r'\bfor saudi national day\b',
    r'\bhow to make\b',
    r'\b(?:the )?saudi(?: style)?\b',   # "the saudi", "saudi style", "saudi"
    r'\btraditional\b',
    r'\bmethod for\b',
    r'\baccording to\b',
    r'\bwith\b.*',                      # drop "with ..." to the end of the name
    r'\bfor\b.*',                       # drop "for ..." to the end of the name
    r'\bthe hijazi way\b',
    r'\bhijazi\b',
    r'\brecipes?\b',
    r'\beasy\b',
    r'\bauthentic\b',
    r'\bcopycat\b',
    r'\bslow-?roast(?:ed)?\b',
    r'\bno[- ]bake\b',
    r'\bhealthy\b',
    r'\bvegetarian\b',
    r'\bstuffed\b',
    r'\bbaked\b',
    r'\bgrilled\b',
    r'\broasted\b',
    r'\bcreamy\b',
    r'\bspiced\b',
    r'\bmiddle eastern\b',
)]

# Measurement / portion descriptions, applied after punctuation is stripped.
_PORTION_PATTERNS = [re.compile(pattern) for pattern in (
    r'\bwhole grain\b',
    r'\bhalf a piece\b',
    r'\bhalf piece\b',
    r'\bquarter\b',
    r'\bone person\b',
    r'\bperson\b',
    r'\bplain\b',
    r'\bwith rice\b',
    r'\bwithout rice\b',
)]

def clean_dish_name(name):
    """
    Remove extra descriptions from a dish name and return it in Title Case.

    Strips parenthesised text, phrases like 'for Saudi National Day',
    'how to make', 'Saudi style', portion words, and punctuation.

    Args:
        name: the raw dish name. Missing values (None/NaN) yield "";
            other non-string values are converted with str().

    Returns:
        The cleaned, title-cased name (may be "" if nothing is left).
    """
    if not isinstance(name, str):
        if pd.isna(name):
            return ""
        name = str(name)

    cleaned_name = name.lower().strip()

    # Parentheses must go before punctuation is stripped, otherwise the
    # brackets are already gone and their content would be kept.
    cleaned_name = _PARENTHESIZED.sub(' ', cleaned_name)

    for pattern in _DISH_NOISE_PATTERNS:
        cleaned_name = pattern.sub('', cleaned_name)

    # Remove punctuation and collapse whitespace
    cleaned_name = re.sub(r'[^\w\s]', ' ', cleaned_name)
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name).strip()

    for pattern in _PORTION_PATTERNS:
        cleaned_name = pattern.sub('', cleaned_name)

    # Final cleanup, then Title Case for consistency
    cleaned_name = re.sub(r'\s+', ' ', cleaned_name).strip()
    return cleaned_name.title()

# Task 2: Standardize dish name variations
def standardize_dish_name(name):
    """
    Replace known spelling variants inside *name* with one canonical spelling
    (e.g. kabsa/kabsah/kbsa -> Kabsa). Matching is whole-word and
    case-insensitive; text that matches no variant is left untouched.
    """
    # canonical spelling -> whole-word regexes for its variants.
    # Groups and the patterns within them are applied in the order written.
    variants_by_canonical = {
        'Kabsa': [r'\bkabsah?\b', r'\bkbsa\b'],
        'Kleja': [r'\bkleija\b', r'\bkulaija\b', r'\bklija\b'],
        'Shakshuka': [r'\bshaksoka\b', r'\bshakshuka\b', r'\bshaksuka\b'],
        'Basbousa': [r'\bbasbousa\b', r'\bbasbosa\b'],
        'Jareesh': [r'\bjareesh\b', r'\bjarish\b', r'\bgreesh\b', r'\bgroats\b'],
        'Maqshush': [r'\bmaqshoosh\b', r'\bmaqshush\b'],
        'Mutabak': [r'\bmutabbaq\b', r'\bmutabak\b'],
        'Saleek': [r'\bsaleeq\b', r'\bsaliq\b', r'\bsaleek\b', r'\bsulait?\b'],
        'Mamoul': [r'\bmaamoul\b', r'\bmamoul\b'],
        'Madhbi': [r'\bmadhbi\b'],
        'Madhghut': [r'\bmadghog\b', r'\bmadjou?h\b'],
        'Madfun': [r'\bmadfoon\b', r'\bmadfoun\b'],
        'Mandi': [r'\bmandi\b'],
        'Zurbian': [r'\bzurbian\b', r'\bzerbian\b'],
        'Shrimp': [r'\bshrimp\b', r'\bshurbian\b'],
        'Sambusa': [r'\bsambosa\b', r'\bsambousek\b', r'\bsamosa\b'],
        'Maqluba': [r'\bmagloba\b', r'\bmaqluba\b', r'\bmakloubeh\b'],
        'Musaqa': [r'\bmoussaka\b', r'\bmoussaqa\b', r'\bmusakaa\b'],
        'Mulukhiyah': [r'\bmolokhia\b', r'\bmolokhiya\b', r'\bmulukhiyah\b'],
        'Marqouq': [r'\bmargog\b', r'\bmarqouk\b', r'\bmarqooq\b'],
        'Mataziz': [r'\bmatazeez\b'],
        'Muqalqal': [r'\bmogalgal\b', r'\bmqalqal\b'],
        'Hamees': [r'\bhemees\b', r'\bhemen\b'],
        'Muhalabiya': [r'\bmohalabiya\b', r'\bmohala\b'],
        'Kunafa': [r'\bkunafa\b', r'\bknafeh\b'],
        'Sabeeb': [r'\bsabeeb\b', r'\bsabib\b'],
        'Tahini': [r'\btaheena\b', r'\btainna\b', r'\btahini\b'],
        'Fatteh': [r'\bfatteh\b', r'\bfateh\b'],
        'Freekeh': [r'\bfreekeh\b', r'\bfreekey\b'],
        'Hashu': [r'\bhashweh\b', r'\bhashu\b'],
        'Mujaddara': [r'\bmujadara\b', r'\bmujaddara\b'],
        'Zaatar': [r'\bzaatar\b', r'\bza\'atar\b'],
    }

    result = name
    for canonical, variant_patterns in variants_by_canonical.items():
        for variant in variant_patterns:
            result = re.sub(variant, canonical, result, flags=re.IGNORECASE)
    return result

# Run both passes over the dish names.
print("\nApplying data cleaning...")

df['cleaned_dish_name'] = df['dish_name'].apply(clean_dish_name)
df['standardized_dish_name'] = df['cleaned_dish_name'].apply(standardize_dish_name)

# Before/after preview of the first rows.
print("\nName cleaning examples:")
sample_size = min(10, len(df))
preview_rows = df[['dish_name', 'cleaned_dish_name', 'standardized_dish_name']].head(sample_size)
for original, cleaned, standardized in preview_rows.itertuples(index=False, name=None):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}")
    print(f"Standardized: {standardized}")
    print("-" * 50)

# Frequency table of the standardized names.
print("\nMost common standardized dish names:")
print(df['standardized_dish_name'].value_counts().head(20))

# Alphabetical sample, handy for spotting variants that were not merged.
print("\nChecking for remaining variations (sample):")
unique_names = df['standardized_dish_name'].unique()
for name in sorted(unique_names)[:30]:  # first 30 only
    print(f"  - {name}")

# Build the output frame: temporary columns removed, the original name kept in
# 'dish_name_original', and 'dish_name' replaced by the standardized name.
df_cleaned = df.drop(columns=['cleaned_dish_name', 'standardized_dish_name'])
df_cleaned['dish_name_original'] = df['dish_name']
df_cleaned['dish_name'] = df['standardized_dish_name']

print(f"\nFinal data shape: {df_cleaned.shape}")
print("\nFirst few rows of cleaned data:")
print(df_cleaned[['dish_name_original', 'dish_name']].head(15))

# Write the cleaned dataset.
output_filename = 'SaudiFoodFile_cleaned.csv'
df_cleaned.to_csv(output_filename, index=False)
print(f"\nCleaned data saved to: {output_filename}")

# Summary of which original names were merged under each standardized name.
print("\n" + "="*80)
print("NAME STANDARDIZATION SUMMARY")
print("="*80)

# standardized name -> sorted list of the distinct original names behind it
name_groups = {}
for orig, new in zip(df['dish_name'], df_cleaned['dish_name']):
    variants = name_groups.setdefault(new, [])
    if orig not in variants:
        variants.append(orig)
        variants.sort()

print("\nStandardization groups (showing first 15 groups):")
# Only names that absorbed more than one original spelling are shown.
merged_groups = [
    (standardized_name, original_names)
    for standardized_name, original_names in name_groups.items()
    if len(original_names) > 1
][:15]
for standardized_name, original_names in merged_groups:
    print(f"\n{standardized_name}:")
    for orig_name in original_names:
        print(f"  - {orig_name}")
count = len(merged_groups)

## **Data Cleaning - Changing image name**

**Changing the image name (to match the dish name)**



 Purpose: Standardize image file names in the CSV based on dish names, ensure uniqueness,
          and save the result for downstream use.

What it does:
 1) Loads 'SaudiFoodFile_cleaned.csv' and inspects dish_name quality (missing/non-string).
2) Builds clean image file names from dish_name:
    - lowercase, remove special chars, replace spaces/dashes with underscores,
    - keep the original file extension (e.g., .jpg, .png),
    - fallback to original image base name if dish_name is missing.
 3) Ensures uniqueness by appending _2, _3, ... for duplicates.
 4) Reports examples and a summary (duplicate groups, most common dish names, short names).
5) Writes a new CSV 'SaudiFoodFile_final_cleaned.csv' with:
   - image_file_original (old),
   - image_file (new standardized).
 Note: This updates names in the CSV only. It does NOT rename files on disk.

In [None]:
import pandas as pd
import re
import os
import numpy as np

# Read the output of the dish-name cleaning step.
df = pd.read_csv('SaudiFoodFile_cleaned.csv')

# Quick look at what was loaded.
print("Initial data shape:", df.shape)
print("\nFirst few rows:")
print(df[['dish_name', 'image_file']].head())

# Data-quality check on dish_name: dtype, missing count, non-string rows.
dish_names = df['dish_name']
non_string_rows = df[dish_names.apply(lambda x: not isinstance(x, str))]
print(f"\nData types: {dish_names.dtype}")
print(f"Missing values in dish_name: {dish_names.isna().sum()}")
print(f"Non-string values sample: {non_string_rows.head()}")

# Function to create clean image filename from dish name
def _slugify_name(text):
    """Lowercase *text*, drop special characters, and join words with single underscores."""
    slug = re.sub(r'[^\w\s-]', '', text.lower())
    return re.sub(r'[-\s]+', '_', slug).strip('_')

def create_image_filename(dish_name, original_image_file):
    """
    Build a clean image filename from the dish name.

    The name part is the slugified dish name; the extension is taken from the
    original image file. This function does not de-duplicate names -- the
    caller is responsible for making the results unique.

    Args:
        dish_name: dish name; may be NaN/None/non-string.
        original_image_file: original image path; may be NaN/None.

    Returns:
        "<slug><ext>". When the dish name is missing or slugifies to nothing,
        the original image's base name is used instead; when that is also
        unavailable, the name part is "unnamed". The extension is "" when the
        original image path is missing.
    """
    # Guard the path handling: os.path functions raise on NaN/None.
    if isinstance(original_image_file, str) and original_image_file.strip():
        image_stem, file_extension = os.path.splitext(os.path.basename(original_image_file))
    else:
        image_stem, file_extension = "", ""

    clean_name = _slugify_name(dish_name) if isinstance(dish_name, str) else ""
    if not clean_name:
        # Missing dish name, or one made up only of stripped characters.
        clean_name = _slugify_name(image_stem) or "unnamed"

    return f"{clean_name}{file_extension}"

# Apply image filename creation
print("\nCreating standardized image filenames...")

# Create base image filenames
df['base_image_file'] = df.apply(
    lambda row: create_image_filename(row['dish_name'], row['image_file']),
    axis=1
)

# Handle duplicates by adding incremental IDs
print("\nHandling duplicate image filenames...")

# duplicate_count: base filename -> number of rows that produced it.
# used_names: every final filename already handed out. Needed because a
# suffixed name such as "kabsa_2.jpg" can also occur as a base name of its own
# (dish "Kabsa 2"); counting per base name alone would emit it twice.
duplicate_count = {}
used_names = set()
new_image_files = []

for base_name in df['base_image_file']:
    duplicate_count[base_name] = duplicate_count.get(base_name, 0) + 1
    name_without_ext, ext = os.path.splitext(base_name)

    # First occurrence keeps the base name; later ones get _2, _3, ...
    suffix = duplicate_count[base_name]
    final_name = base_name if suffix == 1 else f"{name_without_ext}_{suffix}{ext}"
    # Skip ahead if that name was already assigned to another row.
    while final_name in used_names:
        suffix += 1
        final_name = f"{name_without_ext}_{suffix}{ext}"

    used_names.add(final_name)
    new_image_files.append(final_name)

df['new_image_file'] = new_image_files

# Before/after preview of the first rows.
print("\nImage filename standardization examples:")
sample_size = min(20, len(df))
preview_rows = df[['dish_name', 'image_file', 'new_image_file']].head(sample_size)
for dish, old_file, new_file in preview_rows.itertuples(index=False, name=None):
    print(f"Dish: {dish}")
    print(f"Original image: {old_file}")
    print(f"New image: {new_file}")
    print("-" * 60)

# Base names that occurred more than once, and how they were resolved.
duplicates = {base: n for base, n in duplicate_count.items() if n > 1}
if not duplicates:
    print("\nNo duplicate image names found!")
else:
    print(f"\nFound {len(duplicates)} image names with duplicates:")
    for base, n in list(duplicates.items())[:15]:
        print(f"  - {base}: {n} occurrences")

    print("\nExamples of duplicate resolution:")
    for duplicate_name in list(duplicates)[:10]:
        matching_rows = df[df['base_image_file'] == duplicate_name]
        print(f"\n{duplicate_name}:")
        for new_file, dish in zip(matching_rows['new_image_file'], matching_rows['dish_name']):
            print(f"  - {new_file} (from: {dish})")

# Output frame: temporary columns removed, the old name kept in
# 'image_file_original', and 'image_file' replaced by the new name.
df_final = df.drop(columns=['base_image_file', 'new_image_file'])
df_final['image_file_original'] = df['image_file']
df_final['image_file'] = df['new_image_file']

print(f"\nFinal data shape: {df_final.shape}")

# Write the result.
output_filename = 'SaudiFoodFile_final_cleaned.csv'
df_final.to_csv(output_filename, index=False)
print(f"\nFinal cleaned data saved to: {output_filename}")

# Headline numbers.
print("\n" + "="*80)
print("IMAGE FILENAME STANDARDIZATION SUMMARY")
print("="*80)
print(f"Total dishes: {len(df_final)}")
print(f"Unique original image names: {df['image_file'].nunique()}")
print(f"Unique new image names: {df_final['image_file'].nunique()}")
print(f"Duplicates handled: {len(duplicates)}")

# The 15 most frequent dish names with every image assigned to them.
print("\nMost common dish names and their new image files:")
common_dishes = df_final['dish_name'].value_counts().head(15)
for dish, count in common_dishes.items():
    print(f"\n{dish} (appears {count} times):")
    for img in df_final.loc[df_final['dish_name'] == dish, 'image_file']:
        print(f"  - {img}")

# Names shorter than three characters are likely cleaning artefacts.
print("\nChecking for problematic dish names:")
if 'dish_name' in df_final.columns:
    short_names = df_final[df_final['dish_name'].str.len() < 3]
else:
    short_names = pd.DataFrame()
if len(short_names) > 0:
    print("Very short dish names found:")
    for dish, img in zip(short_names['dish_name'], short_names['image_file']):
        print(f"  - '{dish}' -> {img}")

# Old -> new mapping for the first 30 rows, for eyeballing.
print("\nComplete filename mapping (first 30 entries):")
print("Dish Name -> Original Image -> New Image")
mapping_preview = df_final[['dish_name', 'image_file_original', 'image_file']].head(30)
for dish, old_file, new_file in mapping_preview.itertuples(index=False, name=None):
    dish_name = dish if isinstance(dish, str) else "MISSING_NAME"
    print(f"{dish_name} -> {old_file} -> {new_file}")

# Rows whose dish name is missing altogether.
missing_dish_names = df_final[df_final['dish_name'].isna()]
if len(missing_dish_names) > 0:
    print(f"\nWARNING: Found {len(missing_dish_names)} rows with missing dish names:")
    for idx, old_file, new_file in zip(
        missing_dish_names.index,
        missing_dish_names['image_file_original'],
        missing_dish_names['image_file'],
    ):
        print(f"  - Row {idx}: Original image: {old_file}, New image: {new_file}")

## **Data Cleaning - Image File Renaming**

 **Image File Renaming (Done in a Separate Colab) with the name (renaming images file)**

 In this step, which was performed in a separate Colab notebook,
 we renamed all the image files on disk to match their corresponding
 standardized names in the CSV file
  
  
  What this step does: renames the image files on disk to match the standardized image names
          listed in the CSV file.
 What it does:
 1) Reads the CSV (which contains the mapping between old and new image names).
2) Finds each original image file in your folder.
 3) Renames it to the corresponding new standardized name.
4) Creates a backup (optional) before renaming, to keep the original files safe.
 5) Reports missing or renamed files for verification.
#
 Notes:
 - This step actually changes filenames in your images folder, unlike the earlier
   CSV-only step that just updated name references in the file.
 - Make sure to set the correct folder path for your images before running.

## **Data Cleaning - Ingredient Cleaning**



### **Ingredient Cleaning -Step 1 — Removing Extra Columns**

Before starting ingredient cleaning, the dataset contained two unnecessary columns at the end.
This step removes them to keep the file clean and consistent.



**Actions performed:**

1.    Checked that the file exists.

1.    Dropped the last two columns using positional indexing.

1.    Saved the updated version for the next steps.

**Outputs:**
'Standerlized_file_cleaned.csv'

In [None]:
# ===========================================
# 🧹 Remove Last Two Columns from a CSV (by path)
# ===========================================

import pandas as pd
import os

# 1️⃣ Set your file paths
input_path = "SaudiFoodFile_final_cleaned.csv"   # 🔹 Change this to your actual file path
# Relative path, so the file lands where the ingredient-cleaning step reads it
# from ("Standerlized_file_cleaned.csv") and the cell also works outside Colab,
# where "/content" may not exist.
output_path = "Standerlized_file_cleaned.csv"

# 2️⃣ Make sure the file exists
if not os.path.exists(input_path):
    raise FileNotFoundError(f"❌ File not found at: {input_path}")

# 3️⃣ Load the CSV
df = pd.read_csv(input_path)
print("✅ Original shape:", df.shape)

# 4️⃣ Identify the last two columns and sanity-check them.
# The earlier cleaning cells append 'dish_name_original' and
# 'image_file_original' as the last two columns; warn if something else is
# about to be removed, because the drop below is purely positional.
removed_cols = df.columns[-2:].tolist()
expected_cols = {"dish_name_original", "image_file_original"}
if set(removed_cols) != expected_cols:
    print(f"⚠️ Last two columns are {removed_cols}, expected {sorted(expected_cols)}; check the input file.")

# 5️⃣ Drop the last two columns
df_dropped = df.iloc[:, :-2]
print("✅ New shape after removing last two columns:", df_dropped.shape)
print("🗑️ Removed columns:", removed_cols)

# 6️⃣ Save the cleaned CSV
df_dropped.to_csv(output_path, index=False)
print(f"✅ Cleaned file saved to: {output_path}")




### **Ingredient Cleaning - Step 2 — Ingredient Cleaning**

This step cleans and standardizes all ingredient information before the consistency check.
Some dishes originally had messy or incomplete ingredient lists — for example:

> `"olive oil / tomatoes - onion, chilli"` or sometimes just `"unknown"`

The goal here is to convert everything into clear, structured ingredient lists such as:

> `["olive oil", "tomato", "onion", "chili"]`
> and to replace unclear or missing entries like `["unknown"]` with the clear label `["unknown_ingredients"]`.

**What this step does:**

1. **Splits ingredients correctly:**
   Separates text using real separators (`|`, `/`, `,`, `;`, Arabic commas, or dashes) without breaking multi-word names.

2. **Protects multi-word ingredients:**
   Keeps terms like *olive oil* or *tomato paste* together as one ingredient.

3. **Removes non-ingredient words:**
   Drops extra words such as *add*, *garnish*, *with*, or *hot* that aren’t actual ingredients.

4. **Normalizes plurals and spellings:**
   Converts plurals (*tomatoes → tomato*) and unifies spellings (*chilli*, *chilies* → *chili*).

5. **Handles unknown or missing entries:**
   Any ingredient cell that is empty or simply says `"not found"` or `"unknown"` becomes `["unknown_ingredients"]` (the label described above) to keep the format consistent.

6. **Embedding + Clustering (semantic cleaning):**
   group similar ingredient names (like *cardamon* and *cardamom*) into one canonical form.

7. **Creates standardized ingredient lists:**
   Every dish ends up with a clean list of consistent, machine-readable ingredients.

**Outputs:**

* `SaudiFoodFile_standardized.csv` → the final cleaned ingredient lists per dish. will be used in the next step.
* `ingredient_clusters_report.csv` → groups of similar ingredients and their canonical names (helper)
* `ingredient_canonical_map.json` → mapping of each ingredient to its canonical form (helper)


In [None]:

# ============================================================
# Student 2 — Ingredient Cleaning (phrase-aware, no space-splitting mistakes)
# - Split on real separators only (| / \ , ; ، - with spaces, bullets)
# - Protect multi-word ingredient phrases (KEEP_PHRASES)
# - Extract ingredients from stray sentences; drop non-ingredient words
# - Canonicalize with ALIASES + singularization + fuzzy nudge
# - Output only: dish_name | classifications_std_list | image_file | scrape_date
# ============================================================

!pip install -q pandas sentence-transformers scikit-learn rapidfuzz inflect

import re, json, pandas as pd
from collections import Counter, defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from rapidfuzz import process, fuzz
import inflect

# --- Unknown/placeholder detection ---
# Whole-string placeholders that mean "no ingredient information".
UNKNOWN_PATTERNS = [
    r"^\s*not\s*found\s*$",
    r"^\s*unknown\s*$",
    r"^\s*n/?a\s*$",
    r"^\s*none\s*$",
    r"^\s*null\s*$",
    r"^\s*missing\s*$",
    r"^\s*بدون\s*$",           # Arabic: without
    r"^\s*غير\s*متوفر\s*$",   # Arabic: unavailable
]

# Compiled once, case-insensitive.
UNKNOWN_REGEXES = [re.compile(p, re.IGNORECASE) for p in UNKNOWN_PATTERNS]

def is_unknown_text(s: str) -> bool:
    """Return True when the whole of *s* is one of the placeholder values above."""
    for placeholder_rx in UNKNOWN_REGEXES:
        if placeholder_rx.match(s):
            return True
    return False


# ----------------------------
# Config
# ----------------------------
# Input for this step; the column-dropping cell above saves a file with this name.
INPUT_CSV  = "Standerlized_file_cleaned.csv"
# Output file names (see the "Outputs" list in the notes for this step).
OUTPUT_CSV = "SaudiFoodFile_standardized.csv"
REPORT_CSV = "ingredient_clusters_report.csv"
MAP_JSON   = "ingredient_canonical_map.json"

# Possible spellings of the classifications column name.
# NOTE(review): presumably the first one present in the CSV is used -- the
# lookup is not in this part of the file; confirm.
CLASS_COL_CANDIDATES = ["classifications", "classifications_en", "classfications"]
# Sentence-embedding model identifier (presumably passed to SentenceTransformer -- confirm).
MODEL_NAME = "sentence-transformers/paraphrase-MiniLM-L6-v2"
# NOTE(review): looks like the clustering distance threshold and the rapidfuzz
# score cutoff respectively; their use is outside this view -- confirm.
DISTANCE_THRESHOLD = 0.35
FUZZY_SCORE_CUTOFF = 92

# ----------------------------
# Phrase protection & vocabulary
# ----------------------------
# Multi-word ingredients to KEEP as single tokens
# protect_phrases() joins the words of these phrases with underscores so they
# are not broken apart when the text is later split into words.
# All entries are lowercase; matching in protect_phrases() is case-insensitive.
KEEP_PHRASES = {
    # oils / dairy
    "olive oil", "vegetable oil", "clarified butter", "milk powder", "butter milk", "heavy cream",
    # stocks / sauces / pastes
    "tomato paste", "tomato sauce", "soy sauce", "pomegranate molasses", "date molasses", "rose water", "orange blossom water",
    # veg & herbs
    "bell pepper", "green onion", "spring onion", "bay leaves", "mint leaves", "parsley leaves", "coriander leaves",
    # spices
    "black lime", "mixed spices", "spice mix", "allspice", "black pepper", "white pepper",
    # proteins / grains
    "basmati rice", "chicken stock", "beef stock", "vegetable stock",
}

# Synonyms/variants → canonical (lowercase)
# Looked up with the lowercased token (underscores turned back into spaces);
# a token that is not a key here is kept as it is.
ALIASES = {
    # souring / lime
    "dried lime": "black lime", "dried limes": "black lime", "omani lime": "black lime",
    "omani limes": "black lime", "loomi": "black lime", "black lemon": "black lime",

    # oils/fats/dairy
    "veg oil": "vegetable oil", "olive oils": "olive oil", "butter milk": "buttermilk",
    "yoghurt": "yogurt", "labnah": "labneh",

    # herbs & veg
    "cilantro": "coriander", "coriander leaves": "coriander", "green coriander": "coriander",
    "parsley leaves": "parsley", "mint leaves": "mint", "spring onion": "green onion",

    # peppers
    "capsicum": "bell pepper", "green pepper": "bell pepper", "sweet pepper": "bell pepper",
    "chilli": "chili", "chilies": "chili", "chillies": "chili", "chili pepper": "chili", "chili peppers": "chili",

    # powders & sticks -> base spice
    "turmeric powder": "turmeric", "ginger powder": "ginger",
    "garlic powder": "garlic", "onion powder": "onion",
    "cinnamon stick": "cinnamon", "cinnamon sticks": "cinnamon",
    "cardamon": "cardamom",

    # sauces/pastes/syrups
    "tomato purée": "tomato paste", "tomato puree": "tomato paste",
    "simple syrup": "sugar syrup",

    # pulses/grains
    "garbanzo beans": "chickpeas", "garbanzo bean": "chickpeas",
    "chick peas": "chickpeas", "chick pea": "chickpeas",
    "black eyed beans": "black-eyed beans", "black eyed pea": "black-eyed beans",

    # spice mixes / generic
    "spice mix": "mixed spices", "mix spices": "mixed spices", "spices mix": "mixed spices"
}

# Words that are NOT ingredients (filler, verbs, instructions)
# drop_non_ingredients() removes any single word found in this set.
# NOTE(review): "cream" is listed here, so a bare "cream" token is discarded
# (only the protected phrase "heavy cream" survives) -- confirm this is intended.
# NOTE(review): "seauted" looks like a misspelling kept on purpose to match the data -- confirm.
NON_ING_WORDS = {
    "after","decorate","decoration","decorations","add","with","such","touch","patriotic","cream",
    "or","and","the","a","an","then","until","when","like","as","to","for","of","into","over",
    "warm","hot","cold","slice","sliced","diced","chopped","minced","ground","crushed","whole",
    "fresh","optional","needed","garnish","make","prepare","preparation","cook","cooked","baked",
    "boiled","fried","seauted","sauteed","browned","mix","mixed","topping","kitchen","precise","instant"
}

# ----------------------------
# Helpers
# ----------------------------
# Separator characters that normalize_separators() rewrites to '|'.
SEP_NORMALIZER = re.compile(r"[|/\\،;,]+")      # | / \ , ; Arabic comma
# Hyphen, en dash or em dash together with any whitespace around it.
# NOTE(review): `\s*` also matches zero spaces, so an unspaced hyphen
# ("black-eyed") is treated as a separator too, although the comments in this
# cell speak of "spaced" hyphens -- confirm which behaviour is wanted.
AROUND_HYPHEN  = re.compile(r"\s*[-–—]\s*")     # spaced hyphens/dashes as separators
# Bullet characters, also rewritten to '|'.
BULLETS        = re.compile(r"[•·]+")

# After splitting, strip these inside tokens
# clean_phrase() replaces matches of both patterns with a space.
PUNCT_DROP_INSIDE = re.compile(r"[-_/\\|]+")
# Anything that is not a word character, whitespace, or a parenthesis.
NONWORD           = re.compile(r"[^\w\s\(\)]")

# Plural -> singular forms checked by to_singular() before asking inflect.
# "cloves" maps to itself, i.e. it is deliberately left plural.
IRREGULAR = {"tomatoes":"tomato","potatoes":"potato","limes":"lime","chillies":"chili","chilies":"chili","cloves":"cloves"}
# Shared inflect engine used by to_singular().
p = inflect.engine()

def safe_str(x):
    """Return str(x), or "" when x is None or reads as blank/"nan"/"none"/"null".

    The placeholder check ignores case and surrounding whitespace; a real
    value is returned exactly as str(x) (not stripped).
    """
    if x is None:
        return ""
    text = str(x)
    is_placeholder = text.strip().lower() in {"", "nan", "none", "null"}
    if is_placeholder:
        return ""
    return text

def protect_phrases(text: str) -> str:
    """Join the words of every KEEP_PHRASES phrase found in *text* with underscores.

    Matching is whole-word and case-insensitive; the replacement is the
    phrase as spelled in KEEP_PHRASES.
    """
    protected = text
    # Longest phrases first, so a longer phrase is handled before any shorter
    # phrase it may contain.
    for phrase in sorted(KEEP_PHRASES, key=len, reverse=True):
        whole_phrase = re.compile(r"\b" + re.escape(phrase) + r"\b", re.IGNORECASE)
        protected = whole_phrase.sub(phrase.replace(" ", "_"), protected)
    return protected

def normalize_separators(s: str) -> str:
    """Rewrite every separator in *s* to a single '|' and trim the ends.

    Bullets, hyphens/dashes and the characters matched by SEP_NORMALIZER are
    replaced in that order; runs of '|' are collapsed, and leading/trailing
    '|' and spaces are removed.
    """
    for separator_rx in (BULLETS, AROUND_HYPHEN, SEP_NORMALIZER):
        s = separator_rx.sub("|", s)
    collapsed = re.sub(r"\|{2,}", "|", s)
    return collapsed.strip("| ")

def to_singular(word: str) -> str:
    """Return the lowercase singular form of *word*.

    The IRREGULAR table is consulted first; otherwise the inflect engine is
    asked, and the lowercased word is kept when it returns no usable string.
    """
    lowered = word.strip().lower()
    if lowered not in IRREGULAR:
        singular = p.singular_noun(lowered)
        if not isinstance(singular, str) or not singular:
            return lowered
        return singular
    return IRREGULAR[lowered]

def clean_phrase(s: str) -> str:
    """Lowercase *s*, turn inner punctuation into spaces, and collapse whitespace."""
    text = s.strip().lower()
    # Both patterns are replaced by a space, in this order.
    for unwanted in (PUNCT_DROP_INSIDE, NONWORD):
        text = unwanted.sub(" ", text)
    return re.sub(r"\s+", " ", text).strip()

def drop_non_ingredients(words):
    """Return *words* without NON_ING_WORDS entries and without one-character words."""
    kept = []
    for word in words:
        if word in NON_ING_WORDS or len(word) <= 1:
            continue
        kept.append(word)
    return kept

def split_to_tokens(cell) -> list[str]:
    """
    Turn one raw classifications cell into an ordered, de-duplicated token list.

    - Empty cells and whole-cell placeholders (per ``is_unknown_text``) give [],
      which the caller later replaces with ["unknown_ingredients"].
    - Separators are unified to '|' and KEEP_PHRASES are underscore-protected
      before the cell is split on '|'.
    - A part that is itself a protected phrase is kept whole (underscores
      turned back into spaces). Any other part has its protected phrases pulled
      out first; the remaining text is cleaned, filtered with
      ``drop_non_ingredients`` and singularized word by word.
    - Every token is lower-cased, mapped through ALIASES and de-duplicated in
      first-seen order.

    Args:
        cell: raw cell value; anything ``safe_str`` accepts.

    Returns:
        list[str]: the extracted tokens, possibly empty.
    """
    raw = safe_str(cell)
    if not raw or is_unknown_text(raw):
        return []

    protected = protect_phrases(normalize_separators(raw))
    parts = [piece.strip() for piece in protected.split("|") if piece.strip()]

    # Loop-invariant: phrases longest-first, each paired with its protected form.
    phrases = [
        (ph, ph.replace(" ", "_"))
        for ph in sorted(KEEP_PHRASES, key=lambda x: -len(x))
    ]

    tokens = []
    for part in parts:
        # Placeholder fragments ("not found", ...) contribute nothing.
        if is_unknown_text(part):
            continue

        # The part is exactly one protected phrase: keep it whole.
        # Parentheses spell out the precedence the original relied on.
        if ("_" in part and part in KEEP_PHRASES) or part.replace("_", " ") in KEEP_PHRASES:
            tokens.append(part.replace("_", " "))
            continue

        # 1) Pull out protected phrases embedded in free text.
        found = []
        rest = part
        for ph, ph_prot in phrases:
            if ph_prot.lower() in rest.lower():
                found.append(ph)
                rest = re.sub(re.escape(ph_prot), " ", rest, flags=re.IGNORECASE)

        # 2) Whatever is left is treated as individual candidate words.
        rest = clean_phrase(rest.replace("_", " "))
        words = [to_singular(w) for w in drop_non_ingredients(rest.split())]

        tokens.extend(found)
        tokens.extend(words)

    # Alias lookup on the normalized spelling; empty results are dropped.
    out = []
    for tok in tokens:
        key = tok.strip().lower().replace("_", " ")
        alias = ALIASES.get(key, key)
        if alias:
            out.append(alias)

    # dict preserves insertion order, so this de-duplicates in first-seen order.
    return list(dict.fromkeys(out))

def normalize_for_clustering(token: str) -> str:
    """Trim and lower-case *token*, squeezing inner whitespace runs to one space."""
    lowered = token.strip().lower()
    return re.sub(r"\s+", " ", lowered)

# ----------------------------
# Load data (robust to "NA")
# ----------------------------
# NA parsing is switched off so every cell is read as literal text.
df = pd.read_csv(INPUT_CSV, encoding="utf-8", keep_default_na=False, na_filter=False)
n_rows = len(df)

# The first candidate column name present in the file is used.
CLASS_COL = None
for _candidate in CLASS_COL_CANDIDATES:
    if _candidate in df.columns:
        CLASS_COL = _candidate
        break
if not CLASS_COL:
    raise ValueError(f"Could not find classifications column. Found: {list(df.columns)}")

# ----------------------------
# Tokenize all rows with phrase-aware extractor
# ----------------------------
raw_lists = df[CLASS_COL].apply(split_to_tokens)

# Per-row normalized tokens (empty results dropped) ...
cleaned_all = []
for tokens in raw_lists:
    normalized = (normalize_for_clustering(t) for t in tokens)
    cleaned_all.append([nt for nt in normalized if nt])

# ... and the same tokens flattened, in row order, for frequency counting.
bag = [nt for cleaned in cleaned_all for nt in cleaned]

freq = Counter(bag)
unique_tokens = list(freq)

# Nothing to cluster: mark every row unknown, write the file and stop here.
if not unique_tokens:
    df["classifications_std_list"] = [["unknown_ingredients"] for _ in range(n_rows)]
    out = df[["dish_name","classifications_std_list","image_file","scrape_date"]]
    out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
    print(f"✅ Done (all unknown). Rows: {len(out)}")
    raise SystemExit

# ----------------------------
# Embed & cluster
# ----------------------------
# Embed each distinct token once; normalize_embeddings=True is passed to encode().
model = SentenceTransformer(MODEL_NAME)
emb = model.encode(unique_tokens, show_progress_bar=True, normalize_embeddings=True)

# No fixed cluster count: clustering is driven by DISTANCE_THRESHOLD.
# The constructor is tried with `metric=` first and retried with `affinity=`
# on TypeError — presumably to support scikit-learn versions that only know
# the older keyword (TODO confirm against the pinned version).
try:
    clust = AgglomerativeClustering(
        n_clusters=None, linkage="average",
        metric="cosine", distance_threshold=DISTANCE_THRESHOLD
    )
except TypeError:
    clust = AgglomerativeClustering(
        n_clusters=None, linkage="average",
        affinity="cosine", distance_threshold=DISTANCE_THRESHOLD
    )
labels = clust.fit_predict(emb)

# Group tokens by cluster label.
clusters = defaultdict(list)
for tok, lab in zip(unique_tokens, labels):
    clusters[lab].append(tok)

# canonical per cluster: most frequent (tie -> shortest)
cluster_canonical = {lab: sorted(toks, key=lambda t: (-freq[t], len(t)))[0] for lab, toks in clusters.items()}
# Every token maps to the canonical member of its own cluster.
token2canon = {tok: cluster_canonical[lab] for tok, lab in zip(unique_tokens, labels)}

# optional fuzzy nudge
# NOTE: canonical_vocab is a snapshot taken before the loop below, so
# reassignments made inside the loop do not change the candidate list.
canonical_vocab = list(set(token2canon.values()))
def fuzzy_canon(token: str, cutoff=FUZZY_SCORE_CUTOFF):
    """Return the best WRatio match for *token* in canonical_vocab, else *token*.

    *token* is returned unchanged when extractOne yields no result —
    presumably when no candidate reaches *cutoff* (confirm against the
    fuzzy-matching library imported as ``process``/``fuzz``).
    """
    best = process.extractOne(token, canonical_vocab, scorer=fuzz.WRatio, score_cutoff=cutoff)
    return best[0] if best else token
# Re-point a token to its fuzzy match only when that match differs from the
# current canonical and is at least as frequent as it.
for tok in list(token2canon.keys()):
    cand = fuzzy_canon(tok)
    if cand != token2canon[tok] and freq.get(cand, 0) >= freq.get(token2canon[tok], 0):
        token2canon[tok] = cand

# ----------------------------
# Apply mapping to EVERY row
# ----------------------------
# One canonical list per input row: map each token, drop repeats while keeping
# first-seen order, and fall back to the unknown marker for empty rows.
std_lists = [
    list(dict.fromkeys(token2canon.get(t, t) for t in cleaned)) or ["unknown_ingredients"]
    for cleaned in cleaned_all
]

assert len(std_lists) == n_rows
df["classifications_std_list"] = std_lists

# --- Force unknown_ingredients for placeholder fragments like ["not","found"] ---
def coalesce_unknown(lst):
    """Return ["unknown_ingredients"] for empty or placeholder-only lists.

    A list is treated as a placeholder when it is empty, when its stripped,
    lower-cased items are exactly ["not", "found"] or ["unknown"], or when
    those items joined with spaces satisfy ``is_unknown_text``. Any other
    list is returned unchanged.
    """
    unknown = ["unknown_ingredients"]
    if not lst:
        return unknown
    lowered = [item.strip().lower() for item in lst]
    if lowered in (["not", "found"], ["unknown"]):
        return unknown
    if is_unknown_text(" ".join(lowered)):
        return unknown
    return lst

# Replace empty / placeholder-only lists in every row with ["unknown_ingredients"].
df["classifications_std_list"] = df["classifications_std_list"].apply(coalesce_unknown)


# ---------------------------------------------------------
# Post-processing consistency fix: collapse generic + specific variants
# ---------------------------------------------------------
# Token → replacement table used by collapse_variants(). The replacement is
# unconditional: it happens whether or not the preferred token is in the list.
canonical_collapse = {
    # generic → preferred
    "oil": "olive oil",
    "vegetable oil": "olive oil",
    "ghee": "butter",
    "yogurt": "labneh",       # example mapping; change if labneh is not the preferred term
    # extend with further pairs as they are noticed
}

def collapse_variants(lst):
    """Map each token through canonical_collapse and drop repeats.

    Tokens missing from the table pass through unchanged; the first
    occurrence of each resulting token fixes its position in the output.
    """
    replaced = (canonical_collapse.get(token, token) for token in lst)
    return list(dict.fromkeys(replaced))

# Apply the canonical_collapse replacements to every row's list.
df["classifications_std_list"] = df["classifications_std_list"].apply(collapse_variants)



# ----------------------------
# Reports & mapping
# ----------------------------
# One report row per (cluster, member token), members in sorted order.
rows = [
    {"cluster_id": lab, "canonical": cluster_canonical[lab], "member": t, "member_freq": freq[t]}
    for lab, toks in clusters.items()
    for t in sorted(toks)
]
report = pd.DataFrame(rows).sort_values(["canonical","member"])
report.to_csv(REPORT_CSV, index=False, encoding="utf-8")

# Persist the token → canonical mapping as JSON.
with open(MAP_JSON, "w", encoding="utf-8") as fh:
    json.dump(token2canon, fh, ensure_ascii=False, indent=2)

# ----------------------------
# Save ONLY the requested 4 columns
# ----------------------------
# Fail with a clear message if any pass-through column is absent.
required_cols = ["dish_name", "image_file", "scrape_date"]
present = set(df.columns)
missing = [col for col in required_cols if col not in present]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

# Write the four output columns in their fixed order.
final_columns = ["dish_name","classifications_std_list","image_file","scrape_date"]
out = df[final_columns]
out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print("✅ Done.")
print(f"Rows in/out: {n_rows} / {len(out)}")
print(f"Example rows:\n{out.head(6).to_string(index=False)}")




### **Ingredient Cleaning - Step 3 — Final Consistency Pass**

After cleaning and standardizing the ingredients, some names were still not completely consistent.
For example, a few dishes still used slightly different spellings or duplicate ingredient terms such as:

> ["paper", "chilli", "olive oils"] instead of ["chili", "olive oil"] — here "paper" and "chilli" are both rewritten to "chili", and the resulting duplicate is removed.

This step was added to double-check and correct any remaining inconsistencies in spelling, wording, or duplicates.
It reviews every ingredient list and makes final adjustments to ensure that all rows follow the same standard format.

In [None]:
# ============================================================
# Consistency pass over the produced CSV
# - Enforce canonical spellings & synonyms (e.g., chilli → chili)
# - Your requests: paper → chili, allspices → mixed spices
# - Remove non-ingredient words, dedupe, keep only 4 columns
# Outputs:
#   • SaudiFoodFile_standardized_consistent.csv
#   • consistency_changes_report.csv (what changed)
# ============================================================

import ast, re, json, pandas as pd

# File paths for the consistency pass. NOTE: these rebind INPUT_CSV, OUTPUT_CSV
# and REPORT_CSV, names the preceding standardization code also reads.
INPUT_CSV  = "SaudiFoodFile_standardized.csv"   # presumably the file written by the previous step — confirm its OUTPUT_CSV
OUTPUT_CSV = "SaudiFoodFile_standardized_consistent.csv"
REPORT_CSV = "consistency_changes_report.csv"

# Column holding the ingredient lists; adjust here if your column name differs.
CLASS_COL = "classifications_std_list"
# Columns that must exist in INPUT_CSV (checked right after loading).
REQ_COLS  = ["dish_name", CLASS_COL, "image_file", "scrape_date"]

# ---------- canonicalization config ----------
# ✅ Add/adjust anything you want here:
# Variant token → canonical token. apply_aliases() looks up each stripped,
# lower-cased token here (exact match on the whole token) after it has dropped
# tokens listed in NON_ING_WORDS.
ALIASES = {
    # explicitly requested rewrites
    "paper": "chili",              # e.g., OCR/typo -> chili
    "allspices": "mixed spices",
    "all-spices": "mixed spices",
    "all spice": "mixed spices",
    "all-spice": "mixed spices",
    "allspice": "mixed spices",    # "mixed spices" is the chosen canonical name

    # common spellings / plurals / variants
    "chilli": "chili",
    "chilies": "chili",
    "chillies": "chili",
    "green chilli": "chili",
    "green chili": "chili",
    "red chili": "chili",
    "red chilli": "chili",
    "chili pepper": "chili",
    "chili peppers": "chili",

    # dried / black lime names
    "black lemon": "black lime",
    "dried lime": "black lime",
    "dried limes": "black lime",
    "omani lime": "black lime",
    "omani limes": "black lime",
    "loomi": "black lime",

    # oils
    "veg oil": "vegetable oil",
    "olive oils": "olive oil",

    # herbs and fresh vegetables
    "cilantro": "coriander",
    "coriander leaves": "coriander",
    "green coriander": "coriander",
    "parsley leaves": "parsley",
    "mint leaves": "mint",
    "spring onion": "green onion",
    "capsicum": "bell pepper",
    "green pepper": "bell pepper",
    "sweet pepper": "bell pepper",

    # spice forms collapse to the base spice
    "turmeric powder": "turmeric",
    "ginger powder": "ginger",
    "garlic powder": "garlic",
    "onion powder": "onion",
    "cinnamon stick": "cinnamon",
    "cinnamon sticks": "cinnamon",
    "cardamon": "cardamom",

    # pastes and syrups
    "tomato puree": "tomato paste",
    "tomato purée": "tomato paste",
    "simple syrup": "sugar syrup",

    # legumes
    "garbanzo bean": "chickpeas",
    "garbanzo beans": "chickpeas",
    "chick pea": "chickpeas",
    "chick peas": "chickpeas",
    "black eyed bean": "black-eyed beans",
    "black eyed beans": "black-eyed beans",
}

# If BOTH appear, drop the generic in favor of the preferred.
# (You can add more pairs here.)
# (generic, preferred) pairs read by collapse_generics(): the generic token is
# removed from a row only when the preferred token is in that same row. Pairs
# are evaluated in list order against the shrinking token set.
COLLAPSE_IF_PRESENT = [
    ("oil", "olive oil"),          # keep "olive oil", drop "oil"
    ("vegetable oil", "olive oil"),# keep "olive oil", drop "vegetable oil"
    ("pepper", "chili"),           # if chili is present, drop generic "pepper"
]

# Words to drop if they sneak in (not ingredients)
# Tokens that are not ingredients. apply_aliases() compares the whole stripped,
# lower-cased token against this set, so a multi-word token such as
# "mixed spices" is not dropped just because it contains "mixed".
# NOTE(review): "seauted" looks like a deliberate catch for a misspelling in
# the data — confirm before removing it.
NON_ING_WORDS = {
    "after","decorate","decoration","decorations","add","with","such","touch","patriotic",
    "or","and","the","a","an","then","until","when","like","as","to","for","of","into","over",
    "warm","hot","cold","slice","sliced","diced","chopped","minced","ground","crushed","whole",
    "fresh","optional","needed","garnish","make","prepare","preparation","cook","cooked","baked",
    "boiled","fried","seauted","sauteed","browned","mix","mixed","topping","kitchen","precise","instant"
}

# ---------- helpers ----------
def parse_list_cell(x):
    """Parse one classifications cell into a list of stripped, lower-cased tokens.

    Accepted inputs:
    - a list or tuple of tokens;
    - a string holding a Python list/tuple literal, e.g. "['rice', 'chicken']";
    - any other string, which is split on '|' or commas as a fallback.

    Missing values return [] instead of becoming fake tokens: None, float NaN
    (what pandas yields for an empty cell under default NA parsing), and the
    placeholder strings "", "nan", "none", "null" (case-insensitive).

    Args:
        x: the raw cell value.

    Returns:
        list[str]: the parsed tokens, possibly empty.
    """
    def _clean(items):
        # Stringify, trim and lower-case each item; drop blanks.
        return [str(t).strip().lower() for t in items if str(t).strip()]

    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return []
    if isinstance(x, (list, tuple)):
        return _clean(x)

    s = str(x).strip()
    if s.lower() in {"", "nan", "none", "null"}:
        return []

    try:
        val = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a Python literal: fall through to the plain-text split below.
        val = None
    if isinstance(val, (list, tuple)):
        return _clean(val)

    # fallback: split by | or comma
    parts = re.split(r"\s*\|\s*|,+", s)
    return [part.strip().lower() for part in parts if part.strip()]

def apply_aliases(tokens):
    """Canonicalize *tokens* via ALIASES, dropping blanks and NON_ING_WORDS.

    Each token is stripped and lower-cased first. The result keeps the first
    occurrence of every canonical token, in input order.
    """
    canonical = []
    for raw in tokens:
        key = raw.strip().lower()
        if key and key not in NON_ING_WORDS:
            canonical.append(ALIASES.get(key, key))
    # dict keys keep insertion order, which de-duplicates in first-seen order.
    return list(dict.fromkeys(canonical))

def collapse_generics(tokens):
    """Drop a generic token when its preferred counterpart is also present.

    Pairs come from COLLAPSE_IF_PRESENT and are checked in order against a
    working set that shrinks as generics are removed. Surviving tokens are
    returned in their original order without repeats.
    """
    present = set(tokens)
    for generic, preferred in COLLAPSE_IF_PRESENT:
        if generic in present and preferred in present:
            present.discard(generic)
    survivors = (token for token in tokens if token in present)
    return list(dict.fromkeys(survivors))

# ---------- load ----------
# Read every cell as literal text, as the earlier standardization step does.
# With pandas' default NA parsing, values such as "NA", "null" or "None" (and
# empty cells) become NaN, which would blank dish names and corrupt list cells.
df = pd.read_csv(INPUT_CSV, encoding="utf-8", keep_default_na=False, na_filter=False)
missing = [c for c in REQ_COLS if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# ---------- process ----------
changes = []
new_lists = []
for idx, cell in df[CLASS_COL].items():
    original = parse_list_cell(cell)
    aliased = apply_aliases(original)
    collapsed = collapse_generics(aliased)
    # The value actually written: empty results fall back to the unknown marker.
    final = collapsed if collapsed else ["unknown_ingredients"]

    # track changes (only if different); "after" is exactly what gets saved
    if original != final:
        changes.append({
            "row": idx,
            "before": json.dumps(original, ensure_ascii=False),
            "after":  json.dumps(final, ensure_ascii=False),
        })
    new_lists.append(final)

df[CLASS_COL] = new_lists

# keep only requested 4 columns, same order
out = df[["dish_name", CLASS_COL, "image_file", "scrape_date"]]
out.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

# Explicit columns keep the report header even when nothing changed.
pd.DataFrame(changes, columns=["row", "before", "after"]).to_csv(
    REPORT_CSV, index=False, encoding="utf-8"
)

print("✅ Consistency pass complete.")
print(f"• Output CSV: {OUTPUT_CSV}")
print(f"• Changes report: {REPORT_CSV}")
print(f"Rows changed: {len(changes)}")



