In [2]:
import os
import sys

from dotenv import load_dotenv

# Read the project's .env file so that src_folder / data_folder become
# visible through os.getenv below.
load_dotenv()

src_folder = os.getenv("src_folder")
data_folder = os.getenv("data_folder")
print(f"Source folder: {src_folder}")
print(f"Data folder: {data_folder}")

# Fail fast when the configuration is incomplete. Without this check a missing
# src_folder puts None on sys.path (which the import system silently skips),
# and the problem only surfaces later as an unrelated-looking import or path
# error in another cell.
_missing_settings = [
    name
    for name, value in (("src_folder", src_folder), ("data_folder", data_folder))
    if not value
]
if _missing_settings:
    raise EnvironmentError(
        "Missing environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the .env file loaded by load_dotenv()."
    )

# Guarded so that re-running this cell does not pile up duplicate entries.
if src_folder not in sys.path:
    sys.path.append(src_folder)

Source folder: /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/src
Data folder: /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/data


In [41]:
import pandas as pd
from preprocessing import deduplicate_dataset

# Display settings: never truncate cell text, and show up to 100 rows.
for option_name, option_value in (
    ("display.max_colwidth", None),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

# Semantic deduplication through the project helper. It returns two frames:
# the records that were kept and the records that were filtered out.
dedup_frames = deduplicate_dataset(
    input_file="MM-Food-100K.csv",
    output_file="MM-Food-100K-deduplicated.csv",
    threshold=0.9,
)
df_deduplicated, df_filtered = dedup_frames

kept_record_count = len(df_deduplicated)
print(f"Deduplication complete. Final dataset has {kept_record_count} records")

INFO:preprocessing:Loading data from /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/data
INFO:preprocessing:Loaded 100000 records from MM-Food-100K.csv
INFO:preprocessing:Combining columns: ['dish_name', 'ingredients']
INFO:model2vec.hf_utils:Folder does not exist locally, attempting to use huggingface hub.
INFO:preprocessing:Deduplicated dataset has 21783 records
INFO:preprocessing:Filtered dataset has 78217 records
INFO:preprocessing:Saved deduplicated dataset: 21783 records to MM-Food-100K-deduplicated.csv


Deduplication complete. Final dataset has 21783 records


In [31]:
# Load the raw dataset directly from the configured data folder. The previous
# path went up one level and back down into a directory literally named
# "data", which only resolved correctly when data_folder had that exact name.
df_mm = pd.read_csv(os.path.join(data_folder, "MM-Food-100K.csv"))

In [32]:
# Distinct dish names in order of first appearance; a missing name (NaN) is
# kept as one of the distinct values.
dishes = pd.unique(df_mm["dish_name"])

In [33]:
# Count missing values in the two columns that feed the combined text,
# printing one count per line in column order.
for column_name in ("dish_name", "ingredients"):
    print(df_mm[column_name].isnull().sum())

2
0


In [34]:
# Peek at the unique dish names around the position where the missing (NaN)
# name shows up. NOTE(review): the indices are hard-coded for this particular
# file and will shift if the data or its row order changes.
dishes[1027:1030]

array(['Fried Rice with Shrimp', 'Stir-fried minced meat with vegetables',
       nan], dtype=object)

In [35]:
# Display the single entry at position 1029 of the unique dish names.
# NOTE(review): the index is hard-coded for this particular file; confirm it
# still points at the missing name if the data changes.
dishes[1029]

nan

In [36]:
# Spot-check a few combined "dish name + ingredients" strings. The sample is
# seeded so that re-running the notebook shows the same rows.
df_deduplicated["combined_text"].sample(5, random_state=42)

30007                                           layered potato dish\npotatoes, cheese, butter
3743               beef and broccoli stir-fry\nbeef, broccoli, cauliflower, soy sauce, garlic
36248                                 fruit salad\nwatermelon, dragon fruit, jelly, pineapple
19439    shrimp toast and yogurt parfait\nshrimp, bread, yogurt, granola, blueberries, coffee
59574                                        canned mandarin oranges\nmandarin oranges, syrup
Name: combined_text, dtype: object

In [37]:
# Keep the filtered-out rows whose dish name does not appear anywhere in the
# deduplicated dataset.
df_filtered_unique = df_filtered[
    ~df_filtered["dish_name"].isin(df_deduplicated["dish_name"])
]
# len() counts rows, not distinct names, so report both figures; the earlier
# message labelled the row count as "unique dish names".
print(
    f"Filtered dataset has {len(df_filtered_unique)} records "
    f"({df_filtered_unique['dish_name'].nunique()} unique dish names) "
    "not in deduplicated dataset"
)

Filtered dataset has 17430 unique dish names not in deduplicated dataset


In [38]:
# Spot-check a few dish names that exist only among the filtered-out rows.
# The sample is seeded so that re-running the notebook shows the same rows.
df_filtered_unique["dish_name"].sample(5, random_state=42)

2610             Flavored Yogurt
7504              Mixed Raw Meat
79169    Rice with sauce and egg
99693      Spicy Stir-Fried Meat
62851        Whole Fish in Sauce
Name: dish_name, dtype: object

In [4]:
# Test the hybrid deduplication function
import os
import sys

# Make the project's source folder importable. The guard avoids putting None
# on sys.path when the variable is unset, and avoids adding a duplicate entry
# each time this cell is re-run.
src_folder = os.getenv("src_folder")
if src_folder and src_folder not in sys.path:
    sys.path.append(src_folder)

from preprocessing import hybrid_deduplicate_dataset

# Run hybrid deduplication
df_hybrid = hybrid_deduplicate_dataset(
    input_file="MM-Food-100K.csv",
    output_file="MM-Food-100K-hybrid-deduplicated.csv",
    threshold=0.85,
)

print("Hybrid deduplication complete!")
# NOTE: the original size is a hard-coded figure, not read from the data.
print("Original dataset: 100,000 records")
print(f"Hybrid deduplication: {len(df_hybrid)} records")
print(f"Unique dish names in hybrid: {df_hybrid['dish_name'].nunique()}")

INFO:preprocessing:Loading data from /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/data
INFO:preprocessing:Loaded 100000 records from MM-Food-100K.csv
INFO:preprocessing:Removed 2 rows with null/empty values in columns ['dish_name']
INFO:preprocessing:Combining columns: ['dish_name', 'ingredients']
INFO:preprocessing:Processing 19288 unique dish names
INFO:preprocessing:Initializing SemHash model...
  from .autonotebook import tqdm as notebook_tqdm
INFO:model2vec.hf_utils:Folder does not exist locally, attempting to use huggingface hub.
INFO:preprocessing:SemHash model initialized successfully
Deduplicating by dish_name: 100%|██████████| 19288/19288 [03:25<00:00, 93.74it/s] 
INFO:preprocessing:Hybrid deduplication complete: 25577 records saved to MM-Food-100K-hybrid-deduplicated.csv


Hybrid deduplication complete!
Original dataset: 100,000 records
Hybrid deduplication: 25577 records
Unique dish names in hybrid: 19288


In [5]:
# Preview the first rows of the hybrid-deduplicated frame.
df_hybrid.head()

Unnamed: 0,image_url,camera_or_phone_prob,food_prob,dish_name,food_type,ingredients,portion_size,nutritional_profile,cooking_method,sub_dt,combined_text
0,https://file.b18a.io/7843322356500104680_44354...,0.7,0.95,Fried Chicken,Restaurant food,"[""chicken"",""breading"",""oil""]","[""chicken:300g""]","{""fat_g"":25.0,""protein_g"":30.0,""calories_kcal""...",Frying,20250704,"fried chicken\nchicken, breading, oil"
1,https://file.b18a.io/7832990761400104990_52307...,0.7,1.0,Fried Chicken,Restaurant food,"[""chicken"",""lettuce"",""green onions"",""lemon"",""t...","[""chicken:300g"",""lettuce:50g"",""green onions:20g""]","{""fat_g"":30.0,""protein_g"":40.0,""calories_kcal""...",Fried,20250626,"fried chicken\nchicken, lettuce, green onions,..."
2,https://file.b18a.io/7839138398700103731_79685...,0.7,0.9,Fried Chicken,Homemade food,"[""chicken"",""sauce"",""sesame seeds""]","[""chicken:300g"",""sauce:50g""]","{""fat_g"":25.0,""protein_g"":30.0,""calories_kcal""...",Fried,20250710,"fried chicken\nchicken, sauce, sesame seeds"
3,https://file.b18a.io/7987824173900103282_48550...,0.7,0.9,Fried Chicken,Homemade food,"[""fried chicken"",""lettuce"",""cucumber"",""sambal ...","[""chicken:300g"",""lettuce:50g"",""cucumber:50g"",""...","{""fat_g"":35.0,""protein_g"":40.0,""calories_kcal""...",Frying,20250720,"fried chicken\nfried chicken, lettuce, cucumbe..."
4,https://file.b18a.io/7841360032000103904_80804...,0.7,0.95,Fried Chicken,Restaurant food,"[""fried chicken"",""broccoli"",""carrots"",""tomato""...","[""chicken:400g"",""vegetables:100g""]","{""fat_g"":45.0,""protein_g"":60.0,""calories_kcal""...",Fried,20250720,"fried chicken\nfried chicken, broccoli, carrot..."


In [8]:
# How many records remain per dish name after hybrid deduplication,
# limited to the 100 most frequent names.
df_hybrid["dish_name"].value_counts().head(100)

dish_name
Hot Pot                111
Mixed Asian Dishes      66
Breakfast Plate         65
Mixed Asian Cuisine     62
Vegetable Salad         55
                      ... 
Steamed Bun             10
Fried Fish              10
Roasted Chicken         10
Mixed Chinese Meal      10
Mixed platter           10
Name: count, Length: 100, dtype: int64

In [7]:
# Number of records in the hybrid-deduplicated frame, shown as the shape of
# its dish_name column.
df_hybrid["dish_name"].shape

(25577,)

In [None]:
import os

# Report the core count that os.cpu_count() sees on this machine.
available_cores = os.cpu_count()
print(f"CPU cores available: {available_cores}")

In [3]:
# Test the parallel hybrid deduplication function
from preprocessing import hybrid_deduplicate_dataset_parallel

# Run parallel hybrid deduplication
df_hybrid_parallel = hybrid_deduplicate_dataset_parallel(
    input_file="MM-Food-100K.csv",
    output_file="MM-Food-100K-hybrid-parallel-deduplicated.csv",
    threshold=0.85,
    max_workers=None,  # Use all available cores
)

print("Parallel hybrid deduplication complete!")
print("Original dataset: 100,000 records")
print(f"Hybrid parallel deduplication: {len(df_hybrid_parallel)} records")
print(
    f"Unique dish names in hybrid parallel: {df_hybrid_parallel['dish_name'].nunique()}"
)

INFO:preprocessing:Loading data from /Users/luvsuneja/Documents/repos/masala-embed/esci-dataset/data
INFO:preprocessing:Loaded 100000 records from MM-Food-100K.csv
INFO:preprocessing:Removed 2 rows with null/empty values in columns ['dish_name']
INFO:preprocessing:Combining columns: ['dish_name', 'ingredients']
INFO:preprocessing:Processing 19288 unique dish names with parallel processing


KeyboardInterrupt: 