In [2]:
import pandas as pd
import os

# Dataset locations. The defaults are the original machine-specific folders;
# set NUTRITION5K_DIR / UECFOOD100_DIR in the environment to run this notebook
# on another machine without editing the cell.
save_path = os.environ.get('NUTRITION5K_DIR', r'E:\My Drive\Nutrition5kDataset')
UECFOOD100_folder_path = os.environ.get(
    'UECFOOD100_DIR', r'D:\Projects\CDS\Capstone Project\UECFOOD100'
)

# Nutrition5k ingredient table (Excel workbook); its 'ingr' column is read by a later cell.
nutrition5k_ingredients = pd.read_excel(os.path.join(save_path, 'ingredients.xlsx'))

# UECFOOD100 category table (tab-separated text); its 'name' column is read by a later cell.
uecfood100_ingredients = pd.read_csv(os.path.join(UECFOOD100_folder_path, 'category.txt'), sep='\t')

In [4]:
# Distinct label values from each dataset, materialised as plain Python lists
# so they can be cleaned and passed to the encoder in a later cell.
nutrition5k_ingredients_list = [*nutrition5k_ingredients['ingr'].unique()]
uecfood100_ingredients_list = [*uecfood100_ingredients['name'].unique()]

In [5]:
from sentence_transformers import SentenceTransformer, util
import torch
import pandas as pd

# 1. Load the sentence-embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')


def _clean_labels(values):
    """Return stripped, non-empty string labels from ``values`` without repeats.

    NaN/None entries are dropped, every remaining value is converted with
    ``str`` and stripped of surrounding whitespace, and labels that end up
    empty or that repeat an earlier label after stripping are skipped.
    First-seen order is preserved.
    """
    stripped = (str(value).strip() for value in values if pd.notna(value))
    return list(dict.fromkeys(text for text in stripped if text))


# 2. Clean the two label lists built earlier in the notebook:
# nutrition5k_ingredients_list = list(nutrition5k_ingredients['ingr'].unique())
# uecfood100_ingredients_list = list(uecfood100_ingredients['name'].unique())
# Re-running this cell is safe: cleaning already-clean labels is a no-op.
nutrition5k_ingredients_list = _clean_labels(nutrition5k_ingredients_list)
uecfood100_ingredients_list = _clean_labels(uecfood100_ingredients_list)

# Fail early with a clear message instead of an opaque tensor error further down.
if not nutrition5k_ingredients_list or not uecfood100_ingredients_list:
    raise ValueError("Both ingredient lists must contain at least one label to build a mapping.")

# 3. Encode both lists
nutrition_embeddings = model.encode(nutrition5k_ingredients_list, convert_to_tensor=True)
uec_embeddings = model.encode(uecfood100_ingredients_list, convert_to_tensor=True)

# 4. Cosine similarity of every Nutrition5k label (rows) against every
#    UECFOOD100 label (columns)
cosine_scores = util.cos_sim(nutrition_embeddings, uec_embeddings)

# 5. For each Nutrition5k label keep the single best-scoring UECFOOD100 label.
#    One row-wise max replaces a per-row argmax loop; on ties the first
#    maximal column is returned, as with argmax.
#    NOTE: every label receives a match, however weak, so filter on
#    similarity_score before trusting a mapping.
best_scores, best_match_indices = cosine_scores.max(dim=1)
mapping = [
    {
        "nutrition5k_ingredient": label,
        "mapped_uecfood100_ingredient": uecfood100_ingredients_list[match_idx],
        # Rounded to three decimals; later filters compare against this rounded value.
        "similarity_score": round(score, 3),
    }
    for label, match_idx, score in zip(
        nutrition5k_ingredients_list,
        best_match_indices.tolist(),
        best_scores.tolist(),
    )
]

# 6. Convert to DataFrame and display the first few rows
mapping_df = pd.DataFrame(mapping)
mapping_df.head()

# Optionally save to CSV
# mapping_df.to_csv("nutrition5k_to_uecfood100_mapping.csv", index=False)


Unnamed: 0,nutrition5k_ingredient,mapped_uecfood100_ingredient,similarity_score
0,cottage cheese,raisin bread,0.521
1,strawberries,potato salad,0.439
2,garden salad,green salad,0.84
3,bacon,sausage,0.54
4,potatoes,potato salad,0.701


In [8]:
# Rows whose best match has a similarity of 1 after the three-decimal rounding
# applied when the mapping was built. No sort is needed: every selected row
# carries the same score, so the original row order is kept.
mapping_df[mapping_df['similarity_score'] == 1]

Unnamed: 0,nutrition5k_ingredient,mapped_uecfood100_ingredient,similarity_score
38,sausage,sausage,1.0
88,spaghetti,spaghetti,1.0
148,sushi,sushi,1.0
193,fried rice,fried rice,1.0
206,fried chicken,fried chicken,1.0
223,potato salad,potato salad,1.0
272,sandwiches,sandwiches,1.0
286,toast,toast,1.0
315,roast chicken,roast chicken,1.0
349,miso soup,miso soup,1.0


In [10]:
# Strong but not perfect matches: 0.7 <= score < 1, best first.
# The upper bound is "< 1" rather than "<= 0.99" because scores are stored with
# three decimals; values such as 0.995 would otherwise fall between this view
# and the "== 1" view and appear in neither.
mapping_df[
    (mapping_df['similarity_score'] >= 0.7) & (mapping_df['similarity_score'] < 1)
].sort_values('similarity_score', ascending=False)

Unnamed: 0,nutrition5k_ingredient,mapped_uecfood100_ingredient,similarity_score
131,croissants,croissant,0.955
138,egg rolls,egg roll,0.946
278,omelets,omelet,0.939
192,hot dogs,hot dog,0.933
9,steak,beef steak,0.925
...,...,...,...
504,soy sauce,spaghetti meat sauce,0.712
385,eel,eels on rice,0.711
344,roast pork,sweet and sour pork,0.703
4,potatoes,potato salad,0.701
