In [1]:
import kagglehub
path = kagglehub.dataset_download("barkataliarbab/usda-fooddata-central-foundation-foods-2025")
print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/barkataliarbab/usda-fooddata-central-foundation-foods-2025/versions/1


In [2]:
!mv /root/.cache/kagglehub/datasets/barkataliarbab/usda-fooddata-central-foundation-foods-2025/versions/1 /content/

mv: cannot move '/root/.cache/kagglehub/datasets/barkataliarbab/usda-fooddata-central-foundation-foods-2025/versions/1' to '/content/1': Directory not empty


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load CSVs
base_path = "/content/1/FoodData_Central_foundation_food_csv_2025-04-24/"

food = pd.read_csv(f"{base_path}food.csv")
nutrient = pd.read_csv(f"{base_path}nutrient.csv")
food_nutrient = pd.read_csv(f"{base_path}food_nutrient.csv")
food_category = pd.read_csv(f"{base_path}food_category.csv")
food_calorie_conversion = pd.read_csv(f"{base_path}food_calorie_conversion_factor.csv")
food_protein_conversion = pd.read_csv(f"{base_path}food_protein_conversion_factor.csv")
food_attribute = pd.read_csv(f"{base_path}food_attribute.csv")
food_attribute_type = pd.read_csv(f"{base_path}food_attribute_type.csv")

  food_nutrient = pd.read_csv(f"{base_path}food_nutrient.csv")


In [5]:
# Merging main nutrient info
food_nutrient_full = food_nutrient.merge(
    nutrient,
    left_on='nutrient_id',
    right_on='id',
    how='left',
    suffixes=('_food_nutrient', '_nutrient')
)

food_full = food_nutrient_full.merge(
    food,
    left_on='fdc_id',
    right_on='fdc_id',
    how='left'
)

food_full = food_full.merge(
    food_category,
    left_on='food_category_id',
    right_on='id',
    how='left',
    suffixes=('','_category')
)


In [6]:
# 3️⃣ Apply calorie/protein conversion factors
if 'calorie_factor' in food_calorie_conversion.columns:
    food_full = food_full.merge(
        food_calorie_conversion[['fdc_id','calorie_factor']],
        on='fdc_id',
        how='left'
    )
    food_full['amount_scaled'] = food_full['amount'] * food_full['calorie_factor'].fillna(1)
else:
    food_full['amount_scaled'] = food_full['amount']

if 'protein_factor' in food_protein_conversion.columns:
    food_full = food_full.merge(
        food_protein_conversion[['fdc_id','protein_factor']],
        on='fdc_id',
        how='left'
    )
    food_full['amount_scaled'] = food_full['amount_scaled'] * food_full['protein_factor'].fillna(1)

In [7]:
# 3️⃣ Apply calorie/protein conversion factors
if 'calorie_factor' in food_calorie_conversion.columns:
    food_full = food_full.merge(
        food_calorie_conversion[['fdc_id','calorie_factor']],
        on='fdc_id',
        how='left'
    )
    food_full['amount_scaled'] = food_full['amount'] * food_full['calorie_factor'].fillna(1)
else:
    food_full['amount_scaled'] = food_full['amount']

if 'protein_factor' in food_protein_conversion.columns:
    food_full = food_full.merge(
        food_protein_conversion[['fdc_id','protein_factor']],
        on='fdc_id',
        how='left'
    )
    food_full['amount_scaled'] = food_full['amount_scaled'] * food_full['protein_factor'].fillna(1)


In [8]:
# Pivot to nutrient matrix (foods x nutrients)
food_matrix = food_full.pivot_table(
    index='description',  # food name
    columns='name',       # nutrient name
    values='amount_scaled',
    aggfunc='mean'
).fillna(0)
food_matrix.reset_index(inplace=True)

In [9]:
def recommend_food_by_goal(goal_dict, top_n=5, food_df=None):
    """
    Recommend foods based on nutrient goals.

    goal_dict: dict
        Keys: nutrient names (friendly names like 'Protein', 'Fat')
        Values: 'high', 'low', 'medium'
    food_df: pd.DataFrame
        Nutrient matrix with foods as rows and nutrients as columns
    """
    if food_df is None:
        raise ValueError("food_df (nutrient matrix) must be provided")

    # Copy to avoid modifying original
    df = food_df.copy()

    scores = pd.Series(0, index=df.index, dtype=float)

    # Iterate over nutrient goals
    for nutrient, goal in goal_dict.items():
        col = nutrient_map.get(nutrient)
        if col is None or col not in df.columns:
            print(f"Nutrient '{nutrient}' not found in dataset, skipping...")
            continue
        values = df[col].fillna(0)
        norm = (values - values.min()) / (values.max() - values.min() + 1e-9)
        if goal.lower() == "high":
            scores += norm
        elif goal.lower() == "low":
            scores += (1 - norm)
        elif goal.lower() == "medium":
            scores += 1 - np.abs(norm - 0.5) * 2
        else:
            print(f"Unknown goal '{goal}' for nutrient '{nutrient}', skipping...")

    result = df.copy()
    result['score'] = scores
    result = result.sort_values(by='score', ascending=False)

    return result[['description', 'score']].head(top_n)

In [10]:
import pickle
nutrient_map = {col: col for col in food_matrix.columns if col != 'description'}
model_data = {
    "food_matrix": food_matrix,
    "nutrient_map": nutrient_map,
    "recommend_food_by_goal": recommend_food_by_goal
}
with open("nutrition_recommendation_model.pkl", "wb") as f:
    pickle.dump(model_data, f)