In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import warnings
import glob
import os
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 50)
pd.set_option('display.float_format', '{:.3f}'.format)

In [2]:
path = r"C:\\01_Data\\PythonProgram\\Nutrition_Recommender_PROECT\\foodData\\DATASET"
all_files = glob.glob(os.path.join(path, "FOOD-DATA-GROUP*.csv"))

dfs = [pd.read_csv(f) for f in all_files]
food_df = pd.concat(dfs, ignore_index=True)
print("Files merged:", len(all_files))
print("Final shape:", food_df.shape)
print(food_df.head())

Files merged: 5
Final shape: (2395, 37)
   Unnamed: 0.1  Unnamed: 0                              food  Caloric Value  \
0             0           0                      cream cheese             51   
1             1           1                 neufchatel cheese            215   
2             2           2  requeijao cremoso light catupiry             49   
3             3           3                    ricotta cheese             30   
4             4           4              cream cheese low fat             30   

     Fat  Saturated Fats  Monounsaturated Fats  Polyunsaturated Fats  \
0  5.000           2.900                 1.300                 0.200   
1 19.400          10.900                 4.900                 0.800   
2  3.600           2.300                 0.900                 0.000   
3  2.000           1.300                 0.500                 0.002   
4  2.300           1.400                 0.600                 0.042   

   Carbohydrates  Sugars  Protein  Dietary Fib

In [3]:
food_df = food_df.loc[:, ~food_df.columns.str.contains('^Unnamed')]
print("Columns after cleaning:", food_df.columns.tolist())


Columns after cleaning: ['food', 'Caloric Value', 'Fat', 'Saturated Fats', 'Monounsaturated Fats', 'Polyunsaturated Fats', 'Carbohydrates', 'Sugars', 'Protein', 'Dietary Fiber', 'Cholesterol', 'Sodium', 'Water', 'Vitamin A', 'Vitamin B1', 'Vitamin B11', 'Vitamin B12', 'Vitamin B2', 'Vitamin B3', 'Vitamin B5', 'Vitamin B6', 'Vitamin C', 'Vitamin D', 'Vitamin E', 'Vitamin K', 'Calcium', 'Copper', 'Iron', 'Magnesium', 'Manganese', 'Phosphorus', 'Potassium', 'Selenium', 'Zinc', 'Nutrition Density']


In [4]:
def to_numeric_col(s):
    return pd.to_numeric(s.astype(str).str.replace(',', '').str.strip(), errors='coerce')

for col in food_df.columns:
    if food_df[col].dtype == 'object':
        sample = food_df[col].dropna().astype(str).head(20).tolist()
        num_count = sum(1 for x in sample if any(ch.isdigit() for ch in x))
        if num_count >= max(1, len(sample)//2):
            food_df[col] = to_numeric_col(food_df[col])

numeric_cols = food_df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns detected:", numeric_cols[:10], "...")


Numeric columns detected: ['Caloric Value', 'Fat', 'Saturated Fats', 'Monounsaturated Fats', 'Polyunsaturated Fats', 'Carbohydrates', 'Sugars', 'Protein', 'Dietary Fiber', 'Cholesterol'] ...


In [5]:
possible_name_cols = [c for c in food_df.columns if c.lower() in ('food','name','item','description')]
name_col = possible_name_cols[0] if possible_name_cols else food_df.columns[0]
food_df[name_col] = food_df[name_col].astype(str)

print("Food name column detected:", name_col)


Food name column detected: food


In [6]:
feature_cols = [c for c in food_df.select_dtypes(include=[np.number]).columns if c != name_col]

imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(food_df[feature_cols])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Shape of feature matrix:", X_scaled.shape)


Shape of feature matrix: (2395, 34)


In [7]:
cos_sim = cosine_similarity(X_scaled)
sim_df = pd.DataFrame(cos_sim, index=food_df[name_col], columns=food_df[name_col])
print("Similarity matrix shape:", sim_df.shape)


Similarity matrix shape: (2395, 2395)


In [8]:
def recommend_similar(food_name, topn=10):
    if food_name not in sim_df.index:
        matches = [f for f in sim_df.index if food_name.lower() in f.lower()]
        if matches:
            food_name = matches[0]
        else:
            raise ValueError("Food not found in dataset.")
    sims = sim_df[food_name].sort_values(ascending=False)[1:topn+1]
    recs = pd.DataFrame({'Food': sims.index, 'Similarity': sims.values})
    return recs.merge(food_df, left_on='Food', right_on=name_col, how='left').drop(columns=[name_col])


In [9]:
def recommend_substitute(food_name, constraint_col, max_value, topn=10):
    if food_name not in sim_df.index:
        raise ValueError("Food not found.")
    sims = sim_df[food_name].copy()
    subset = food_df[food_df[constraint_col] <= max_value]
    subset = subset[subset[name_col] != food_name]
    subset = subset.set_index(name_col)
    valid_names = subset.index.intersection(sims.index)
    ranked = sims.loc[valid_names].sort_values(ascending=False)[:topn]
    return food_df[food_df[name_col].isin(ranked.index)][[name_col, constraint_col]].assign(Similarity=ranked.values)


In [10]:
def recommend_for_nutrient_targets(targets, topn=10):
    selected = [k for k in targets.keys() if k in feature_cols]
    M = food_df[selected].fillna(food_df[selected].median())
    mm = MinMaxScaler()
    M_scaled = mm.fit_transform(M)
    target_vals = np.array([targets[k] for k in selected]).reshape(1, -1)
    target_scaled = mm.transform(target_vals)
    dists = np.linalg.norm(M_scaled - target_scaled, axis=1)
    result = pd.DataFrame({name_col: food_df[name_col], 'distance': dists})
    return result.sort_values('distance').head(topn).merge(df, on=name_col, how='left')


In [11]:
# Example 1️: Show 5 foods most similar to "cream cheese"
example_food = "cream cheese"
print(f" Foods most similar to '{example_food}':")
display(recommend_similar(example_food, topn=5))

 Foods most similar to 'cream cheese':


Unnamed: 0,Food,Similarity,Caloric Value,Fat,Saturated Fats,Monounsaturated Fats,Polyunsaturated Fats,Carbohydrates,Sugars,Protein,Dietary Fiber,Cholesterol,Sodium,Water,Vitamin A,Vitamin B1,Vitamin B11,Vitamin B12,Vitamin B2,Vitamin B3,Vitamin B5,Vitamin B6,Vitamin C,Vitamin D,Vitamin E,Vitamin K,Calcium,Copper,Iron,Magnesium,Manganese,Phosphorus,Potassium,Selenium,Zinc,Nutrition Density
0,spearmint dried,0.987,1,0.018,0.09,0.018,0.057,0.3,0.0,0.094,0.1,0.0,0.012,0.029,0.0,0.0,0.007,0.087,0.0,0.009,0.04,0.044,0.092,0.0,0.0,0.0,0.0,7.4,0.021,0.4,3.0,0.008,1.4,9.6,0.0,0.649
1,cheese crackers,0.987,34,1.6,0.4,0.4,0.7,4.2,0.3,0.8,0.2,0.2,0.065,0.2,0.05,0.056,0.039,0.095,0.028,0.03,0.4,0.065,0.009,0.0,0.0,0.2,0.026,9.5,0.032,0.3,1.8,0.075,14.0,10.9,0.031,7.053
2,pork chitterlings cooked,0.986,66,5.8,2.7,2.0,0.3,0.0,0.0,3.5,0.0,78.4,0.078,19.2,0.0,0.0,0.099,0.024,0.056,0.093,0.065,0.059,0.0,0.0,0.0,0.083,0.0,7.1,0.056,0.4,2.5,0.033,18.7,4.0,0.07,9.329
3,soybean margarine,0.986,34,3.8,0.8,1.8,1.0,0.043,0.0,0.052,0.0,0.0,0.005,0.7,0.067,0.06,0.013,0.049,0.051,0.033,0.012,0.054,0.077,0.0,0.1,0.0,1.4,0.0,0.0,0.1,0.0,1.1,2.0,0.0,0.0,5.309
4,mustard,0.985,3,0.2,0.021,0.1,0.086,0.3,0.048,0.2,0.2,0.0,0.015,4.2,0.096,0.026,0.006,0.097,0.0,0.011,0.04,0.013,0.015,0.03,0.0,0.086,0.012,3.2,0.086,0.096,2.4,0.075,5.4,7.6,0.064,1.046


In [12]:
# Example 2️: Suggest low-calorie substitutes (<= 60 calories)
constraint_col = "Caloric Value"  # same as in your dataset
example_food = "cream cheese"
print(f" Low-calorie substitutes for '{example_food}' (<= 60 Cal):")
display(recommend_substitute(example_food, constraint_col, 60, topn=5))


 Low-calorie substitutes for 'cream cheese' (<= 60 Cal):


Unnamed: 0,food,Caloric Value,Similarity
1066,mustard,3,0.987
1069,spearmint dried,1,0.987
1451,zaziki milfina,51,0.986
1876,cheese crackers,34,0.985
2072,soybean margarine,34,0.985


In [15]:
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_recommender(X_scaled, sim_df, food_df, name_col):
    
    # Silhouette Score — how well foods cluster together nutritionally
    kmeans = KMeans(n_clusters=8, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    sil_score = silhouette_score(X_scaled, labels)
    print(f"Silhouette Score (Clustering Quality): {sil_score:.3f}")

    # Internal Similarity Consistency — are similar foods really close?
    np.random.seed(42)
    sample_foods = np.random.choice(sim_df.index, 10, replace=False)
    avg_similarities = []
    avg_distances = []

    for food in sample_foods:
        top_sim = sim_df[food].sort_values(ascending=False)[1:6]
        idx = food_df[food_df[name_col] == food].index[0]
        top_idx = [food_df[food_df[name_col] == f].index[0] for f in top_sim.index]
        
        dist = np.linalg.norm(X_scaled[idx] - X_scaled[top_idx], axis=1)
        avg_similarities.append(top_sim.mean())
        avg_distances.append(dist.mean())

    print(f"Average Cosine Similarity (Top 5): {np.mean(avg_similarities):.3f}")
    print(f"Average Euclidean Distance (Top 5): {np.mean(avg_distances):.3f}")

    #  Recommendation Stability 
    X_noisy = X_scaled + np.random.normal(0, 0.01, X_scaled.shape)
    cos_sim_noisy = cosine_similarity(X_noisy)
    stability = np.corrcoef(sim_df.values.flatten(), cos_sim_noisy.flatten())[0, 1]
    print(f"Recommendation Stability: {stability:.3f}")

    accuracy = (0.4 * np.mean(avg_similarities) + 0.4 * stability + 0.2 * sil_score)
    print(f"\n Overall Recommendation Accuracy: {accuracy:.3f}")

# evaluation
evaluate_recommender(X_scaled, sim_df, food_df, name_col)


Silhouette Score (Clustering Quality): 0.576
Average Cosine Similarity (Top 5): 0.937
Average Euclidean Distance (Top 5): 2.070
Recommendation Stability: 1.000

 Overall Recommendation Accuracy: 0.890
