In [23]:
from pyspark.sql import SparkSession, functions as F
spark = SparkSession.builder.appName('ALSFoodRecommender').getOrCreate()
print('Spark:', spark.version)

Spark: 4.1.0


In [24]:
data_path = 'output/nutritional_profiles'
df = spark.read.parquet(data_path)
print('Rows:', df.count())
df.printSchema()
df.show(5, truncate=False)

Rows: 4688
root
 |-- fdc_id: long (nullable = true)
 |-- food_description: string (nullable = true)
 |-- food_type: string (nullable = true)
 |-- total_nutrients: long (nullable = true)
 |-- energy: double (nullable = true)
 |-- protein: double (nullable = true)
 |-- carbs: double (nullable = true)
 |-- total_fat: double (nullable = true)
 |-- water: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- caffeine: double (nullable = true)
 |-- fiber: double (nullable = true)
 |-- sugars: double (nullable = true)
 |-- glucose: double (nullable = true)
 |-- fructose: double (nullable = true)
 |-- sucrose: double (nullable = true)
 |-- lactose: double (nullable = true)
 |-- saturated_fat: double (nullable = true)
 |-- monounsaturated_fat: double (nullable = true)
 |-- polyunsaturated_fat: double (nullable = true)
 |-- trans_fat: double (nullable = true)
 |-- cholesterol: double (nullable = true)
 |-- vitamin_a: double (nullable = true)
 |--

In [25]:
dtypes = dict(df.dtypes)
cols = df.columns
candidates = [c for c in cols if 'food' in c.lower() or 'name' in c.lower() or 'desc' in c.lower()]
id_col = candidates[0] if candidates else cols[0]
numeric_types = set(['int','bigint','double','float','decimal'])
nutrient_cols = [c for c,t in df.dtypes if (t in numeric_types) and (c != id_col)]
print('Using id column:', id_col)
print('Detected nutrient columns (sample):', nutrient_cols[:20])
assert len(nutrient_cols) > 0, 'No numeric nutrient columns detected; adjust notebook to match your schema.'

Using id column: food_description
Detected nutrient columns (sample): ['fdc_id', 'total_nutrients', 'energy', 'protein', 'carbs', 'total_fat', 'water', 'ash', 'alcohol', 'caffeine', 'fiber', 'sugars', 'glucose', 'fructose', 'sucrose', 'lactose', 'saturated_fat', 'monounsaturated_fat', 'polyunsaturated_fat', 'trans_fat']


In [26]:
pairs = [F.struct(F.lit(c).alias('nutrient'), F.col(c).cast('double').alias('value')) for c in nutrient_cols]
df_long = df.select(id_col, F.explode(F.array(*pairs)).alias('kv')).select(id_col, F.col('kv.nutrient').alias('nutrient'), F.col('kv.value').alias('value')).where(F.col('value').isNotNull())
print('Long rows:', df_long.count())
df_long.show(5, truncate=False)

Long rows: 67909
+--------------------------+---------------+---------+
|food_description          |nutrient       |value    |
+--------------------------+---------------+---------+
|SMOKEY TERIYAKI BEEF JERKY|fdc_id         |1109427.0|
|SMOKEY TERIYAKI BEEF JERKY|total_nutrients|14.0     |
|SMOKEY TERIYAKI BEEF JERKY|energy         |321.0    |
|SMOKEY TERIYAKI BEEF JERKY|protein        |35.71    |
|SMOKEY TERIYAKI BEEF JERKY|carbs          |25.0     |
+--------------------------+---------------+---------+
only showing top 5 rows


In [27]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
food_indexer = StringIndexer(inputCol=id_col, outputCol='food_idx').fit(df.select(id_col))
nutrient_indexer = StringIndexer(inputCol='nutrient', outputCol='nutrient_idx').fit(df_long.select('nutrient'))
df_long = nutrient_indexer.transform(df_long)
df_long = food_indexer.transform(df_long)
ratings = df_long.select(F.col('nutrient_idx').cast('int').alias('user'), F.col('food_idx').cast('int').alias('item'), F.col('value').cast('float').alias('rating'))
ratings = ratings.where(F.col('rating').isNotNull())
print('Ratings rows:', ratings.count())
ratings.show(5)

Ratings rows: 67909
+----+----+---------+
|user|item|   rating|
+----+----+---------+
|   0|3639|1109427.0|
|   1|3639|     14.0|
|   6|3639|    321.0|
|   2|3639|    35.71|
|   3|3639|     25.0|
+----+----+---------+
only showing top 5 rows


In [28]:
als = ALS(userCol='user', itemCol='item', ratingCol='rating', implicitPrefs=True, rank=20, maxIter=10, regParam=0.05, nonnegative=True)
model = als.fit(ratings)
item_factors = model.itemFactors
item_factors.show(5, truncate=False)

                                                                                

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                                               |
+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |[0.0, 0.8239727, 0.3660615, 0.71068865, 0.09429439, 0.0, 1.1354415, 0.5490961, 0.8855106, 0.02373007, 0.09090366, 0.025704276, 0.0, 0.021423621, 0.44155097, 0.51309925, 0.018635798, 0.20721187, 0.0, 0.03297347]     |
|10 |[0.0, 0.2983364, 0.0, 0.10211588, 0.0, 0.116484255, 0.0, 0.0, 0.0, 0.0, 0.11272467, 0.0, 0.0, 0.0, 0.724771

In [29]:
food_df_idx = food_indexer.transform(df.select(id_col)).select(id_col, F.col('food_idx').cast('int').alias('food_idx')).distinct()
item_with_names = item_factors.join(food_df_idx, item_factors.id == food_df_idx.food_idx).select('id', 'features', id_col).cache()
print('Count items with names:', item_with_names.count())
item_list = item_with_names.rdd.map(lambda r: (r[2], r[1])).collect()
import numpy as np
name_to_vec = {name: np.array(features, dtype=float) for (name, features) in item_list}
names = list(name_to_vec.keys())
print('Loaded', len(names), 'food vectors')

                                                                                

Count items with names: 4499


[Stage 1405:>                                                      (0 + 8) / 10]

Loaded 4499 food vectors


                                                                                

In [30]:
def find_similar(food_name, k=5):
    key = None
    if food_name in name_to_vec:
        key = food_name
    else:
        lowered = food_name.lower()
        for n in names:
            if lowered in n.lower():
                key = n
                break
    if key is None:
        print('Food not found. Try a different name or inspect the names list.')
        return []
    v = name_to_vec[key]
    vnorm = np.linalg.norm(v)
    if vnorm == 0:
        print('Zero vector for', key)
        return []
    sims = []
    for n in names:
        if n == key:
            continue
        w = name_to_vec[n]
        denom = (vnorm * np.linalg.norm(w))
        if denom == 0:
            score = 0.0
        else:
            score = float(np.dot(v, w) / denom)
        sims.append((n, score))
    sims.sort(key=lambda x: x[1], reverse=True)
    return sims[:k]

print('Sample food names:')
for n in names[:50]:
    print('-', n)

Sample food names:
- SODA
- ALMOND MILK, UNSWEETENED, PLAIN, SHELF STABLE
- GRANOLA
- 2% REDUCED FAT MILK
- BROWNIE MIX
- CHUNK LIGHT TUNA IN WATER
- DIJON MUSTARD, DIJON
- FRENCH STYLE GREEN BEANS
- HOMESTYLE WAFFLES, HOMESTYLE
- LIGHT ICE CREAM
- ORGANIC MANGO CHUNKS
- PITTED CALIFORNIA RIPE LARGE OLIVES
- SNOW PEAS
- TOMATO KETCHUP, TOMATO
- WHITE CORN TORTILLAS, WHITE CORN
- "CHOP,BI,CC,GM,ET12,5OZ,10#,Z"
- 100% CRANBERRY FLAVORED JUICE BLEND WITH TWO OTHER JUICES FROM CONCENTRATE, CRANBERRY
- 100% ORANGE JUICE FROM CONCENTRATE, ORANGE
- 100% PURE WHITE TEA
- 100% WHOLE GRAIN WHITE CHEDDAR POPCORN, WHITE CHEDDAR
- 2% LACTOSE FREE REDUCED FAT MILK
- 2X DARK ROASTED COFFEE + MILK
- 365 EVERYDAY VALUE, NONFAT YOGURT
- 6 CHEESE ITALIAN A BLEND OF LOW-MOISTURE PART-SKIM MOZZARELLA, MONTEREY JACK, PROVOLONE, ASIAGO, PARMESAN AND ROMANO FINELY SHREDDED CHEESES, 6 CHEESE ITALIAN
- 8"" DOUBLE LAYER RED VELVET CAKE
- A.1., STEAK SAUCE
- ALASKA SEAFOOD, OCEAN BEAUTY SEAFOODS, WILD CAUGHT SALM

In [31]:
q = "banana"
results = find_similar(q, k=5)
for name, score in results:
    print(f"{name}: {score:.4f}")

DUNCAN HINES, SIGNATURE CAKE MIX, FRENCH VANILLA: 0.9657
GLAZED WALNUTS: 0.9401
YOGURT FLAVORED COATING GRANOLA BARS: 0.9390
KROGER, PENUT BUTTER FUDGE ENROBED COOKIES: 0.9390
MINI SANDWICH CRACKERS: 0.9389
