In [20]:
# Imports and Spark session
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import BooleanType
import os

# Attach to the running Spark session if there is one, otherwise start a
# new one; every later cell reads and transforms data through `spark`.
spark = (
    SparkSession.builder
    .appName("Food Recommendations")
    .getOrCreate()
)
print(f"Spark version: {spark.version}")

Spark version: 4.0.1


In [None]:
# ---------------------------------------------------------------------------
# Configuration -- edit the values in this cell, then re-run the cells below.
# ---------------------------------------------------------------------------

# Locations: dataset to read, a single-file fallback, and where results go.
input_path = "output/nutritional_profiles"
fallback_file = "output/nutritional_profiles/part-00000-c8bad44e-47d3-4d65-81e1-c1a23350546a-c000.snappy.parquet"
output_dir = "output/Recommendations"

# Ranking: number of rows to keep, the column to rank by (leave the string
# empty to rank by the weighted composite score instead), and the direction.
top_k = 10
sort_by = "protein_per_kcal"
ascending = False

# Energy window (inclusive) a food must fall into to be considered.
calorie_min = 0
calorie_max = 800

# Keyword filters, matched case-insensitively as substrings of the food text.
exclude_allergens = [
    "gluten",
    "peanut",
    "tree nut",
    "soy",
    "milk",
    "egg",
    "fish",
    "shellfish",
    "sesame",
]
exclude_keywords = ["pork", "beef"]
include_keywords = ["lentil"]
# When True (and include_keywords is non-empty) rows without any of the
# include keywords are dropped; otherwise the keywords only earn a bonus.
must_include = True

# Composite-score weights, applied only when sort_by is empty. Positive
# weights reward a density, negative weights penalise it.
weights = {
    "protein_per_kcal": 1.0,
    "fiber_per_kcal": 0.5,
    "carb_per_kcal": -0.2,
    "sugar_per_kcal": -0.5,
    "fat_per_kcal": -0.3,
    "include_keyword_bonus": 0.3,
}

# Set to False to display the recommendations without writing any files.
write_outputs = True

In [22]:
# Load the nutritional profiles.
# The configured dataset location is used when it exists on the local
# filesystem; otherwise the single fallback part file is read instead.
if os.path.exists(input_path):
    path = input_path
else:
    path = fallback_file
print(f"Input path: {path}")

df = spark.read.parquet(path)
df.printSchema()

Input path: output/nutritional_profiles
root
 |-- fdc_id: long (nullable = true)
 |-- food_description: string (nullable = true)
 |-- food_type: string (nullable = true)
 |-- total_nutrients: long (nullable = true)
 |-- energy: double (nullable = true)
 |-- protein: double (nullable = true)
 |-- carbs: double (nullable = true)
 |-- total_fat: double (nullable = true)
 |-- water: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- caffeine: double (nullable = true)
 |-- fiber: double (nullable = true)
 |-- sugars: double (nullable = true)
 |-- glucose: double (nullable = true)
 |-- fructose: double (nullable = true)
 |-- sucrose: double (nullable = true)
 |-- lactose: double (nullable = true)
 |-- saturated_fat: double (nullable = true)
 |-- monounsaturated_fat: double (nullable = true)
 |-- polyunsaturated_fat: double (nullable = true)
 |-- trans_fat: double (nullable = true)
 |-- cholesterol: double (nullable = true)
 |-- vitamin_a: 

In [23]:
# Column detection helpers
def detect_column(df, candidates):
    """Pick the column of ``df`` that best matches a list of candidate names.

    Comparison is case-insensitive and happens in two passes. The first pass
    looks for a column whose whole name equals a candidate; only if no
    candidate matches that way does the second pass accept a column whose
    name merely contains a candidate. Within a pass, candidates are tried in
    the order given and columns in the order ``df.columns`` lists them.

    Args:
        df: any object exposing a ``columns`` sequence of column names.
        candidates: candidate names, most preferred first.

    Returns:
        The matching column name exactly as spelled in ``df.columns``, or
        ``None`` when nothing matches.
    """
    def _first_match(is_match):
        # Candidate priority outranks column position: the first candidate
        # that matches any column wins.
        for wanted in candidates:
            for name in df.columns:
                if is_match(name.lower(), wanted.lower()):
                    return name
        return None

    exact = _first_match(lambda have, want: have == want)
    if exact is not None:
        return exact
    return _first_match(lambda have, want: want in have)

# Resolve each logical field to a physical column name. Candidates are
# listed most-preferred first; any field may come back as None.
id_col = detect_column(df, ['fdc_id', 'id'])
name_col = detect_column(df, ['description', 'food_name', 'name', 'brand_name'])
kcal_col = detect_column(df, ['energy_kcal', 'energy', 'kcal', 'calories', '1008'])
protein_col = detect_column(df, ['protein_g', 'protein', '1003'])
fiber_col = detect_column(df, ['fiber_g', 'fiber', 'dietary_fiber', '1079'])
carb_col = detect_column(df, ['carbohydrate_g', 'carbohydrates', 'carb', 'carbohydrate', '1005'])
sugar_col = detect_column(df, ['sugar_g', 'sugars', 'sugar', '2000'])
fat_col = detect_column(df, ['fat_g', 'fat', 'total_lipid', '1004'])
# Column searched for include/exclude keywords.
text_col = detect_column(df, ['ingredients', 'ingredients_text', 'description', 'food_name', 'name', 'brand_name'])

# Report what was picked so a wrong guess is easy to spot.
print('Selected columns:')
for label, chosen in (
    ('id', id_col),
    ('name', name_col),
    ('kcal', kcal_col),
    ('protein', protein_col),
    ('fiber', fiber_col),
    ('carbs', carb_col),
    ('sugars', sugar_col),
    ('fat', fat_col),
    ('text', text_col),
):
    print(f' {label}:', chosen)

# The remaining cells cannot work without these four; the rest are optional.
if not all((id_col, name_col, kcal_col, protein_col)):
    raise RuntimeError(f"Missing required columns. Available: {df.columns}")

Selected columns:
 id: fdc_id
 name: food_description
 kcal: energy
 protein: protein
 fiber: fiber
 carbs: carbs
 sugars: sugars
 fat: total_fat
 text: food_description


In [25]:
# Build base DataFrame with filters and nutrient densities
def contains_any_udf(keywords):
    """Build a Spark UDF telling whether a text contains any of ``keywords``.

    Matching is a case-insensitive substring test, so a keyword such as
    ``'egg'`` also matches ``'eggplant'``.

    Args:
        keywords: iterable of keyword strings, or ``None``. ``None`` entries
            and entries that are empty or whitespace-only are ignored: an
            empty string is a substring of every text, so keeping it would
            make the UDF return True for every non-null row.

    Returns:
        A UDF with a ``BooleanType`` result. It yields False for a null text
        or when no usable keyword was supplied, True when at least one
        keyword occurs in the text, and False otherwise.
    """
    # Lower-case once, up front, instead of once per row.
    lowered = tuple(k.lower() for k in (keywords or ()) if k and k.strip())

    def _fn(text):
        if text is None:
            return False
        haystack = text.lower()
        return any(k in haystack for k in lowered)

    return F.udf(_fn, BooleanType())

# Keep only foods with strictly positive energy, so the per-kcal ratios
# computed below never divide by zero.
df_base = df.filter(F.col(kcal_col) > 0)

# Keyword screening on the detected text column.
if not text_col:
    # No text to search: no row can be flagged as containing an include keyword.
    df_base = df_base.withColumn('has_included', F.lit(False))
else:
    exclude_udf = contains_any_udf(exclude_allergens + exclude_keywords)
    include_udf = contains_any_udf(include_keywords)
    searched_text = F.col(text_col)
    df_base = df_base.withColumn('has_excluded', exclude_udf(searched_text))
    df_base = df_base.withColumn('has_included', include_udf(searched_text))
    # Rows mentioning an allergen or excluded keyword are always dropped.
    df_base = df_base.filter(~F.col('has_excluded'))
    # Include keywords are a hard requirement only when must_include is set.
    if include_keywords and must_include:
        df_base = df_base.filter(F.col('has_included'))

# Inclusive calorie window.
df_base = df_base.filter(F.col(kcal_col).between(calorie_min, calorie_max))

# Nutrient densities: amount of each nutrient per kcal.
energy = F.col(kcal_col)
df_base = df_base.withColumn('protein_per_kcal', F.col(protein_col) / energy)
# The remaining densities are added only for nutrient columns that were detected.
optional_densities = (
    ('fiber_per_kcal', fiber_col),
    ('carb_per_kcal', carb_col),
    ('sugar_per_kcal', sugar_col),
    ('fat_per_kcal', fat_col),
)
for density_name, nutrient_col in optional_densities:
    if nutrient_col:
        df_base = df_base.withColumn(density_name, F.col(nutrient_col) / energy)

print(f"Rows after filters: {df_base.count()}")

Rows after filters: 3697


In [26]:
# Scoring / Sorting and Top-K output
# Columns shown in the result, in display order. Optional columns are listed
# only when the filtering cell actually produced them.
select_cols = [id_col, name_col, protein_col, kcal_col, 'protein_per_kcal']
select_cols += [
    c
    for c in ('fiber_per_kcal', 'carb_per_kcal', 'sugar_per_kcal', 'fat_per_kcal', 'has_included')
    if c in df_base.columns
]


def _ranking_keys(column_name):
    """Return the sort keys used to rank rows by ``column_name``.

    Rows whose ranking value is null are placed last in both directions
    (a plain ``asc()`` lists nulls first, which would put rows with missing
    data at the top of an ascending ranking). ``id_col`` is added as a
    secondary key so rows with equal values come out in a stable order
    instead of an arbitrary one -- assuming ids are unique; TODO confirm.
    """
    key = F.col(column_name)
    primary = key.asc_nulls_last() if ascending else key.desc_nulls_last()
    return [primary, F.col(id_col).asc()]


if sort_by:
    # Rank by a single, user-chosen column.
    if sort_by not in df_base.columns:
        raise RuntimeError(f"Requested sort-by column '{sort_by}' not found in columns: {df_base.columns}")
    # Sort before projecting: sort_by may be a column that is not displayed.
    ordered = df_base.orderBy(*_ranking_keys(sort_by))
    recommendations = ordered.select(*select_cols).limit(top_k)
else:
    # Rank by the weighted sum of the available densities plus a flat bonus
    # for rows that contain an include keyword.
    score = F.lit(0.0)
    for density in ('protein_per_kcal', 'fiber_per_kcal', 'carb_per_kcal', 'sugar_per_kcal', 'fat_per_kcal'):
        if density in df_base.columns:
            score = score + F.lit(weights.get(density, 0.0)) * F.col(density)
    keyword_hit = F.when(F.col('has_included'), F.lit(1.0)).otherwise(F.lit(0.0))
    score = score + F.lit(weights.get('include_keyword_bonus', 0.0)) * keyword_hit
    recommendations = (df_base
        .withColumn('score', score)
        .select(*select_cols, 'score')
        .orderBy(*_ranking_keys('score'))
        .limit(top_k)
    )

# The Top-K is consumed up to three times (show, CSV, Parquet). Caching it
# avoids re-running the whole filter/UDF pipeline for each consumer and
# guarantees that all of them see the same rows.
recommendations = recommendations.cache()

recommendations.show(truncate=False)

# Optional write outputs
if write_outputs:
    os.makedirs(output_dir, exist_ok=True)
    # coalesce(1) so each format is written as a single part file.
    (recommendations
        .coalesce(1)
        .write
        .mode('overwrite')
        .option('header', True)
        .csv(os.path.join(output_dir, 'top_csv'))
    )
    (recommendations
        .coalesce(1)
        .write
        .mode('overwrite')
        .parquet(os.path.join(output_dir, 'top_parquet'))
    )
    print('Saved:', output_dir)

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------+-------------------+-------------------+--------------------+---------------------+---------------------+------------+
|fdc_id |food_description                                                                                                                                               |protein|energy|protein_per_kcal   |fiber_per_kcal     |carb_per_kcal       |sugar_per_kcal       |fat_per_kcal         |has_included|
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------+-------------------+-------------------+--------------------+---------------------+---------------------+------------+
|1531299|CASHEW CARAMEL PROTEIN PARADISE MACROBAR, CASHEW CARAMEL                          

## Usage
1. Rulează celulele 1–4 pentru inițializare și citirea datelor.
2. În Celula 3, modifică preferințele: `exclude_allergens`, `include_keywords`, `exclude_keywords`, `calorie_min/max`, `sort_by` sau `weights`.
3. Rulează celulele 5–7 pentru detectare coloane, filtrare, scor/sort și Top-K.
4. Rezultatele se pot scrie în `output/Recommendations` dacă `write_outputs = True`.