In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    coalesce,
    col,
    collect_list,
    concat,
    concat_ws,
    count,
    lit,
    max,
    struct,
    upper,
    when,
)

# Build (or reuse) the Spark session used by every cell below.
# Each (option, value) pair is applied to the builder in the listed order.
session_builder = SparkSession.builder.appName("NutritionalValuesGenerator")
for option, value in (
    ("spark.sql.shuffle.partitions", "50"),
    ("spark.driver.memory", "4g"),
    ("spark.executor.memory", "4g"),
):
    session_builder = session_builder.config(option, value)

spark = session_builder.getOrCreate()

# Raise the log threshold so only warnings and errors are printed.
spark.sparkContext.setLogLevel("WARN")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/19 12:54:36 WARN Utils: Your hostname, Tedy-Laptop, resolves to a loopback address: 127.0.0.1; using 192.168.1.148 instead (on interface wlp1s0)
25/12/19 12:54:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/19 12:54:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Directory holding the sampled parquet tables. Set the NUTRITION_DATASET_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original location is used.
dataset_dir = os.environ.get(
    "NUTRITION_DATASET_DIR", "/home/tedy/Git/FII-BDA/sampled_dataset"
)

# One DataFrame per source table.
df_food = spark.read.parquet(f"{dataset_dir}/food.parquet")
df_food_nutrient = spark.read.parquet(f"{dataset_dir}/food_nutrient.parquet")
df_nutrient = spark.read.parquet(f"{dataset_dir}/nutrient.parquet")
df_food_portion = spark.read.parquet(f"{dataset_dir}/food_portion.parquet")
df_measure_unit = spark.read.parquet(f"{dataset_dir}/measure_unit.parquet")

# Row counts as a quick sanity check that every table loaded.
for table_label, table_df in (
    ("Food", df_food),
    ("Food-Nutrient", df_food_nutrient),
    ("Nutrient", df_nutrient),
    ("Food Portion", df_food_portion),
    ("Measure Unit", df_measure_unit),
):
    print(f"{table_label} records: {table_df.count()}")

Food records: 5000
Food-Nutrient records: 65141
Nutrient records: 477
Food Portion records: 104
Measure Unit records: 5


In [3]:
# Measurements: keep only the columns used downstream and drop rows that have
# no amount.
df_food_nutrient_clean = df_food_nutrient.select(
    col("fdc_id"),
    col("nutrient_id"),
    col("amount"),
    col("data_points"),
    col("min"),
    col("max"),
    col("median"),
).filter(col("amount").isNotNull())

# Nutrient lookup, renamed so its columns do not clash after the join.
df_nutrient_clean = df_nutrient.select(
    col("id").alias("nutrient_id"),
    col("name").alias("nutrient_name"),
    col("unit_name").alias("nutrient_unit"),
    col("rank"),
)

# Food identity columns.
df_food_enriched = df_food.select(
    col("fdc_id"),
    col("description"),
    col("data_type"),
)

# Inner join: foods without any measurement are dropped here.
# Left join: measurements whose nutrient_id has no lookup row are kept and
# labelled "Unknown" by the fillna below.
df_merged = (
    df_food_enriched
    .join(df_food_nutrient_clean, on="fdc_id", how="inner")
    .join(df_nutrient_clean, on="nutrient_id", how="left")
    .fillna("Unknown", subset=["nutrient_name", "nutrient_unit"])
)

# Human-readable label "<name> (<unit>)".
# The previous expression nested concat_ws(")", unit) inside
# concat_ws(" (", ...); concat_ws over a single column emits no separator, so
# the closing parenthesis was never written. Name and unit are guaranteed
# non-null by the fillna above, so no null guard is needed.
df_merged = df_merged.withColumn(
    "nutrient_display",
    concat(col("nutrient_name"), lit(" ("), col("nutrient_unit"), lit(")")),
)

print(f"Merged records: {df_merged.count()}")

Merged records: 65141


In [4]:
# One row per food with its measurements gathered into parallel arrays.
#
# collect_list skips null inputs, so collecting "median", "min" and "max" as
# independent lists produced arrays shorter than "nutrients"/"amounts" whenever
# a measurement lacked one of those statistics, and position i no longer
# referred to the same nutrient. Collecting a single struct per measurement and
# projecting its fields keeps every array the same length, with null where a
# statistic is missing.
# The element order inside the arrays is whatever order Spark processes the
# rows in; it is not sorted.
df_aggregated = (
    df_merged
    .groupBy("fdc_id", "description", "data_type")
    .agg(
        collect_list(
            struct("nutrient_display", "amount", "median", "min", "max")
        ).alias("nutrient_rows"),
        count("nutrient_id").alias("total_nutrients"),
    )
    .select(
        "fdc_id",
        "description",
        "data_type",
        # Selecting a field of an array<struct> yields an array of that field.
        col("nutrient_rows.nutrient_display").alias("nutrients"),
        col("nutrient_rows.amount").alias("amounts"),
        col("nutrient_rows.median").alias("medians"),
        col("nutrient_rows.min").alias("mins"),
        col("nutrient_rows.max").alias("maxs"),
        "total_nutrients",
    )
)

# Despite the column name this is a "|"-separated string of the display
# labels, not JSON. The name is kept so existing consumers keep working.
df_aggregated = df_aggregated.withColumn(
    "nutrients_json",
    concat_ws("|", col("nutrients"))
)

# Wide table: one column per distinct nutrient_name, holding the largest amount
# recorded for the food, falling back to the largest median when no amount
# exists.
df_pivot = df_merged.groupBy("fdc_id").pivot("nutrient_name").agg(
    coalesce(
        max(when(col("nutrient_name").isNotNull(), col("amount"))),
        max(when(col("nutrient_name").isNotNull(), col("median")))
    ).alias("amount")
)

print(f"Aggregated foods: {df_aggregated.count()}")
print(f"Pivot table columns: {len(df_pivot.columns)}")
print(f"Foods with nutrients: {df_pivot.count()}")

Aggregated foods: 4688
Pivot table columns: 180
Foods with nutrients: 4688


In [5]:
df_merged_clean = df_merged.filter(col("amount").isNotNull())

# (output column, text matched against nutrient_name, match mode)
#   "exact"     -> nutrient_name == text
#   "contains"  -> case-sensitive substring match
#   "icontains" -> case-insensitive substring match (text must be upper case)
# The order of this list is the column order of df_metrics.
NUTRIENT_COLUMNS = [
    # Macronutrients
    ("energy", "Energy", "exact"),
    ("protein", "PROTEIN", "icontains"),
    ("carbs", "Carbohydrate, by difference", "exact"),
    ("total_fat", "Total lipid (fat)", "exact"),
    # Food composition basics
    ("water", "Water", "exact"),
    ("ash", "Ash", "exact"),
    ("alcohol", "Alcohol, ethyl", "exact"),
    ("caffeine", "Caffeine", "exact"),
    # Fiber and sugars
    ("fiber", "Fiber, total dietary", "contains"),
    ("sugars", "Total Sugars", "exact"),
    # Sugar types
    ("glucose", "Glucose", "exact"),
    ("fructose", "Fructose", "exact"),
    ("sucrose", "Sucrose", "exact"),
    ("lactose", "Lactose", "exact"),
    # Fats breakdown
    ("saturated_fat", "Fatty acids, total saturated", "exact"),
    ("monounsaturated_fat", "Fatty acids, total monounsaturated", "exact"),
    ("polyunsaturated_fat", "Fatty acids, total polyunsaturated", "exact"),
    ("trans_fat", "Fatty acids, total trans", "exact"),
    ("cholesterol", "Cholesterol", "exact"),
    # Vitamins
    ("vitamin_a", "Vitamin A, RAE", "contains"),
    ("vitamin_c", "Vitamin C", "contains"),
    ("vitamin_d", "Vitamin D (D2 + D3)", "contains"),
    ("vitamin_e", "Vitamin E (alpha-tocopherol)", "contains"),
    ("vitamin_k", "Vitamin K (phylloquinone)", "contains"),
    # B vitamins
    ("thiamin_b1", "Thiamin", "contains"),
    ("riboflavin_b2", "Riboflavin", "contains"),
    ("niacin_b3", "Niacin", "contains"),
    ("vitamin_b6", "Vitamin B-6", "contains"),
    ("folate", "Folate, total", "exact"),
    ("vitamin_b12", "Vitamin B-12", "contains"),
    # Additional micronutrients
    ("choline", "Choline, total", "contains"),
    # Minerals
    ("calcium", "Calcium, Ca", "exact"),
    ("iron", "Iron, Fe", "exact"),
    ("magnesium", "Magnesium, Mg", "exact"),
    ("phosphorus", "Phosphorus, P", "exact"),
    ("potassium", "Potassium, K", "exact"),
    ("sodium", "Sodium, Na", "exact"),
    ("zinc", "Zinc, Zn", "exact"),
    ("copper", "Copper, Cu", "exact"),
    ("manganese", "Manganese, Mn", "exact"),
    ("selenium", "Selenium, Se", "exact"),
    # Carotenoids and antioxidants
    ("beta_carotene", "Carotene, beta", "contains"),
    ("lycopene", "Lycopene", "exact"),
    ("lutein_zeaxanthin", "Lutein + zeaxanthin", "contains"),
]


def nutrient_matches(text, mode):
    """Return the boolean Column that selects rows for one nutrient.

    Args:
        text: Text compared against the ``nutrient_name`` column.
        mode: ``"exact"``, ``"contains"`` or ``"icontains"`` (see the legend
            above ``NUTRIENT_COLUMNS``).

    Returns:
        A Column expression that is true for matching rows.

    Raises:
        ValueError: If ``mode`` is not one of the supported modes.
    """
    name = col("nutrient_name")
    if mode == "exact":
        return name == text
    if mode == "contains":
        return name.contains(text)
    if mode == "icontains":
        return upper(name).contains(text)
    raise ValueError(f"Unknown match mode: {mode!r}")


# One pass over the measurements instead of 44 filter + dropDuplicates + join
# chains: each output column is a conditional aggregate over the food's rows.
#
# NOTE(review): dropDuplicates(["fdc_id"]) used to keep an arbitrary row when a
# food had several matching measurements, so results could differ between
# runs. max() makes the choice deterministic by keeping the largest amount.
# Several rows can match because of the substring modes, and possibly because
# the nutrient table lists one name more than once (e.g. under different
# units) - TODO confirm against the nutrient table and tighten the affected
# matches if that is the case.
df_nutrient_wide = df_merged_clean.groupBy("fdc_id").agg(*[
    max(when(nutrient_matches(text, mode), col("amount"))).alias(column)
    for column, text, mode in NUTRIENT_COLUMNS
])

# Left join keeps every food; foods without measurements get nulls.
df_metrics = df_food_enriched.join(df_nutrient_wide, on="fdc_id", how="left")

# Coverage report. count(column) counts non-null values, so one aggregate job
# replaces the nine separate filter + count jobs.
coverage_report = [
    ("energy", "energy"),
    ("protein", "protein"),
    ("carbs", "carbs"),
    ("total fat", "total_fat"),
    ("water", "water"),
    ("fiber", "fiber"),
    ("vitamin C", "vitamin_c"),
    ("calcium", "calcium"),
    ("iron", "iron"),
]
coverage = df_metrics.agg(*[
    count(col(column)).alias(column) for _, column in coverage_report
]).first()

for label, column in coverage_report:
    print(f"Foods with {label}: {coverage[column]}")

Foods with energy: 4485
Foods with protein: 4550
Foods with carbs: 4513
Foods with total fat: 4505
Foods with water: 50
Foods with fiber: 3827
Foods with vitamin C: 2136
Foods with calcium: 3741
Foods with iron: 3747


In [6]:
# Metric columns of the final table, declared once in output order.
metric_columns = [
    # Macronutrients
    "energy", "protein", "carbs", "total_fat",
    # Food composition
    "water", "ash", "alcohol", "caffeine",
    # Fiber and sugars
    "fiber", "sugars", "glucose", "fructose", "sucrose", "lactose",
    # Fats breakdown
    "saturated_fat", "monounsaturated_fat", "polyunsaturated_fat",
    "trans_fat", "cholesterol",
    # Vitamins
    "vitamin_a", "vitamin_c", "vitamin_d", "vitamin_e", "vitamin_k",
    # B vitamins
    "thiamin_b1", "riboflavin_b2", "niacin_b3", "vitamin_b6", "folate",
    "vitamin_b12",
    # Additional nutrients
    "choline",
    # Minerals
    "calcium", "iron", "magnesium", "phosphorus", "potassium", "sodium",
    "zinc", "copper", "manganese", "selenium",
    # Carotenoids
    "beta_carotene", "lycopene", "lutein_zeaxanthin",
]

# Identity columns come from the aggregated table, so only foods present in
# df_aggregated appear in the result.
food_summary = df_aggregated.select(
    col("fdc_id"),
    col("description").alias("food_description"),
    col("data_type").alias("food_type"),
    col("total_nutrients"),
)

# Attach the per-food metrics, then pin the final column order explicitly.
df_final_profile = (
    food_summary
    .join(
        df_metrics.select("fdc_id", *metric_columns),
        on="fdc_id",
        how="left",
    )
    .select(
        "fdc_id",
        "food_description",
        "food_type",
        "total_nutrients",
        *metric_columns,
    )
)

print(f"Final nutritional profiles: {df_final_profile.count()}")
print(f"Total columns: {len(df_final_profile.columns)}")
print("\nSample of comprehensive nutritional data:")
df_final_profile.show(5, truncate=False)

Final nutritional profiles: 4688
Total columns: 48

Sample of comprehensive nutritional data:


25/12/19 12:54:54 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/12/19 12:54:58 WARN DAGScheduler: Broadcasting large task binary with size 1376.7 KiB


+------+-------------------------------------+------------+---------------+------+-------+-----+---------+-----+----+-------+--------+-----+------+-------+--------+-------+-------+-------------+-------------------+-------------------+---------+-----------+---------+---------+---------+---------+---------+----------+-------------+---------+----------+------+-----------+-------+-------+----+---------+----------+---------+------+----+------+---------+--------+-------------+--------+-----------------+
|fdc_id|food_description                     |food_type   |total_nutrients|energy|protein|carbs|total_fat|water|ash |alcohol|caffeine|fiber|sugars|glucose|fructose|sucrose|lactose|saturated_fat|monounsaturated_fat|polyunsaturated_fat|trans_fat|cholesterol|vitamin_a|vitamin_c|vitamin_d|vitamin_e|vitamin_k|thiamin_b1|riboflavin_b2|niacin_b3|vitamin_b6|folate|vitamin_b12|choline|calcium|iron|magnesium|phosphorus|potassium|sodium|zinc|copper|manganese|selenium|beta_carotene|lycopene|lutein_zeaxan

In [7]:
# Destination directory for the exported tables. Set the NUTRITION_OUTPUT_DIR
# environment variable to write elsewhere; when it is unset the original
# location is used.
output_dir = os.environ.get(
    "NUTRITION_OUTPUT_DIR", "/home/tedy/Git/FII-BDA/output"
)
os.makedirs(output_dir, exist_ok=True)

# Collapse to a single partition before writing; mode("overwrite") replaces
# any previous export at the same path.
profiles_path = f"{output_dir}/nutritional_profiles"
df_final_profile.repartition(1).write.mode("overwrite").parquet(profiles_path)

print(f"Results exported to {output_dir}")
print("Generated files:")
print("  - nutritional_profiles/")

25/12/19 12:55:02 WARN DAGScheduler: Broadcasting large task binary with size 1359.9 KiB


Results exported to /home/tedy/Git/FII-BDA/output
Generated files:
  - nutritional_profiles/
