# Top 10 Protein per Kcal (Spark)
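This section uses Apache Spark to read the nutritional-profiles Parquet dataset (`output/nutritional_profiles/`) and compute the Top 10 foods by protein per kcal (protein in grams divided by energy in kcal, restricted to foods with energy > 0). The results are written as both CSV and Parquet under `output/Top10_bestfoods_prots_per_kcal/`.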

In [1]:
# Initialize Spark Session
from pyspark.sql import SparkSession, functions as F
import os
# Reuse the active session if one exists, otherwise start a new one
# under this application name.
spark = (
    SparkSession.builder
    .appName("Top10 Protein per Kcal")
    .getOrCreate()
)
# Report the runtime version so the captured output records the environment.
print("Spark version:", spark.version)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/19 16:32:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark version: 4.0.1


In [2]:
# Load the nutritional profiles with Spark. The whole dataset directory is
# preferred; a single named part file is used only when the directory is not
# found by the local existence check.
base_dir = "output/nutritional_profiles"
part_file = "output/nutritional_profiles/part-00000-c8bad44e-47d3-4d65-81e1-c1a23350546a-c000.snappy.parquet"
# NOTE(review): os.path.exists inspects the driver's local filesystem only, and
# part_file is located inside base_dir, so on a local filesystem the fallback
# cannot succeed when the directory is missing. Confirm whether it is still needed.
if os.path.exists(base_dir):
    input_path = base_dir
else:
    input_path = part_file
df_spark = spark.read.parquet(input_path)
print("Input path:", input_path)
# Show the inferred schema so the column detection in the next cell can be checked by eye.
df_spark.printSchema()


                                                                                

Input path: output/nutritional_profiles
root
 |-- fdc_id: long (nullable = true)
 |-- food_description: string (nullable = true)
 |-- food_type: string (nullable = true)
 |-- total_nutrients: long (nullable = true)
 |-- energy: double (nullable = true)
 |-- protein: double (nullable = true)
 |-- carbs: double (nullable = true)
 |-- total_fat: double (nullable = true)
 |-- water: double (nullable = true)
 |-- ash: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- caffeine: double (nullable = true)
 |-- fiber: double (nullable = true)
 |-- sugars: double (nullable = true)
 |-- glucose: double (nullable = true)
 |-- fructose: double (nullable = true)
 |-- sucrose: double (nullable = true)
 |-- lactose: double (nullable = true)
 |-- saturated_fat: double (nullable = true)
 |-- monounsaturated_fat: double (nullable = true)
 |-- polyunsaturated_fat: double (nullable = true)
 |-- trans_fat: double (nullable = true)
 |-- cholesterol: double (nullable = true)
 |-- vitamin_a: 

In [3]:
# Detect columns for id, name, energy (kcal), and protein (g)
# Lower-cased copy of the DataFrame's column names, in their original order.
cols = list(map(str.lower, df_spark.columns))
def find_col(candidates, columns=None):
    """Return the first column whose name matches one of ``candidates``.

    Matching is case-insensitive and done in two passes:

    1. Exact match: candidates are tried in order, and for each candidate the
       columns are scanned in order; the first column equal to it wins.
    2. Substring match: only if no candidate matched exactly, the same
       iteration is repeated with "candidate is contained in the column name".

    Because the exact pass runs over *all* candidates before any substring
    match is attempted, a later candidate that matches exactly beats an
    earlier candidate that only matches as a substring.

    Args:
        candidates: Iterable of candidate column names, highest priority first.
        columns: Optional sequence of column names to search. Defaults to
            ``df_spark.columns`` so existing one-argument calls are unchanged.

    Returns:
        The matching column name with its original casing, or ``None`` if
        nothing matches.
    """
    if columns is None:
        columns = df_spark.columns
    # Lower-case each side once instead of on every comparison.
    wanted = [cand.lower() for cand in candidates]
    lowered = [(col, col.lower()) for col in columns]

    # Pass 1: exact, case-insensitive match.
    for cand in wanted:
        for col, col_lc in lowered:
            if col_lc == cand:
                return col
    # Pass 2 (fallback): candidate appears anywhere inside the column name.
    for cand in wanted:
        for col, col_lc in lowered:
            if cand in col_lc:
                return col
    return None

# Candidate names per role, highest priority first.
id_candidates = ["fdc_id", "id"]
name_candidates = ["description", "food_name", "name"]
kcal_candidates = ["energy_kcal", "energy", "kcal", "calories", "1008"]
protein_candidates = ["protein_g", "protein", "1003"]

# Resolve each role to an actual column name (None when nothing matches).
id_col, name_col, kcal_col, protein_col = (
    find_col(options)
    for options in (id_candidates, name_candidates, kcal_candidates, protein_candidates)
)

# Report what was picked so a wrong match is visible in the cell output.
print(
    "Selected columns:",
    f" id: {id_col}",
    f" name: {name_col}",
    f" kcal: {kcal_col}",
    f" protein: {protein_col}",
    sep="\n",
)

# All four roles are required by the ratio computation that follows.
if not all((id_col, name_col, kcal_col, protein_col)):
    raise RuntimeError(f"Missing required columns. Available: {df_spark.columns}")


Selected columns:
 id: fdc_id
 name: food_description
 kcal: energy
 protein: protein


In [4]:
# Compute protein_per_kcal and select Top 10
df_ratio = (
    df_spark
    # Keep only rows with strictly positive energy. A NULL energy also fails
    # this predicate, so the division below never sees a zero or NULL divisor.
    .filter(F.col(kcal_col) > 0)
    .withColumn("protein_per_kcal", F.col(protein_col) / F.col(kcal_col))
)
# NOTE(review): a previous run of this cell showed ratios above 0.25 g/kcal
# (e.g. 45 g protein for 98 kcal). That looks higher than the usual ~4 kcal per
# gram of protein would allow, so these are presumably source-data errors.
# Confirm whether such rows should be excluded upstream.
top10 = (
    df_ratio
    .select(id_col, name_col, protein_col, kcal_col, "protein_per_kcal")
    # The id is a secondary sort key so that ties on the ratio are broken
    # deterministically; without it, limit(10) may pick different rows on each
    # evaluation. (Assumes the id column is unique per food - TODO confirm.)
    # Rows whose ratio is NULL (NULL protein) are explicitly sorted last.
    .orderBy(F.col("protein_per_kcal").desc_nulls_last(), F.col(id_col).asc())
    .limit(10)
    # Three actions follow (show, CSV write, Parquet write). Caching lets them
    # share one evaluation instead of re-running the scan and sort each time.
    .cache()
)
try:
    top10.show(truncate=False)

    # Write outputs
    output_dir = "output/Top10_bestfoods_prots_per_kcal"
    os.makedirs(output_dir, exist_ok=True)
    (
        top10
        .coalesce(1)  # single part file per output directory
        .write
        .mode("overwrite")
        .option("header", True)
        .csv(os.path.join(output_dir, "top10_csv"))
    )
    (
        top10
        .coalesce(1)
        .write
        .mode("overwrite")
        .parquet(os.path.join(output_dir, "top10_parquet"))
    )
    print("Saved:", output_dir)
finally:
    # Release the cached rows so a later re-run in the same session recomputes
    # from the input instead of reusing this result. top10 itself stays usable.
    top10.unpersist()

+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------+-------------------+
|fdc_id |food_description                                                                                                                                               |protein|energy|protein_per_kcal   |
+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+------+-------------------+
|2408740|ORGANIC SESAME FLOUR, SESAME FLOUR                                                                                                                             |45.0   |98.0  |0.45918367346938777|
|1531299|CASHEW CARAMEL PROTEIN PARADISE MACROBAR, CASHEW CARAMEL                                                                                                       |18.33  |45.