# In [None]:
# === Experiment: Range partitioning for GroupBy(User_ID) -> avg(Heart_Rate) ===
# Produces: /data/exp_range_user_avg_results.csv

from pyspark.sql import SparkSession, functions as F
from pyspark.storagelevel import StorageLevel
import os, time
import pandas as pd

# ---------- Config ----------
# Input dataset (local-filesystem URI) and the results file this experiment writes.
SOURCE_CSV = "file:///data/cleaned_personal_health_data.csv"
OUTPUT_CSV = "/data/exp_range_user_avg_results.csv"

# Benchmark grid: every partition count is timed RUNS_PER_SETTING times.
PARTITIONS_LIST = [2, 4, 8, 16, 32]
RUNS_PER_SETTING = 5

# Optional cap on the number of distinct users, read from the LIMIT_USERS
# environment variable. The default of 0 (or any value <= 0) leaves the
# dataset unfiltered.
LIMIT_USERS = int(os.getenv("LIMIT_USERS", "0"))

# Value applied to spark.sql.shuffle.partitions for the whole session.
SPARK_SHUFFLE_PARTS = 4

# ---------- Spark session ----------
# Get the active session or create one for this experiment. The shuffle
# partition count is pinned from the config above, and speculative execution
# is switched off (presumably so duplicate task attempts cannot skew timings).
spark = (
    SparkSession.builder
    .appName("Exp-Range-UserAvg")
    .config("spark.sql.shuffle.partitions", str(SPARK_SHUFFLE_PARTS))
    .config("spark.speculation", "false")
    .getOrCreate()
)
# Restrict Spark's log output to warnings and errors.
spark.sparkContext.setLogLevel("WARN")

# ---------- Load & prep ----------
# Read the CSV with a header row and inferred column types, then keep only the
# two columns the experiment needs.
df = spark.read.csv(SOURCE_CSV, header=True, inferSchema=True)
df = df.select("User_ID", "Heart_Rate")

# Optional subset: keep only rows belonging to the first LIMIT_USERS user ids.
if LIMIT_USERS > 0:
    # limit() without an ordering may return a different set of rows each time
    # the plan is evaluated, and `df` is not cached, so it is re-evaluated for
    # every partition setting. Ordering first pins the subset so that every
    # setting is timed on the same users.
    subset = (
        df.select("User_ID")
        .distinct()
        .orderBy("User_ID")
        .limit(LIMIT_USERS)
    )
    df = df.join(subset, on="User_ID", how="inner")
    print(f"[INFO] LIMIT_USERS={LIMIT_USERS} -> rows after filter: {df.count()}")

# Run one tiny job (result discarded) before any timing starts.
_ = df.limit(1).count()

results_rows = []


def _run_user_avg(frame):
    """Compute avg(Heart_Rate) per User_ID on *frame* and force its evaluation.

    A plain ``.count()`` on the grouped result only needs the number of
    groups, so the optimizer is free to prune the unused average column and
    the timing would then exclude the very aggregation under test. Counting
    the average column itself keeps it in the executed plan while still
    returning just one row to the driver.
    """
    per_user = frame.groupBy("User_ID").agg(
        F.avg("Heart_Rate").alias("Avg_Heart_Rate")
    )
    per_user.agg(F.count("Avg_Heart_Rate")).collect()


for P in PARTITIONS_LIST:
    # Range partitioning by User_ID, persisted so timed runs read the cache.
    base = df.repartitionByRange(P, F.col("User_ID")).persist(StorageLevel.MEMORY_AND_DISK)
    try:
        base.count()  # materialize cache

        # Warm-up query (not timed)
        _run_user_avg(base)

        times = []
        for r in range(1, RUNS_PER_SETTING + 1):
            t0 = time.perf_counter()
            _run_user_avg(base)
            elapsed = round(time.perf_counter() - t0, 6)
            times.append(elapsed)
            print(f"[range] P={P:>2} run {r}/{RUNS_PER_SETTING}: {elapsed:.6f}s")

        # Guard against RUNS_PER_SETTING == 0, which would divide by zero.
        avg_t = round(sum(times) / len(times), 6) if times else float("nan")
        results_rows.append({
            "Partitioning Type": "range",
            "Partitions": P,
            "Runs": RUNS_PER_SETTING,
            "Per-Run Times (s)": ";".join(map(str, times)),
            "Avg Execution Time (s)": avg_t,
            "LimitUsers": LIMIT_USERS,
        })
    finally:
        # Always release the cache, even if a job failed, and wait for the
        # blocks to be dropped so eviction cannot overlap the next setting's
        # timed runs.
        base.unpersist(blocking=True)

# ---------- Save ----------
# Collect the per-setting rows into a pandas frame and write them as CSV
# (without the index column).
pdf = pd.DataFrame(results_rows)
pdf.to_csv(OUTPUT_CSV, index=False)
print(f"\n[OK] Wrote {OUTPUT_CSV}")

# `display` is only available under IPython/Jupyter. Fall back to print so a
# plain `python` run does not end in a NameError after the experiments finish.
try:
    _show = display
except NameError:
    _show = print
_show(pdf)
