In [3]:
import os
import sys

# =========================
# 0) PYTHON VERSION FIX
# =========================
# Point both the driver and the worker interpreter settings at the interpreter
# running this script, to avoid a driver/worker Python version mismatch.
os.environ.update(
    {
        "PYSPARK_DRIVER_PYTHON": sys.executable,
        "PYSPARK_PYTHON": sys.executable,
    }
)

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# =========================
# 1) SPARK SESSION (Robust)
# =========================
# Build (or reuse) a local session that uses every available core, with a
# 6 GB driver-memory setting, then restrict Spark logging to WARN and above.
_session_builder = SparkSession.builder.master("local[*]")
_session_builder = _session_builder.appName("Velomenaj_Scoring_Global")
_session_builder = _session_builder.config("spark.driver.memory", "6g")

spark = _session_builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")

# =========================
# 2) READ PARQUET INPUT (All History)
# =========================
# The relative path is resolved against the current working directory and
# given an explicit "file:" scheme before being handed to Spark.
input_path = "data_temp/gold/gold_flow_amenagement_daily"
input_path_abs = "file:" + os.path.abspath(input_path)

print(f"Reading Gold data from: {input_path_abs}")
df_raw = spark.read.parquet(input_path_abs)

# Cleaning / casting: keep the three columns used for scoring, cast the flow
# to double, and keep only rows that have an amenity id and a flow >= 0
# (a null flow fails the comparison and is dropped as well).
_has_amenity = F.col("amenagement_id").isNotNull()
_flow_is_non_negative = F.col("flux_estime") >= 0

df = (
    df_raw
    .select("amenagement_id", "date", F.col("flux_estime").cast("double"))
    .where(_has_amenity & _flow_is_non_negative)
)

print(f"Total Measurements: {df.count()}")

# =========================
# 5) GLOBAL AGGREGATION (per amenagement_id)
# =========================
# Goal: a single set of statistics per amenity over the ENTIRE history.
#   n_days_total     - number of distinct dates with a measurement
#   mean_flux_global - average flow
#   std_flux_global  - population standard deviation of the flow
_global_metrics = [
    F.countDistinct("date").alias("n_days_total"),
    F.avg("flux_estime").alias("mean_flux_global"),
    F.stddev_pop("flux_estime").alias("std_flux_global"),
]

agg_global = df.groupBy("amenagement_id").agg(*_global_metrics)

# =========================
# 6) USAGE SCORE (Percent Rank Global)
# =========================
# One un-partitioned window over all aggregated rows, ordered by ascending
# mean flow with nulls last; percent_rank() gives each row its relative
# position in that ordering.
w_rank = Window.orderBy(F.col("mean_flux_global").asc_nulls_last())
scored = agg_global.select(
    "*",
    F.percent_rank().over(w_rank).alias("usage_score"),
)

# =========================
# 7) STABILITY SCORE (CoV)
# =========================
# Stability is 1 - (std / mean), i.e. one minus the coefficient of variation.
# It is only evaluated when the mean is present and strictly positive;
# otherwise the raw value is 0.
_mean_is_usable = F.col("mean_flux_global").isNotNull() & (F.col("mean_flux_global") > 0)

stability_raw = F.when(
    _mean_is_usable,
    1.0 - F.col("std_flux_global") / F.col("mean_flux_global"),
).otherwise(0.0)

# Clamp the raw value into the [0, 1] interval.
scored = scored.withColumn(
    "stability_score",
    F.when(stability_raw > 1, 1.0)
     .when(stability_raw < 0, 0.0)
     .otherwise(stability_raw),
)

# =========================
# 8) FINAL SCORE (Global)
# =========================
# Weighted sum of the two component scores.
W_USAGE = 0.65
W_STAB = 0.35

scored = scored.withColumn(
    "score",
    F.col("usage_score") * W_USAGE + F.col("stability_score") * W_STAB,
)

# =========================
# 9) DATA FILTER (Relaxed)
# =========================
# Instead of 180 days/year, require a reasonable global minimum (e.g. 30 days
# in total) so the score has a minimum of statistical significance.
# NOTE(review): usage_score was ranked over every amenity, including the ones
# masked here - confirm that ranking before this filter is intended.
MIN_DAYS_TOTAL = 30

# A when() with no otherwise() yields null for rows that do not match, so
# amenities below the threshold end up with a null score.
_has_enough_days = F.col("n_days_total") >= MIN_DAYS_TOTAL
scored = scored.withColumn("score", F.when(_has_enough_days, F.col("score")))

# =========================
# 10) OUTPUT JSON (amenagement_id, score)
# =========================
# Build the export frame: prefix each id with the layer name, round the score
# to 6 decimals and drop rows whose score is null.
#
# The frame is cached because three separate actions consume it below (write,
# show, count). Without caching, each action re-runs the full lineage,
# including the single-partition window, from the Parquet source.
out = (
    scored.select(
        F.concat(
            F.lit("pvo_patrimoine_voirie.pvoamenagementcyclable."),
            F.col("amenagement_id"),
        ).alias("amenagement_id"),
        F.round("score", 6).alias("score"),
    )
    .filter(F.col("score").isNotNull())
    .cache()
)

out_path = "amenagement_scoring_global_json_2"
out_path_abs = "file:" + os.path.abspath(out_path)

# First action: materialises the cache while writing the JSON files.
out.write.mode("overwrite").json(out_path_abs)

print("✅ Global Scores (v2) written to:", out_path_abs)
out.show(20, truncate=False)
print(f"Total Scored Amenities: {out.count()}")

# All three actions are done; release the cached rows.
out.unpersist()


Reading Gold data from: file:/Users/youness/Desktop/datathon_velomenaj/data_temp/gold/gold_flow_amenagement_daily
Total Measurements: 1323674


26/01/05 17:17:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:07 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:08 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 1

✅ Global Scores (v2) written to: file:/Users/youness/Desktop/datathon_velomenaj/amenagement_scoring_global_json_2


26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 1

+-------------------------------------------------+--------+
|amenagement_id                                   |score   |
+-------------------------------------------------+--------+
|pvo_patrimoine_voirie.pvoamenagementcyclable.788 |0.0     |
|pvo_patrimoine_voirie.pvoamenagementcyclable.3337|0.0     |
|pvo_patrimoine_voirie.pvoamenagementcyclable.6555|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.4130|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.3867|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6556|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6500|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6557|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.5708|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6560|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6554|0.002743|
|pvo_patrimoine_voirie.pvoamenagementcyclable.3222|0.11414 |
|pvo_patrimoine_voirie.pvoamenagementcyclable.6916|0.016456|
|pvo_patrimoine_voirie.p

26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Total Scored Amenities: 456


26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
26/01/05 17:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
