In [1]:
import os
import sys
import json
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, ArrayType
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# IMPORT NATIVE ML FUNCTIONS TO AVOID UDFS
try:
    from pyspark.ml.functions import vector_to_array
except ImportError:
    print("WARNING: pyspark.ml.functions.vector_to_array not found. Using fallback.")

# =========================
# 1) SPARK SESSION SETUP
# =========================

# FIX: Force python vars to match driver to avoid version mismatch (Driver vs Worker)
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable 
os.environ["PYSPARK_PYTHON"] = sys.executable

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Velomenaj_Prediction_TopTier")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("WARN")
print("Spark Session Created Successfully")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/05 17:27:24 WARN Utils: Your hostname, Younesss-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.42.233.166 instead (on interface en0)
26/01/05 17:27:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/05 17:27:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
26/01/05 17:27:24 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Spark Session Created Successfully


In [2]:
# =========================
# 2) LOAD DATA (Features + Targets)
# =========================

# A. Load Features (Infrastructure)
path_amenagements = "file:" + os.path.abspath("data_temp/silver_amenagements_with_coordinates")
df_raw_features = spark.read.parquet(path_amenagements)
# FIX: Add prefix to match output format of Scoring2
df_raw_features = df_raw_features.withColumn(
    "amenagement_id", 
    F.concat(F.lit("pvo_patrimoine_voirie.pvoamenagementcyclable."), F.col("amenagement_id"))
)

# --- ROBUST NATIVE SPARK COORDINATE PARSING (No UDF = No Broken Pipe) ---
# 1. Clean string: Remove brackets [], spaces, and quotes
# 2. Split by comma to get a flat array of numbers [lon1, lat1, lon2, lat2...]
df_clean = df_raw_features.withColumn(
    "cleaned_coords", 
    F.split(F.regexp_replace(F.col("coordiantes"), r"[\[\]\s]", ""), ",")
)

# 3. Explode to rows to process list elements
# posexplode gives: pos (index), val (number as string)
df_exploded = df_clean.select(
    "amenagement_id", 
    F.posexplode(F.col("cleaned_coords")).alias("pos", "val")
)

# 4. Filter empty and cast to Double
df_exploded = df_exploded.filter(F.length(F.col("val")) > 0).withColumn("val", F.col("val").cast("double"))

# 5. GroupBy to calculate Main Centroid
# Even Index (0, 2, 4...) = Longitude
# Odd Index (1, 3, 5...) = Latitude
df_coords = df_exploded.groupBy("amenagement_id").agg(
    F.avg(F.when(F.col("pos") % 2 == 1, F.col("val"))).alias("centroid_lat"),
    F.avg(F.when(F.col("pos") % 2 == 0, F.col("val"))).alias("centroid_lon")
)

# 6. Join back to original features to recover metadata (nom, type, etc.)
df_features = df_raw_features.join(df_coords, on="amenagement_id", how="inner") \
                             .filter(F.col("centroid_lat").isNotNull())

# B. Load Targets (Global Scores 2014-2025)
path_scores = "file:" + os.path.abspath("amenagement_scoring_global_json_2")
df_scores = spark.read.json(path_scores).withColumnRenamed("score_global", "score")

# C. Join
df_full = df_features.join(df_scores, on="amenagement_id", how="inner")

print(f"Dataset Size (Global): {df_full.count()} rows")
df_full.select("amenagement_id", "nom", "centroid_lat", "centroid_lon", "score").show(5, truncate=False)

Dataset Size (Global): 456 rows
+-------------------------------------------------+---------------------+------------------+-----------------+--------+
|amenagement_id                                   |nom                  |centroid_lat      |centroid_lon     |score   |
+-------------------------------------------------+---------------------+------------------+-----------------+--------+
|pvo_patrimoine_voirie.pvoamenagementcyclable.6026|Rue Léon Blum        |45.764272317499994|4.917622205000001|0.19752 |
|pvo_patrimoine_voirie.pvoamenagementcyclable.2568|Pont d'Herbens       |45.795171905      |4.990913535000001|0.201999|
|pvo_patrimoine_voirie.pvoamenagementcyclable.6738|Rue de Saint-Cyr     |45.786508309999995|4.808178546666666|0.330917|
|pvo_patrimoine_voirie.pvoamenagementcyclable.2827|Quai Victor Augagneur|45.757608444      |4.840734573      |0.812162|
|pvo_patrimoine_voirie.pvoamenagementcyclable.282 |Pont Lafayette       |45.763568305      |4.84062562       |0.819798|
+-------

In [3]:
# =========================
# 3) PREPARE TARGET (Top 10%)
# =========================

# Calculate Threshold on Global Scores
quantile_90 = df_full.stat.approxQuantile("score", [0.9], 0.01)[0]
print(f"Top Tier Threshold (Top 10% Global): Score >= {quantile_90:.4f}")

# Create Binary Label - Handle NULLs robustly (if score is NULL, label is 0.0)
df_dataset = df_full.withColumn(
    "label", 
    F.when(F.col("score") >= quantile_90, 1.0).otherwise(0.0)
)

df_dataset.groupBy("label").count().show()

Top Tier Threshold (Top 10% Global): Score >= 0.7738
+-----+-----+
|label|count|
+-----+-----+
|  0.0|  406|
|  1.0|   50|
+-----+-----+



In [4]:
# =========================
# 4) TRAIN RANDOM FOREST PIPELINE
# =========================

# Handle Categorical Features
indexer_type = StringIndexer(inputCol="typeamenagement", outputCol="type_idx", handleInvalid="keep")
indexer_reseau = StringIndexer(inputCol="reseau", outputCol="reseau_idx", handleInvalid="keep")
encoder = OneHotEncoder(inputCols=["type_idx", "reseau_idx"], outputCols=["type_vec", "reseau_vec"])

# Assemble Features (Geo is Key!)
assembler = VectorAssembler(
    inputCols=["centroid_lat", "centroid_lon", "type_vec", "reseau_vec"],
    outputCol="features"
)

# Classifier
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=50, maxDepth=10)

# Pipeline
pipeline = Pipeline(stages=[indexer_type, indexer_reseau, encoder, assembler, rf])

# Train/Test Split
train_data, test_data = df_dataset.randomSplit([0.8, 0.2], seed=42)

print("Training Model...")
model = pipeline.fit(train_data)
print("Training Complete.")

# Evaluate
predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
auc = evaluator.evaluate(predictions)
print(f"Model Performance (AUC): {auc:.4f}")

Training Model...


26/01/05 17:27:40 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Training Complete.
Model Performance (AUC): 0.9467


In [5]:
# =========================
# 5) GRID SIMULATION (The Treasure Map)
# =========================

# Get best features from training data (Mode) to ensure valid inputs
# We pick the most frequent type/network to simulate a "standard" good infrastructure
common_type = df_dataset.groupBy("typeamenagement").count().orderBy(F.desc("count")).first()["typeamenagement"]
common_reseau = df_dataset.groupBy("reseau").count().orderBy(F.desc("count")).first()["reseau"]
print(f"Using simulation features: Type='{common_type}', Reseau='{common_reseau}'")

# Define Bounding Box (Lyon approx)
lat_min, lat_max = 45.70, 45.85
lon_min, lon_max = 4.75, 4.95

# Generate Grid Points (approx every 200m)
lat_steps = np.linspace(lat_min, lat_max, 50)
lon_steps = np.linspace(lon_min, lon_max, 50)

grid_data = []
for lat in lat_steps:
    for lon in lon_steps:
        # Simulate a infrastructure using the common valid types
        grid_data.append((float(lat), float(lon), common_type, common_reseau))

# Create DataFrame
df_grid = spark.createDataFrame(grid_data, ["centroid_lat", "centroid_lon", "typeamenagement", "reseau"])

# Predict Probability
grid_predictions = model.transform(df_grid)

# --- ROBUST NATIVE PROBABILITY EXTRACTION (No UDF = No Broken Pipe) ---
from pyspark.ml.functions import vector_to_array

# 'probability' is a DenseVector. vector_to_array converts it to [prob_class0, prob_class1]
# We select index 1 (Success)
df_heatmap = grid_predictions.withColumn("prob_array", vector_to_array("probability")) \
                             .withColumn("prob_success", F.col("prob_array")[1])

# STRATEGY: TOP 50 RANKING (No strict threshold)
# We want the 50 best locations, whatever their absolute score is.
top_candidates = df_heatmap.orderBy(F.desc("prob_success")).limit(50)

print(f"Selecting Top 50 candidates...")
top_candidates.select("centroid_lat", "centroid_lon", "prob_success").show(10)

Using simulation features: Type='Piste Cyclable', Reseau='Réseau structurant et super structurant'
Selecting Top 50 candidates...
+------------------+------------------+-------------------+
|      centroid_lat|      centroid_lon|       prob_success|
+------------------+------------------+-------------------+
|45.755102040816325| 4.839795918367347|  0.720142494549613|
| 45.76122448979592| 4.839795918367347| 0.6062145753296765|
|45.755102040816325| 4.843877551020408| 0.5981640066632304|
| 45.75816326530612| 4.839795918367347| 0.5727544323939976|
| 45.76428571428572| 4.839795918367347| 0.5550570661721673|
|45.755102040816325|  4.84795918367347| 0.5398954227188242|
| 45.76734693877551| 4.839795918367347|0.47520390881350744|
|45.755102040816325|4.8520408163265305| 0.4571453235554165|
|45.755102040816325| 4.856122448979592| 0.4571453235554165|
| 45.77040816326531| 4.839795918367347| 0.4456252432126205|
+------------------+------------------+-------------------+
only showing top 10 rows


                                                                                

In [6]:
# =========================
# 6) EXPORT OUTPUT (JSON)
# =========================
output_file = "predictions_heatmap_lyon_2.json"

# Collect to Driver (small data) and write JSON
pdf_candidates = top_candidates.select("centroid_lat", "centroid_lon", "prob_success").toPandas()
pdf_candidates["recommendation"] = "Top-50 Potential"

pdf_candidates.to_json(output_file, orient='records', indent=4)
print(f"✅ Prediction Map exported to: {os.path.abspath(output_file)}")
print(f"Rows written: {len(pdf_candidates)}")
print(pdf_candidates.head())

✅ Prediction Map exported to: /Users/youness/Desktop/datathon_velomenaj/predictions_heatmap_lyon_2.json
Rows written: 50
   centroid_lat  centroid_lon  prob_success    recommendation
0     45.755102      4.839796      0.720142  Top-50 Potential
1     45.761224      4.839796      0.606215  Top-50 Potential
2     45.755102      4.843878      0.598164  Top-50 Potential
3     45.758163      4.839796      0.572754  Top-50 Potential
4     45.764286      4.839796      0.555057  Top-50 Potential
