In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

def show_schema(path, sep):
    df = (spark.read
          .option("header", True)
          .option("sep", sep)
          .option("inferSchema", False)
          .option("encoding", "UTF-8")
          .csv(path))
    print("\n===", path, f"(sep='{sep}') ===")
    print("nb_cols:", len(df.columns))
    print("cols:", df.columns)

# CSV en ';'
show_schema("data/bronze/comptage/sites/sites.csv", ";")
show_schema("data/bronze/amenagements/amenagements.csv", ";")      # ou le nom exact si unique
show_schema("data/bronze/comptage/channels/channels.csv", ";")

# CSV en ','
show_schema("data/bronze/comptage/measures/measures.csv", ",") # ou le nom exact si unique
show_schema("data/bronze/comptage_manuel/comptage_manuel.csv", ",")   # ton comptage manuel Croix-Rousse (CSV)



=== data/bronze/comptage/sites/sites.csv (sep=';') ===
nb_cols: 11
cols: ['gid', 'site_id', 'parent_site_id', 'fr_insee_code', 'xlong', 'ylat', 'external_ids', 'infrastructure_type', 'site_name', 'lon', 'lat']

=== data/bronze/amenagements/amenagements.csv (sep=';') ===
nb_cols: 23
cols: ['nom', 'commune1', 'insee1', 'commune2', 'insee2', 'reseau', 'financementac', 'typeamenagement', 'typeamenagement2', 'positionnement', 'senscirculation', 'environnement', 'localisation', 'typologiepiste', 'revetementpiste', 'domanialite', 'reglementation', 'zonecirculationapaisee', 'anneelivraison', 'longueur', 'observation', 'validite', 'gid']

=== data/bronze/comptage/channels/channels.csv (sep=';') ===
nb_cols: 19
cols: ['channel_id', 'channel_provider_id', 'site_provider_id', 'site_id', 'mobility_type', 'comment', 'counter_transmission_type', 'publication_transmission_type', 'counter_type', 'direction', 'provider_direction_code', 'provider_direction_name', 'data_provider_name', 'temporality', 'st

In [3]:
from pathlib import Path

src = Path("data/bronze/comptage_manuel/comptage_manuel.csv")
dst = Path("data/bronze/comptage_manuel/comptage_manuel_clean.csv")

lines = src.read_text(encoding="utf-8").splitlines()

# On garde à partir de la ligne qui commence par "Date," (header réel)
start_idx = next(i for i, l in enumerate(lines) if l.startswith("Date,"))

clean = "\n".join(lines[start_idx:]) + "\n"
dst.write_text(clean, encoding="utf-8")

print("Written:", dst)
print("First 3 lines of cleaned file:")
for l in clean.splitlines()[:3]:
    print(l)


Written: data\bronze\comptage_manuel\comptage_manuel_clean.csv
First 3 lines of cleaned file:
Date,Point comptage,Flux,Direction,Genre,Véhicule,Heure début,Heure fin,Nombre comptés,Remarques,Compteur,,
03/12/2024,Avenue des Cottages,Cottages,Descente,Indéfini/Mixte,Vélo,07:30,09:00,135,Cargos non distingués,Daniel,,
03/12/2024,Avenue des Cottages,Cottages,Descente,Indéfini/Mixte,Trotinette/EDPM,07:30,09:00,7,Cargos non distingués,Daniel,,


In [4]:
show_schema("data/bronze/comptage_manuel/comptage_manuel_clean.csv", ",")



=== data/bronze/comptage_manuel/comptage_manuel_clean.csv (sep=',') ===
nb_cols: 13
cols: ['Date', 'Point comptage', 'Flux', 'Direction', 'Genre', 'Véhicule', 'Heure début', 'Heure fin', 'Nombre comptés', 'Remarques', 'Compteur', '_c11', '_c12']


In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim
from pyspark.sql.types import StringType, FloatType, DoubleType
from pyspark.sql.functions import regexp_replace


spark = SparkSession.builder.getOrCreate()

# Lecture BRONZE
sites_bronze = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("encoding", "UTF-8")
    .csv("data/bronze/comptage/sites/sites.csv")
)

print("=== BRONZE sites ===")
sites_bronze.printSchema()
print("Rows:", sites_bronze.count())

# Sélection + renommage + typage (SILVER)
sites_silver = (
    sites_bronze
    .select(
        col("site_id").cast(StringType()).alias("site_id"),
        trim(col("site_name")).alias("site_name"),

        # virgule -> point, puis cast
        regexp_replace(col("lon"), ",", ".").cast(DoubleType()).alias("lon"),
        regexp_replace(col("lat"), ",", ".").cast(DoubleType()).alias("lat"),

        col("fr_insee_code").alias("insee_code"),
        col("infrastructure_type"),
        col("parent_site_id").cast(StringType()).alias("parent_site_id"),
    )
)

print("=== SILVER sites (preview) ===")
sites_silver.show(5, truncate=False)

# Écriture SILVER (overwrite volontaire)
(
    sites_silver
    .write
    .mode("overwrite")
    .parquet("data/silver/silver_sites")
)

print("silver_sites written to data/silver/silver_sites")


=== BRONZE sites ===
root
 |-- gid: string (nullable = true)
 |-- site_id: string (nullable = true)
 |-- parent_site_id: string (nullable = true)
 |-- fr_insee_code: string (nullable = true)
 |-- xlong: string (nullable = true)
 |-- ylat: string (nullable = true)
 |-- external_ids: string (nullable = true)
 |-- infrastructure_type: string (nullable = true)
 |-- site_name: string (nullable = true)
 |-- lon: string (nullable = true)
 |-- lat: string (nullable = true)

Rows: 280
=== SILVER sites (preview) ===
+--------------+---------------------------------------+------------------+-----------------+----------+-------------------+--------------+
|site_id       |site_name                              |lon               |lat              |insee_code|infrastructure_type|parent_site_id|
+--------------+---------------------------------------+------------------+-----------------+----------+-------------------+--------------+
|69149.00072.12|OULLINS_Gare routière                  |4.81512527  

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, when, regexp_replace
from pyspark.sql.types import StringType, IntegerType, DoubleType

spark = SparkSession.builder.getOrCreate()

amen_bronze = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("encoding", "UTF-8")
    .csv("data/bronze/amenagements/amenagements.csv")
)

print("=== BRONZE amenagements ===")
amen_bronze.printSchema()
print("Rows:", amen_bronze.count())

# Nettoyages légers + typage (SILVER enrichi)
amen_silver = (
    amen_bronze
    .select(
        col("gid").cast(StringType()).alias("amenagement_id"),

        trim(col("nom")).alias("nom"),

        trim(col("commune1")).alias("commune1"),
        trim(col("insee1")).alias("insee1"),
        trim(col("commune2")).alias("commune2"),
        trim(col("insee2")).alias("insee2"),

        trim(col("reseau")).alias("reseau"),
        trim(col("financementac")).alias("financementac"),

        trim(col("typeamenagement")).alias("typeamenagement"),
        trim(col("typeamenagement2")).alias("typeamenagement2"),
        trim(col("positionnement")).alias("positionnement"),
        trim(col("senscirculation")).alias("senscirculation"),
        trim(col("environnement")).alias("environnement"),
        trim(col("localisation")).alias("localisation"),
        trim(col("typologiepiste")).alias("typologiepiste"),
        trim(col("revetementpiste")).alias("revetementpiste"),
        trim(col("domanialite")).alias("domanialite"),
        trim(col("reglementation")).alias("reglementation"),
        trim(col("zonecirculationapaisee")).alias("zonecirculationapaisee"),

        # Typages importants
        regexp_replace(col("anneelivraison"), ",", ".").cast(IntegerType()).alias("anneelivraison"),
        regexp_replace(col("longueur"), ",", ".").cast(DoubleType()).alias("longueur_m"),

        trim(col("observation")).alias("observation"),

        # validite -> bool (robuste)
        when(lower(trim(col("validite"))).isin("oui", "true", "1"), True)
        .when(lower(trim(col("validite"))).isin("non", "false", "0"), False)
        .otherwise(None)
        .alias("is_valid")
    )
)

print("=== SILVER amenagements (preview) ===")
amen_silver.show(5, truncate=False)

(
    amen_silver
    .write
    .mode("overwrite")
    .parquet("data/silver/silver_amenagements")
)

print("silver_amenagements written to data/silver/silver_amenagements")


=== BRONZE amenagements ===
root
 |-- nom: string (nullable = true)
 |-- commune1: string (nullable = true)
 |-- insee1: string (nullable = true)
 |-- commune2: string (nullable = true)
 |-- insee2: string (nullable = true)
 |-- reseau: string (nullable = true)
 |-- financementac: string (nullable = true)
 |-- typeamenagement: string (nullable = true)
 |-- typeamenagement2: string (nullable = true)
 |-- positionnement: string (nullable = true)
 |-- senscirculation: string (nullable = true)
 |-- environnement: string (nullable = true)
 |-- localisation: string (nullable = true)
 |-- typologiepiste: string (nullable = true)
 |-- revetementpiste: string (nullable = true)
 |-- domanialite: string (nullable = true)
 |-- reglementation: string (nullable = true)
 |-- zonecirculationapaisee: string (nullable = true)
 |-- anneelivraison: string (nullable = true)
 |-- longueur: string (nullable = true)
 |-- observation: string (nullable = true)
 |-- validite: string (nullable = true)
 |-- gid: s

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, when, to_timestamp
from pyspark.sql.types import StringType, BooleanType, IntegerType

spark = SparkSession.builder.getOrCreate()

channels_bronze = (
    spark.read
    .option("header", True)
    .option("sep", ";")
    .option("encoding", "UTF-8")
    .csv("data/bronze/comptage/channels/channels.csv")
)

print("=== BRONZE channels ===")
channels_bronze.printSchema()
print("Rows:", channels_bronze.count())

# On garde les champs analytiques + temporels, on jette les provider_* (tech)
channels_silver = (
    channels_bronze
    .select(
        col("channel_id").cast(StringType()).alias("channel_id"),
        col("site_id").cast(StringType()).alias("site_id"),

        trim(col("mobility_type")).alias("mobility_type"),

        # utile pour interprétation des flux
        trim(col("direction")).alias("direction"),
        trim(col("counter_type")).alias("counter_type"),
        trim(col("temporality")).alias("temporality"),
        trim(col("counter_transmission_type")).alias("counter_transmission_type"),
        trim(col("publication_transmission_type")).alias("publication_transmission_type"),
        trim(col("time_step")).cast(IntegerType()).alias("time_step"),

        # bornes d’activité du channel (important pour analyses avant/après)
        to_timestamp(col("started_at")).alias("started_at"),
        to_timestamp(col("ended_at")).alias("ended_at")
    )
    # Feature utile : est-ce un channel vélo ?
    .withColumn(
        "is_bike_channel",
        when(lower(trim(col("mobility_type"))).isin("velo", "vélo", "bike", "bicycle", "cycling"), True)
        .otherwise(False)
        .cast(BooleanType())
    )
)

print("=== SILVER channels (preview) ===")
channels_silver.show(5, truncate=False)

(
    channels_silver
    .write
    .mode("overwrite")
    .parquet("data/silver/silver_channels")
)

print("silver_channels written to data/silver/silver_channels")


=== BRONZE channels ===
root
 |-- channel_id: string (nullable = true)
 |-- channel_provider_id: string (nullable = true)
 |-- site_provider_id: string (nullable = true)
 |-- site_id: string (nullable = true)
 |-- mobility_type: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- counter_transmission_type: string (nullable = true)
 |-- publication_transmission_type: string (nullable = true)
 |-- counter_type: string (nullable = true)
 |-- direction: string (nullable = true)
 |-- provider_direction_code: string (nullable = true)
 |-- provider_direction_name: string (nullable = true)
 |-- data_provider_name: string (nullable = true)
 |-- temporality: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- ended_at: string (nullable = true)
 |-- last_updated_at: string (nullable = true)
 |-- time_step: string (nullable = true)
 |-- provider_portal_url: string (nullable = true)

Rows: 1302
=== SILVER channels (preview) ===
+-----------------+--------------+

In [8]:
channels_silver.groupBy("is_bike_channel").count().show()

+---------------+-----+
|is_bike_channel|count|
+---------------+-----+
|           true|  674|
|          false|  628|
+---------------+-----+



In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, to_date, hour, when
from pyspark.sql.types import StringType, IntegerType, BooleanType

spark = SparkSession.builder.getOrCreate()

measures_bronze = (
    spark.read
    .option("header", True)
    .option("sep", ",")
    .option("encoding", "UTF-8")
    .csv("data/bronze/comptage/measures/measures.csv")
)

print("=== BRONZE measures ===")
measures_bronze.printSchema()
print("Rows:", measures_bronze.count())

# Typage + colonnes utiles (SILVER enrichi)
measures_silver = (
    measures_bronze
    .select(
        col("channel_id").cast(StringType()).alias("channel_id"),
        to_timestamp(col("start_datetime")).alias("ts_start"),
        to_timestamp(col("end_datetime")).alias("ts_end"),
        col("count").cast(IntegerType()).alias("flux")
    )
    .withColumn("date", to_date(col("ts_start")))
    .withColumn("hour", hour(col("ts_start")))
    .withColumn("is_valid", when(col("flux").isNotNull() & (col("flux") >= 0) & col("channel_id").isNotNull(), True).otherwise(False).cast(BooleanType()))
)

print("=== SILVER measures (preview) ===")
measures_silver.show(5, truncate=False)

# Écriture partitionnée (important pour Spark)
(
    measures_silver
    .write
    .mode("overwrite")
    .partitionBy("date")
    .parquet("data/silver/silver_measures")
)

print("silver_measures written to data/silver/silver_measures (partitioned by date)")


=== BRONZE measures ===
root
 |-- channel_id: string (nullable = true)
 |-- counter_id: string (nullable = true)
 |-- start_datetime: string (nullable = true)
 |-- end_datetime: string (nullable = true)
 |-- count: string (nullable = true)

Rows: 45339893
=== SILVER measures (preview) ===
+----------+-------------------+-------------------+----+----------+----+--------+
|channel_id|ts_start           |ts_end             |flux|date      |hour|is_valid|
+----------+-------------------+-------------------+----+----------+----+--------+
|100029813 |2023-04-06 01:00:00|2023-04-06 02:00:00|0   |2023-04-06|1   |true    |
|100029813 |2023-04-06 00:00:00|2023-04-06 01:00:00|0   |2023-04-06|0   |true    |
|100029813 |2023-04-05 23:00:00|2023-04-06 00:00:00|0   |2023-04-05|23  |true    |
|100029813 |2023-04-05 22:00:00|2023-04-05 23:00:00|1   |2023-04-05|22  |true    |
|100029813 |2023-04-05 21:00:00|2023-04-05 22:00:00|0   |2023-04-05|21  |true    |
+----------+-------------------+--------------

In [10]:
measures_silver.printSchema()
from pyspark.sql.functions import min, max, sum as Fsum

measures_silver.select(min("date"), max("date")).show()
measures_silver.groupBy("is_valid").count().show()


root
 |-- channel_id: string (nullable = true)
 |-- ts_start: timestamp (nullable = true)
 |-- ts_end: timestamp (nullable = true)
 |-- flux: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- is_valid: boolean (nullable = false)

+----------+----------+
| min(date)| max(date)|
+----------+----------+
|2007-01-18|2025-12-18|
+----------+----------+

+--------+--------+
|is_valid|   count|
+--------+--------+
|    true|45339893|
+--------+--------+



In [11]:
# =========================
# SILVER - Comptage manuel (dates robustes)
# =========================
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lit, coalesce, expr
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
print("Spark version:", spark.version)

# 1) Lecture BRONZE
manual_bronze = (
    spark.read
    .option("header", True)
    .option("sep", ",")
    .option("encoding", "UTF-8")
    .csv("data/bronze/comptage_manuel/comptage_manuel_clean.csv")
)

print("=== BRONZE manual counts ===")
manual_bronze.printSchema()
print("Rows:", manual_bronze.count())

# 2) Transformation SILVER
# Astuce: on trim la colonne Date pour enlever espaces éventuels
# Puis: date = to_date(try_to_timestamp(...)) avec fallback yyyy puis yy
manual_silver = (
    manual_bronze
    .select(
        trim(col("Point comptage")).alias("manual_site_name"),

        coalesce(
            expr("to_date(try_to_timestamp(trim(Date), 'dd/MM/yyyy'))"),
            expr("to_date(try_to_timestamp(trim(Date), 'dd/MM/yy'))")
        ).alias("date"),

        trim(col("Flux")).alias("flux_label"),
        trim(col("Direction")).alias("direction"),
        trim(col("Genre")).alias("genre"),
        trim(col("Véhicule")).alias("vehicule_type"),

        trim(col("Heure début")).alias("start_time"),
        trim(col("Heure fin")).alias("end_time"),

        col("Nombre comptés").cast(IntegerType()).alias("count"),
        trim(col("Remarques")).alias("remarks"),

        lit("manual").alias("source")
    )
)

print("=== SILVER manual counts (preview) ===")
manual_silver.show(10, truncate=False)

# 3) Contrôle qualité : dates non parsées
bad_dates = manual_silver.filter(col("date").isNull()) \
    .select("manual_site_name", "date", "start_time", "end_time", "count")

print("Rows with date=NULL:", bad_dates.count())
bad_dates.show(30, truncate=False)

# 4) Écriture SILVER
(
    manual_silver
    .write
    .mode("overwrite")
    .parquet("data/silver/silver_manual_counts")
)

print(" silver_manual_counts written to data/silver/silver_manual_counts")


Spark version: 4.0.1
=== BRONZE manual counts ===
root
 |-- Date: string (nullable = true)
 |-- Point comptage: string (nullable = true)
 |-- Flux: string (nullable = true)
 |-- Direction: string (nullable = true)
 |-- Genre: string (nullable = true)
 |-- Véhicule: string (nullable = true)
 |-- Heure début: string (nullable = true)
 |-- Heure fin: string (nullable = true)
 |-- Nombre comptés: string (nullable = true)
 |-- Remarques: string (nullable = true)
 |-- Compteur: string (nullable = true)
 |-- _c11: string (nullable = true)
 |-- _c12: string (nullable = true)

Rows: 3355
=== SILVER manual counts (preview) ===
+-------------------+----------+------------------+---------+--------------+---------------+----------+--------+-----+---------------------+------+
|manual_site_name   |date      |flux_label        |direction|genre         |vehicule_type  |start_time|end_time|count|remarks              |source|
+-------------------+----------+------------------+---------+--------------+---

In [12]:
manual_silver.printSchema()
manual_silver.select("date", "start_time", "end_time", "count").show(10, truncate=False)


root
 |-- manual_site_name: string (nullable = true)
 |-- date: date (nullable = true)
 |-- flux_label: string (nullable = true)
 |-- direction: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- vehicule_type: string (nullable = true)
 |-- start_time: string (nullable = true)
 |-- end_time: string (nullable = true)
 |-- count: integer (nullable = true)
 |-- remarks: string (nullable = true)
 |-- source: string (nullable = false)

+----------+----------+--------+-----+
|date      |start_time|end_time|count|
+----------+----------+--------+-----+
|2024-12-03|07:30     |09:00   |135  |
|2024-12-03|07:30     |09:00   |7    |
|2024-12-03|07:30     |09:00   |25   |
|2024-12-03|07:30     |09:00   |5    |
|2024-12-03|07:30     |09:00   |43   |
|2024-12-03|07:30     |09:00   |2    |
|2024-12-03|07:30     |09:00   |17   |
|2024-12-03|07:30     |09:00   |1    |
|2024-12-03|07:30     |09:00   |27   |
|2024-12-03|07:30     |09:00   |4    |
+----------+----------+--------+-----+
onl

In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum as Fsum, count as Fcount, countDistinct, min, max

In [14]:
from pyspark.sql.types import DateType, IntegerType, BooleanType

In [15]:


# Inputs
measures = spark.read.parquet("data/silver/silver_measures")
channels = spark.read.parquet("data/silver/silver_channels").select("channel_id").distinct()

# Fenêtre demandée
start_date = lit("2014-01-01").cast(DateType())
end_date   = lit("2025-12-01").cast(DateType())

# 1) Filtrer fenêtre + lignes inutilisables (NULLs + négatifs)
m0 = (
    measures
    .filter((col("date") >= start_date) & (col("date") <= end_date))
    .filter(col("channel_id").isNotNull())
    .filter(col("date").isNotNull())
    .filter(col("flux").isNotNull())
    .filter(col("flux") >= 0)
)

# 2) Enforcer "channel_id must exist in silver_channels" (semi-join)
m1 = m0.join(channels, on="channel_id", how="inner")

# 3) Agrégation journalière pour gérer doublons
#    1 ligne = channel_id x date
measures_daily = (
    m1.groupBy("channel_id", "date")
      .agg(Fsum(col("flux").cast(IntegerType())).alias("flux"))
      .withColumn("is_valid", lit(True).cast(BooleanType()))
)

# 4) Contraintes finales (no NULL + int >= 0)
measures_daily = (
    measures_daily
    .filter(col("channel_id").isNotNull())
    .filter(col("date").isNotNull())
    .filter(col("flux").isNotNull())
    .filter(col("is_valid").isNotNull())
    .filter(col("flux") >= 0)
)

# Audit rapide
before = measures.count()
after = measures_daily.count()

print("=== Measures DAILY clean audit ===")
print("Rows before (raw measures):", before)
print("Rows after  (daily clean) :", after)

measures_daily.select(min("date").alias("min_date"), max("date").alias("max_date")).show()

# Vérif: aucun channel inconnu
unknown = (
    measures_daily.select("channel_id").distinct()
    .join(channels, on="channel_id", how="left_anti")
    .count()
)
print("Unknown channel_id in daily:", unknown)

# Vérif: NULLs
nulls = measures_daily.select(
    (Fsum(col("channel_id").isNull().cast("int"))).alias("null_channel_id"),
    (Fsum(col("date").isNull().cast("int"))).alias("null_date"),
    (Fsum(col("flux").isNull().cast("int"))).alias("null_flux"),
    (Fsum(col("is_valid").isNull().cast("int"))).alias("null_is_valid"),
).collect()[0]
print("NULL counts:", dict(nulls.asDict()))

# Écriture partitionnée
(
    measures_daily
    .write
    .mode("overwrite")
    .partitionBy("date")
    .parquet("data/silver/silver_measures_daily_clean")
)

print("Written: data/silver/silver_measures_daily_clean")


=== Measures DAILY clean audit ===
Rows before (raw measures): 45339893
Rows after  (daily clean) : 1544994
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2014-01-01|2025-12-01|
+----------+----------+

Unknown channel_id in daily: 0
NULL counts: {'null_channel_id': 0, 'null_date': 0, 'null_flux': 0, 'null_is_valid': 0}
Written: data/silver/silver_measures_daily_clean


In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, trim, lower, regexp_replace, sum as Fsum, min, max
from pyspark.sql.types import DateType, IntegerType, BooleanType

spark = SparkSession.builder.getOrCreate()

manual = spark.read.parquet("data/silver/silver_manual_counts")

# Fenêtre demandée
start_date = lit("2014-01-01").cast(DateType())
end_date   = lit("2025-12-01").cast(DateType())

# 1) Nettoyage + normalisation
m0 = (
    manual
    .withColumn("manual_site_name", trim(col("manual_site_name")))
    .withColumn("direction", trim(col("direction")))
    # normaliser direction (ex: "Descente", "Montée", etc.)
    .withColumn("direction_norm", lower(col("direction")))
    # flux = count
    .withColumn("flux", col("count").cast(IntegerType()))
    .drop("count")  # on standardise sur "flux"
    # Filtre période + contraintes non-NULL + non-négatif
    .filter((col("date") >= start_date) & (col("date") <= end_date))
    .filter(col("manual_site_name").isNotNull() & (col("manual_site_name") != ""))
    .filter(col("direction").isNotNull() & (col("direction") != ""))
    .filter(col("flux").isNotNull())
    .filter(col("flux") >= 0)
)

# 2) Agrégation journalière (compatibilité avec measures daily)
# 1 ligne = date x point x direction
manual_daily = (
    m0.groupBy("date", "manual_site_name", "direction")
      .agg(Fsum(col("flux")).alias("flux"))
      .withColumn("is_valid", lit(True).cast(BooleanType()))
      .withColumn("source", lit("manual"))
)

# 3) Clé standardisée pour combinaisons (sans channel_id)
# manual_key = nom_point normalisé + direction normalisée
manual_daily = (
    manual_daily
    .withColumn(
        "manual_key",
        regexp_replace(lower(col("manual_site_name")), r"\s+", " ")
    )
)

print("=== Manual DAILY clean audit ===")
manual_daily.select(min("date").alias("min_date"), max("date").alias("max_date")).show()
manual_daily.show(10, truncate=False)

(
    manual_daily
    .write
    .mode("overwrite")
    .partitionBy("date")
    .parquet("data/silver/silver_manual_counts_daily_clean")
)

print("✅ Written: data/silver/silver_manual_counts_daily_clean")


=== Manual DAILY clean audit ===
+----------+----------+
|  min_date|  max_date|
+----------+----------+
|2018-12-07|2025-06-10|
+----------+----------+

+----------+--------------------------+--------------------+----+--------+------+--------------------------+
|date      |manual_site_name          |direction           |flux|is_valid|source|manual_key                |
+----------+--------------------------+--------------------+----+--------+------+--------------------------+
|2019-12-10|Pont Poincaré             |Toutes              |64  |true    |manual|pont poincaré             |
|2025-06-10|Quai Clémenceau/Castellane|Toutes              |590 |true    |manual|quai clémenceau/castellane|
|2025-06-10|Bvd des Canuts            |Toutes              |2228|true    |manual|bvd des canuts            |
|2022-12-06|Margnolles / Oratoire     |Toutes              |46  |true    |manual|margnolles / oratoire     |
|2024-12-03|La Buissière              |vers Chemin Bussière|9   |true    |manual|la

In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

In [18]:



# === Inputs ===
auto = spark.read.parquet("data/silver/silver_measures_daily_clean")
manual = spark.read.parquet("data/silver/silver_manual_counts_daily_clean")

# === Harmonisation AUTOMATIQUE ===
auto_std = (
    auto
    .select(
        col("date"),
        col("channel_id").alias("counting_id"),
        col("flux"),
        lit("automatic").alias("source")
    )
)

# === Harmonisation MANUEL ===
manual_std = (
    manual
    .select(
        col("date"),
        col("manual_key").alias("counting_id"),
        col("flux"),
        col("source")  # déjà "manual"
    )
)

# === Union logique ===
daily_comparable = auto_std.unionByName(manual_std)

# === Audit ===
print("=== Comparable DAILY usage audit ===")
daily_comparable.groupBy("source").count().show()
daily_comparable.select("source").distinct().show()
daily_comparable.show(10, truncate=False)

# === Écriture GOLD (comparaison) ===
(
    daily_comparable
    .write
    .mode("overwrite")
    .partitionBy("date")
    .parquet("data/gold/gold_daily_usage_comparable")
)

print(" Written: data/gold/gold_daily_usage_comparable")


=== Comparable DAILY usage audit ===
+---------+-------+
|   source|  count|
+---------+-------+
|automatic|1544994|
|   manual|    143|
+---------+-------+

+---------+
|   source|
+---------+
|automatic|
|   manual|
+---------+

+----------+-----------+----+---------+
|date      |counting_id|flux|source   |
+----------+-----------+----+---------+
|2025-01-10|353263595  |591 |automatic|
|2025-01-10|353303358  |615 |automatic|
|2025-01-10|353452417  |61  |automatic|
|2025-01-10|353466539  |4324|automatic|
|2025-01-10|353293865  |115 |automatic|
|2025-01-10|353327354  |645 |automatic|
|2025-01-10|353467813  |9   |automatic|
|2025-01-10|353486500  |0   |automatic|
|2025-01-10|353340940  |5   |automatic|
|2025-01-10|353359669  |167 |automatic|
+----------+-----------+----+---------+
only showing top 10 rows
 Written: data/gold/gold_daily_usage_comparable


In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower
from pyspark.sql.types import StringType, DoubleType, IntegerType, DateType

spark = SparkSession.builder.getOrCreate()

# Inputs
manual_counts = spark.read.parquet("data/silver/silver_manual_counts")
manual_geo = (
    spark.read
    .option("header", True)
    .option("sep", ",")
    .option("encoding", "UTF-8")
    .csv("data/bronze/comptage_manuel/manual_sites_geo.csv")
)

# 1) Construire silver_manual_sites (filtrer status ok, typage, normaliser clé)
silver_manual_sites = (
    manual_geo
    .withColumn("manual_site_name", trim(col("manual_site_name")).cast(StringType()))
    .withColumn("address_query", trim(col("address_query")).cast(StringType()))
    .withColumn("status", lower(trim(col("status"))).cast(StringType()))
    .withColumn("lat", col("lat").cast(DoubleType()))
    .withColumn("lon", col("lon").cast(DoubleType()))
    .withColumn("display_name", col("display_name").cast(StringType()))
    .filter(col("status") == "ok")
    .filter(col("manual_site_name").isNotNull())
    .filter(col("lat").isNotNull() & col("lon").isNotNull())
    # garde 1 ligne par site (au cas où)
    .dropDuplicates(["manual_site_name"])
)

print("=== silver_manual_sites preview ===")
silver_manual_sites.show(50, truncate=False)

silver_manual_sites.write.mode("overwrite").parquet("data/silver/silver_manual_sites")
print("✅ Written: data/silver/silver_manual_sites")

# 2) Enrichir les comptages manuels avec coordonnées (join)
silver_manual_counts_geo = (
    manual_counts
    .withColumn("manual_site_name", trim(col("manual_site_name")))
    .join(
        silver_manual_sites.select("manual_site_name", "lat", "lon"),
        on="manual_site_name",
        how="left"
    )
)

# Contrôle : lignes sans géocodage (doit être 0 si tout bon)
missing_geo = silver_manual_counts_geo.filter(col("lat").isNull() | col("lon").isNull()).count()
print("Rows with missing geo:", missing_geo)

print("=== silver_manual_counts_geo preview ===")
silver_manual_counts_geo.show(20, truncate=False)

silver_manual_counts_geo.write.mode("overwrite").parquet("data/silver/silver_manual_counts_geo")
print("✅ Written: data/silver/silver_manual_counts_geo")


=== silver_manual_sites preview ===
+-----------------------+-------------------------------------+----------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+------------+------+
|manual_site_name       |address_query                        |lat       |lon      |display_name                                                                                                                                                                                              |status|n_candidates|mode  |
+-----------------------+-------------------------------------+----------+---------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+------------+------+
|Avenue des Cottage

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, regexp_replace, count as Fcount, countDistinct
from pyspark.sql.types import StringType, DoubleType

spark = SparkSession.builder.getOrCreate()

# Inputs
manual_counts = spark.read.parquet("data/silver/silver_manual_counts")
manual_sites  = spark.read.parquet("data/silver/silver_manual_sites")

# --- Helper: clé normalisée simple (sans accents) ---
# 1) lower
# 2) trim
# 3) remplace tout ce qui n'est pas lettre/chiffre par un espace
# 4) compacte les espaces
def with_key(df, name_col):
    return (
        df
        .withColumn(name_col, trim(col(name_col)))
        .withColumn("manual_site_key", lower(col(name_col)))
        .withColumn("manual_site_key", regexp_replace(col("manual_site_key"), r"[^a-z0-9]+", " "))
        .withColumn("manual_site_key", regexp_replace(col("manual_site_key"), r"\s+", " "))
        .withColumn("manual_site_key", trim(col("manual_site_key")))
    )

sites_k = with_key(manual_sites, "manual_site_name") \
    .withColumn("lat", col("lat").cast(DoubleType())) \
    .withColumn("lon", col("lon").cast(DoubleType())) \
    .dropDuplicates(["manual_site_key"])  # dédoublonnage robuste (Maison voie verte)

counts_k = with_key(manual_counts, "manual_site_name")

# Join sur la clé normalisée
counts_geo_v2 = (
    counts_k
    .join(sites_k.select("manual_site_key", "lat", "lon"), on="manual_site_key", how="left")
)

missing_rows = counts_geo_v2.filter(col("lat").isNull() | col("lon").isNull()).count()
missing_sites = (
    counts_geo_v2
    .filter(col("lat").isNull() | col("lon").isNull())
    .select("manual_site_name", "manual_site_key")
    .distinct()
    .orderBy("manual_site_name")
)

print("=== Join v2 report ===")
print("Rows total:", counts_geo_v2.count())
print("Rows missing geo:", missing_rows)
print("Distinct missing sites:", missing_sites.count())

print("\n--- Sample of missing sites (up to 50) ---")
missing_sites.show(50, truncate=False)

# Écriture v2
sites_k.write.mode("overwrite").parquet("data/silver/silver_manual_sites_v2")
counts_geo_v2.write.mode("overwrite").parquet("data/silver/silver_manual_counts_geo_v2")

print("✅ Written: data/silver/silver_manual_sites_v2")
print("✅ Written: data/silver/silver_manual_counts_geo_v2")


=== Join v2 report ===
Rows total: 3355
Rows missing geo: 1653
Distinct missing sites: 13

--- Sample of missing sites (up to 50) ---
+--------------------------------+--------------------------------+
|manual_site_name                |manual_site_key                 |
+--------------------------------+--------------------------------+
|NULL                            |NULL                            |
|Bas Montée Bonnafous            |bas mont e bonnafous            |
|Bas Montée St Sébastien         |bas mont e st s bastien         |
|Bvd des Canuts                  |bvd des canuts                  |
|Carrefour St Clair              |carrefour st clair              |
|Croisement Bonnafous/Herbouville|croisement bonnafous herbouville|
|Féminin                         |f minin                         |
|Indéfini/Mixte                  |ind fini mixte                  |
|Margnolles / Oratoire           |margnolles oratoire             |
|Masculin                        |masculin        

In [21]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, regexp_replace, lit
from pyspark.sql.types import BooleanType

spark = SparkSession.builder.getOrCreate()

manual_counts = spark.read.parquet("data/silver/silver_manual_counts")
manual_sites  = spark.read.parquet("data/silver/silver_manual_sites_v2")

# fonction clé normalisée (comme avant)
def with_key(df, name_col):
    return (
        df
        .withColumn(name_col, trim(col(name_col)))
        .withColumn("manual_site_key", lower(col(name_col)))
        .withColumn("manual_site_key", regexp_replace(col("manual_site_key"), r"[^a-z0-9]+", " "))
        .withColumn("manual_site_key", regexp_replace(col("manual_site_key"), r"\s+", " "))
        .withColumn("manual_site_key", trim(col("manual_site_key")))
    )

sites_k = with_key(manual_sites, "manual_site_name") \
    .select("manual_site_key", "manual_site_name", "lat", "lon") \
    .dropDuplicates(["manual_site_key"])

counts_k = with_key(manual_counts, "manual_site_name")

# 1) On garde uniquement les lignes dont manual_site_key existe dans les sites géocodés
counts_clean = (
    counts_k
    .join(sites_k.select("manual_site_key"), on="manual_site_key", how="inner")
)

# 2) Refaire la jointure géographique (maintenant elle doit matcher)
counts_geo_clean = (
    counts_clean
    .join(sites_k.select("manual_site_key", "lat", "lon"), on="manual_site_key", how="left")
)

missing_geo = counts_geo_clean.filter(col("lat").isNull() | col("lon").isNull()).count()
print("Rows after cleaning:", counts_clean.count())
print("Rows missing geo after cleaning:", missing_geo)

# Écriture
counts_clean.write.mode("overwrite").parquet("data/silver/silver_manual_counts_clean")
counts_geo_clean.write.mode("overwrite").parquet("data/silver/silver_manual_counts_geo_clean")

print("✅ Written: data/silver/silver_manual_counts_clean")
print("✅ Written: data/silver/silver_manual_counts_geo_clean")


Rows after cleaning: 1702
Rows missing geo after cleaning: 0
✅ Written: data/silver/silver_manual_counts_clean
✅ Written: data/silver/silver_manual_counts_geo_clean


In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as Fsum, count as Fcount, trim, lower, lit
from pyspark.sql.types import DateType

spark = SparkSession.builder.getOrCreate()

manual_geo = spark.read.parquet("data/silver/silver_manual_counts_geo_clean")

# Fenêtre demandée
start_date = lit("2014-01-01").cast(DateType())
end_date   = lit("2025-12-01").cast(DateType())

manual_daily = (
    manual_geo
    .filter((col("date") >= start_date) & (col("date") <= end_date))
    .filter(col("count").isNotNull() & (col("count") >= 0))
    .withColumn("direction", lower(trim(col("direction"))))
    .withColumn("vehicule_type", lower(trim(col("vehicule_type"))))
    .groupBy("manual_site_name", "date", "direction", "vehicule_type", "lat", "lon")
    .agg(
        Fsum(col("count")).alias("manual_flux"),
        Fcount(lit(1)).alias("n_records")
    )
)

print("=== manual_daily preview ===")
manual_daily.show(30, truncate=False)

manual_daily.write.mode("overwrite").partitionBy("date").parquet("data/silver/silver_manual_daily_clean")
print("✅ Written: data/silver/silver_manual_daily_clean (partitioned by date)")


=== manual_daily preview ===
+-----------------------+----------+-----------------------------------+-----------------------+----------+---------+-----------+---------+
|manual_site_name       |date      |direction                          |vehicule_type          |lat       |lon      |manual_flux|n_records|
+-----------------------+----------+-----------------------------------+-----------------------+----------+---------+-----------+---------+
|Bas Montée des Forts   |2024-12-03|lyon -> montée                     |vélo                   |45.7940321|4.8282233|4          |4        |
|Montée Sathonay Village|2023-12-05|toutes                             |vélo                   |45.8291423|4.8796227|26         |3        |
|Rue de St-Cyr          |2025-06-10|vers st-cyr                        |trotinette/edpm        |45.7854107|4.8078344|12         |24       |
|Piscine Loup Pendu     |2024-12-03|NULL                               |trotinette/edpm        |45.8155609|4.8874948|14         |14

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, max as spark_max, row_number
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()

# 1. Charger les sites AUTOMATIQUES
df_auto_sites = spark.read.parquet("data/silver/silver_sites")

# 2. Trouver le "Plafond" (Max ID) en toute sécurité
# CORRECTION : On filtre d'abord pour ne garder que les chaînes composées uniquement de chiffres.
# Le symbole "^[0-9]+$" est une expression régulière qui veut dire "Que des chiffres du début à la fin".
max_auto_row = df_auto_sites \
    .filter(col("site_id").rlike("^[0-9]+$")) \
    .select(col("site_id").cast(IntegerType()).alias("id_int")) \
    .agg(spark_max("id_int")) \
    .collect()

max_auto_id = max_auto_row[0][0]

# Sécurité (si la table est vide)
if max_auto_id is None:
    max_auto_id = 900000000

print(f"On commencera la numérotation des manuels après l'ID : {max_auto_id}")

# 3. Charger les sites MANUELS (Version V2 nettoyée)
df_manual_sites = spark.read.parquet("data/silver/silver_manual_sites_v2")

# 4. Générer des IDs uniques pour les sites manuels
w = Window.orderBy("manual_site_name")

df_manual_sites_with_id = df_manual_sites.withColumn(
    "site_id_generated", 
    lit(max_auto_id) + row_number().over(w)
)

# 5. Sauvegarder cette table de référence
output_path = "data/silver/silver_manual_sites_v3_ids"
df_manual_sites_with_id.write.mode("overwrite").parquet(output_path)
print(f"✅ Mapping généré : {output_path}")

On commencera la numérotation des manuels après l'ID : 300062480
✅ Mapping généré : data/silver/silver_manual_sites_v3_ids


In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, trim, lower, regexp_replace, sum as Fsum
from pyspark.sql.types import StringType, IntegerType

spark = SparkSession.builder.getOrCreate()

# ====================
# 1. AUTO (capteurs) - ON NE TOUCHE PAS AUX IDs
# ====================
auto_daily = spark.read.parquet("data/silver/silver_measures_daily_clean")
channels   = spark.read.parquet("data/silver/silver_channels").select("channel_id", "site_id", "is_bike_channel")
sites      = spark.read.parquet("data/silver/silver_sites").select("site_id", "lat", "lon")

auto = (
    auto_daily
    .join(channels, on="channel_id", how="inner")
    .filter(col("is_bike_channel") == True)
    .join(sites, on="site_id", how="inner")
    .groupBy("site_id", "date", "lat", "lon")
    .agg(Fsum(col("flux")).cast(IntegerType()).alias("flux"))
    # ICI : On garde site_id tel quel (String) et on le renomme point_id
    .withColumn("point_id", col("site_id").cast(StringType())) 
    .withColumn("point_type", lit("auto"))
    .withColumn("direction", lit(None).cast(StringType()))
    .withColumn("vehicule_type", lit(None).cast(StringType()))
    .select("point_id", "point_type", "date", "flux", "lat", "lon", "direction", "vehicule_type")
)

# ====================
# 2. MANUAL (comptages manuels)
# ====================
manual_daily = spark.read.parquet("data/silver/silver_manual_daily_clean")
# On charge les IDs générés à l'étape d'avant
manual_sites_ids = spark.read.parquet("data/silver/silver_manual_sites_v3_ids").select("manual_site_name", "site_id_generated")

manual = (
    manual_daily
    .join(manual_sites_ids, on="manual_site_name", how="left")
    .groupBy("site_id_generated", "date", "lat", "lon")
    .agg(Fsum(col("manual_flux")).cast(IntegerType()).alias("flux"))
    # ICI : On prend l'ID généré (entier) et on le convertit en String pour matcher avec Auto
    .withColumn("point_id", col("site_id_generated").cast(StringType()))
    .withColumn("point_type", lit("manual"))
    .withColumn("direction", lit(None).cast(StringType()))
    .withColumn("vehicule_type", lit(None).cast(StringType()))
    .select("point_id", "point_type", "date", "flux", "lat", "lon", "direction", "vehicule_type")
)

# ====================
# 3. UNION
# ====================
silver_measures_union = auto.unionByName(manual)

print("=== Aperçu Final (point_id est de type STRING) ===")
silver_measures_union.show(20, truncate=False)

# Vérification du schéma : point_id doit être StringType
silver_measures_union.printSchema()

silver_measures_union.groupBy("point_type").count().show()


# Écriture
(
    silver_measures_union
    .write
    .mode("overwrite")
    .partitionBy("date")
    .parquet("data/silver/silver_measures_union")
)

print("✅ Terminé : data/silver/silver_measures_union")

=== Aperçu Final (point_id est de type STRING) ===
+---------+----------+----------+----+------------------+------------------+---------+-------------+
|point_id |point_type|date      |flux|lat               |lon               |direction|vehicule_type|
+---------+----------+----------+----+------------------+------------------+---------+-------------+
|100024515|auto      |2025-04-25|675 |45.78389          |4.89176           |NULL     |NULL         |
|300017148|auto      |2025-04-30|1131|45.76328678443991 |4.8297518491745   |NULL     |NULL         |
|300028348|auto      |2023-09-21|840 |45.796105007561295|4.831731319427491 |NULL     |NULL         |
|300039246|auto      |2025-05-04|103 |45.784496254038174|4.808114469051362 |NULL     |NULL         |
|100024515|auto      |2025-05-04|441 |45.78389          |4.89176           |NULL     |NULL         |
|100052661|auto      |2024-08-28|126 |45.7616829690733  |4.82730039680372  |NULL     |NULL         |
|300030753|auto      |2024-09-19|1654|45

In [34]:
df_auto_sites = spark.read.parquet("data/silver/silver_sites")
df_auto_sites.show(280)

+--------------+--------------------+------------------+------------------+----------+-------------------+--------------+
|       site_id|           site_name|               lon|               lat|insee_code|infrastructure_type|parent_site_id|
+--------------+--------------------+------------------+------------------+----------+-------------------+--------------+
|69149.00072.12|OULLINS_Gare rout...|        4.81512527|        45.7168309|     69600|OTHER SPECIFIC SITE|69149.00072.12|
|69194.00013.02|SAINT DIDIER_59 A...|       4.798341188|       45.81053311|     69370|OTHER SPECIFIC SITE|69194.00013.02|
|     100017792|Vaulx en Velin_Po...|4.9357686189513625| 45.77073817594027|     69256|               NULL|          NULL|
|     100017793|Décines_Passerell...| 4.954341799020768| 45.77486440927123|     69275|               NULL|          NULL|
|     100024515|Villeurbanne_Pont...|           4.89176|          45.78389|     69266|               NULL|          NULL|
|     100026724|Lyon 8/V

In [35]:
import json
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()

# ==============================================================================
# 1. RÉCUPÉRATION DES DONNÉES MANUELLES (Avec les bons IDs)
# ==============================================================================
print("Lecture des données manuelles...")

# A. Les coordonnées (lat/lon)
df_coords = spark.read.parquet("data/silver/silver_manual_sites_v2") \
    .select("manual_site_name", "lat", "lon")

# B. Les IDs générés (les fameux "300062...")
df_ids = spark.read.parquet("data/silver/silver_manual_sites_v3_ids") \
    .select("manual_site_name", "site_id_generated")

# C. Jointure pour tout avoir au même endroit
# On convertit le résultat en liste de dictionnaires Python pour manipuler le JSON facilement
manual_sites_list = (
    df_coords.join(df_ids, on="manual_site_name", how="inner")
    .withColumn("site_id_str", col("site_id_generated").cast("string")) # Important : STRING
    .collect()
)

print(f"{len(manual_sites_list)} sites manuels récupérés avec succès.")


# ==============================================================================
# 2. CHARGEMENT ET MISE À JOUR DU FICHIER GEOJSON
# ==============================================================================
input_json_path = "data/bronze/comptage/sites/metropole-de-lyon_pvo_patrimoine_voirie.pvocomptagesite.json"
output_json_path = "data/bronze/comptage/sites/metropole_lyon_comptage_complet.json"

print(f"Chargement du fichier original : {input_json_path}")

with open(input_json_path, 'r', encoding='utf-8') as f:
    geojson_data = json.load(f)

original_count = len(geojson_data['features'])
print(f"Sites existants (Auto) : {original_count}")

# ==============================================================================
# 3. CRÉATION DES "FEATURES" POUR LES SITES MANUELS
# ==============================================================================
new_features = []

for row in manual_sites_list:
    # On construit la structure exacte attendue par le GeoJSON
    # IDs
    site_id_str = row['site_id_str']
    name = row['manual_site_name']
    lat = row['lat']
    lon = row['lon']
    
    feature = {
        "type": "Feature",
        "id": f"pvo_patrimoine_voirie.pvocomptagesite.{site_id_str}",
        "geometry": {
            "type": "Point",
            "coordinates": [lon, lat] # Attention : GeoJSON c'est [Longitude, Latitude]
        },
        "geometry_name": "the_geom",
        "properties": {
            "gid": site_id_str,              # On met l'ID ici aussi
            "site_id": site_id_str,          # L'ID critique (en String)
            "parent_site_id": "",
            "fr_insee_code": "69384",        # Code par défaut (ex: Lyon) ou vide
            "xlong": lon,
            "ylat": lat,
            "external_ids": "",
            "infrastructure_type": "Comptage Manuel", # Pour les distinguer
            "site_name": name
        },
        # bbox est optionnel mais présent dans votre fichier source
        "bbox": [lon, lat, lon, lat]
    }
    
    new_features.append(feature)

# ==============================================================================
# 4. FUSION ET SAUVEGARDE
# ==============================================================================

# On ajoute les nouveaux sites à la liste existante
geojson_data['features'].extend(new_features)

# Mise à jour des métadonnées (si présentes)
total_sites = len(geojson_data['features'])
geojson_data['totalFeatures'] = total_sites
geojson_data['numberMatched'] = total_sites
geojson_data['numberReturned'] = total_sites

# Sauvegarde du nouveau fichier
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(geojson_data, f, ensure_ascii=False, indent=None)

print("-" * 30)
print(f"Terminé !")
print(f"Anciens sites : {original_count}")
print(f"Nouveaux sites manuels : {len(new_features)}")
print(f"TOTAL : {total_sites}")
print(f"Fichier généré : {output_json_path}")

Lecture des données manuelles...
12 sites manuels récupérés avec succès.
Chargement du fichier original : data/bronze/comptage/sites/metropole-de-lyon_pvo_patrimoine_voirie.pvocomptagesite.json
Sites existants (Auto) : 280
------------------------------
Terminé !
Anciens sites : 280
Nouveaux sites manuels : 12
TOTAL : 292
Fichier généré : data/bronze/comptage/sites/metropole_lyon_comptage_complet.json
