## 1. Setup

In [1]:
import yaml
import os
import sys

# Windows compatibility shim.
# On win32, if the stdlib socketserver module does not expose UnixStreamServer,
# alias that name to TCPServer so that later attribute lookups succeed.
# NOTE(review): presumably needed because pyspark references
# socketserver.UnixStreamServer when it is imported -- confirm. If so, this
# shim has to run before the pyspark imports that follow it.
if sys.platform == "win32":
    import socketserver
    if not hasattr(socketserver, 'UnixStreamServer'):
        # Patch only when the attribute is absent; an existing one is left alone.
        socketserver.UnixStreamServer = socketserver.TCPServer

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import tempfile

# Load the project configuration.
# The path is relative, so the kernel's working directory is assumed to sit
# one level below the project root -- TODO confirm for other launch setups.
# The encoding is pinned to UTF-8: without it, open() falls back to the locale
# encoding (typically a legacy code page on Windows), which would corrupt any
# non-ASCII value stored in the YAML file.
with open("../config/config.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Silver output directory, read from the paths.silver_dir entry of the config.
silver_dir = config["paths"]["silver_dir"]
print(f"âœ“ Silver output: {silver_dir}")

âœ“ Silver output: data/silver


In [2]:
# Initialize Spark
# Point both the driver and the Python workers at the interpreter that runs
# this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)
# System temp directory, handed to Spark through spark.local.dir below.
local_temp = tempfile.gettempdir()

spark = (
    SparkSession.builder
    .appName("GenerateMockParquet")
    .master("local[*]")
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.enabled", "false")
    .config("spark.sql.execution.arrow.pyspark.enabled", "false")
    .config("spark.local.dir", local_temp)
    .getOrCreate()
)

# Keep the JVM logs quiet, then report the runtime actually in use.
spark.sparkContext.setLogLevel("WARN")
print(f"âœ“ Spark version: {spark.version}")
print(f"âœ“ HADOOP_HOME: {os.environ.get('HADOOP_HOME', 'Not set')}")

âœ“ Spark version: 3.5.3
âœ“ HADOOP_HOME: C:\hadoop


## 2. Définir les Schémas (selon schemas.md)

In [3]:
# Schemas of the four Silver tables, built incrementally with StructType.add().
# A column is nullable unless nullable=False is passed explicitly.

# silver_amenagements
schema_amenagements = (
    StructType()
    .add("amenagement_id", StringType(), nullable=False)
    .add("annee_livraison", IntegerType())
    .add("type_amenagement", StringType())
    .add("environnement", StringType())
    .add("longueur_m", FloatType())
    .add("geom_wkt", StringType())
    .add("centroid_lat", FloatType())
    .add("centroid_lon", FloatType())
    .add("commune", StringType())
)

# silver_sites
schema_sites = (
    StructType()
    .add("site_id", StringType(), nullable=False)
    .add("lat", FloatType())
    .add("lon", FloatType())
    .add("commune", StringType())
)

# silver_channels
schema_channels = (
    StructType()
    .add("channel_id", StringType(), nullable=False)
    .add("site_id", StringType())
    .add("mode", StringType())
    .add("sens", StringType())
)

# silver_measures (every column nullable, channel_id included)
schema_measures = (
    StructType()
    .add("channel_id", StringType())
    .add("date", DateType())
    .add("flux", IntegerType())
    .add("is_valid", BooleanType())
)

print("âœ“ Schemas dÃ©finis selon schemas.md")

âœ“ Schemas dÃ©finis selon schemas.md


## 3. Créer les Données Mock

In [4]:
# Mock silver_amenagements (3 infrastructures)
# Note: sites are meant to sit ~50-150 m away from the amenagements so that the
# buffer logic gets exercised.
# Each tuple follows the column order of schema_amenagements:
# (amenagement_id, annee_livraison, type_amenagement, environnement,
#  longueur_m, geom_wkt, centroid_lat, centroid_lon, commune).
# The WKT coordinates are written "lon lat".
# NOTE(review): centroid_lat/centroid_lon equal the first vertex of each
# LINESTRING rather than its midpoint -- confirm this is intended for the mock.
data_amenagements = [
    ("AMEN_001", 2020, "Piste cyclable", "Urbain", 500.0, 
     "LINESTRING(4.835 45.764, 4.836 45.765)", 45.764, 4.835, "Lyon"),
    ("AMEN_002", 2021, "Bande cyclable", "PÃ©riurbain", 300.0,
     "LINESTRING(4.840 45.770, 4.841 45.771)", 45.770, 4.840, "Villeurbanne"),
    ("AMEN_003", 2019, "Voie verte", "Urbain", 800.0,
     "LINESTRING(4.850 45.750, 4.851 45.751)", 45.750, 4.850, "Lyon"),
]

# Build the DataFrame with the explicit schema instead of relying on inference.
df_amenagements = spark.createDataFrame(data_amenagements, schema_amenagements)
print(f"âœ“ Created silver_amenagements ({len(data_amenagements)} rows)")

âœ“ Created silver_amenagements (3 rows)


In [5]:
# Mock silver_sites (3 counters)
# Each tuple follows the column order of schema_sites: (site_id, lat, lon, commune).
# Intended placement, as stated by the original author:
#   SITE_001: ~100 m from AMEN_001
#   SITE_002: ~80 m from AMEN_002
#   SITE_003: >500 m from everything (outside the buffer)
# NOTE(review): SITE_001's coordinates appear to fall on AMEN_001's LINESTRING
# segment (4.835 45.764 -> 4.836 45.765), so "~100 m" only holds relative to
# the centroid_lat/centroid_lon columns -- confirm which distance the buffer
# test is supposed to use.
data_sites = [
    ("SITE_001", 45.7648, 4.8358, "Lyon"),       # ~100 m from AMEN_001
    ("SITE_002", 45.7706, 4.8408, "Villeurbanne"),  # ~80 m from AMEN_002
    ("SITE_003", 45.780, 4.860, "Villeurbanne"),    # far from everything
]

# Build the DataFrame with the explicit schema instead of relying on inference.
df_sites = spark.createDataFrame(data_sites, schema_sites)
print(f"âœ“ Created silver_sites ({len(data_sites)} rows)")

âœ“ Created silver_sites (3 rows)


In [6]:
# Mock silver_channels (5 channels)
# Each tuple follows the column order of schema_channels:
# (channel_id, site_id, mode, sens).
# SITE_001 and SITE_002 carry two channels each, SITE_003 carries one.
data_channels = [
    ("CHAN_001", "SITE_001", "velo", "Nord"),
    ("CHAN_002", "SITE_001", "velo", "Sud"),
    ("CHAN_003", "SITE_002", "velo", "Est"),
    ("CHAN_004", "SITE_003", "velo", "Ouest"),
    ("CHAN_005", "SITE_002", "voiture", "Nord"),  # not a bike channel; expected to be filtered out downstream
]

# Build the DataFrame with the explicit schema instead of relying on inference.
df_channels = spark.createDataFrame(data_channels, schema_channels)
print(f"âœ“ Created silver_channels ({len(data_channels)} rows)")

âœ“ Created silver_channels (5 rows)


In [7]:
# Mock silver_measures: 30 consecutive days starting at base_date, with one
# row per day for each of the four bike channels.
from datetime import date, timedelta

base_date = date(2023, 6, 1)
data_measures = []

for day in range(30):
    current_date = base_date + timedelta(days=day)
    # Flux grows linearly with the day index (0..29); every row is flagged valid.
    data_measures.extend([
        ("CHAN_001", current_date, 150 + day * 2, True),  # 150 -> 208 bikes/day
        ("CHAN_002", current_date, 100 + day, True),      # 100 -> 129 bikes/day
        ("CHAN_003", current_date, 250 + day * 3, True),  # 250 -> 337 bikes/day
        ("CHAN_004", current_date, 75 + day, True),       # 75 -> 104 bikes/day (site outside the buffer, expected to be unused)
    ])

df_measures = spark.createDataFrame(data_measures, schema_measures)
print(f"âœ“ Created silver_measures ({len(data_measures)} rows)")

âœ“ Created silver_measures (120 rows)


## 4. Sauvegarder en Parquet

In [8]:
# Make sure the Silver output directory exists. The path is built relative to
# the notebook's working directory from the configured silver_dir.
output_path = f"../{silver_dir}"
os.makedirs(output_path, exist_ok=True)

# Write every table to its own Parquet directory under output_path.
# mode("overwrite") replaces the result of a previous run, which keeps this
# cell safe to re-execute. The tables are written in the listed order and the
# confirmation line is printed only after the corresponding write returns.
tables_to_save = [
    ("silver_amenagements", df_amenagements),
    ("silver_sites", df_sites),
    ("silver_channels", df_channels),
    ("silver_measures", df_measures),
]
for table_name, table_df in tables_to_save:
    table_df.write.mode("overwrite").parquet(f"{output_path}/{table_name}")
    print(f"âœ“ Saved {table_name}.parquet")

print(f"\nðŸŽ‰ Toutes les donnÃ©es Silver sont sauvegardÃ©es en Parquet dans {output_path}/")

Py4JJavaError: An error occurred while calling o93.parquet.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 6 in stage 0.0 failed 1 times, most recent failure: Lost task 6.0 in stage 0.0 (TID 6) (10.42.235.89 executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:385)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:210)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 27 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:107)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(origin.scala:76)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:461)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:32)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:437)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:85)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:83)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:142)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:869)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:391)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:364)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:243)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:802)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:789)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeTask(FileFormatWriter.scala:385)
	at org.apache.spark.sql.execution.datasources.WriteFilesExec.$anonfun$doExecuteWrite$1(WriteFiles.scala:100)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:210)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:774)
	... 27 more


In [None]:
# Read every saved table back from disk and report its dimensions
print("=== VÃ©rification des fichiers Parquet ===")
saved_tables = ("silver_amenagements", "silver_sites", "silver_channels", "silver_measures")
for table in saved_tables:
    table_df = spark.read.parquet(f"{output_path}/{table}")
    row_count = table_df.count()
    column_count = len(table_df.columns)
    print(f"{table}: {row_count} rows, {column_count} columns")

In [None]:
# Stop the Spark session created earlier in this notebook, then confirm it.
# Cells that use `spark` need the initialization cell to be re-run afterwards.
spark.stop()
print("âœ“ Spark session stopped")

## 5. Structure des Fichiers Générés

```
data/silver/
├── silver_amenagements/
│   └── *.parquet
├── silver_sites/
│   └── *.parquet
├── silver_channels/
│   └── *.parquet
└── silver_measures/
    └── *.parquet
```

**Prochaine étape :** Exécuter `02_spatial_usage_parquet.ipynb` pour le traitement Module 2