## 1. Setup & Configuration

In [4]:
import sys
import os
import yaml

# Windows compatibility fix for PySpark.
# When socketserver does not expose UnixStreamServer, alias it to TCPServer;
# an existing attribute is left untouched.
# NOTE(review): presumably the installed PySpark references
# socketserver.UnixStreamServer at import time — confirm it is still needed.
if sys.platform == "win32":
    import socketserver

    try:
        socketserver.UnixStreamServer
    except AttributeError:
        socketserver.UnixStreamServer = socketserver.TCPServer

from pyspark.sql import SparkSession

# Load configuration
# The path is relative to the kernel's working directory.
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale encoding (a legacy code page on Windows), which can mis-decode or
# reject non-ASCII characters in the YAML file.
with open("../../../../config/config.yml", encoding="utf-8") as f:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    config = yaml.safe_load(f)

# Silver-layer directory as declared under `paths.silver_dir` in the config
silver_dir = config["paths"]["silver_dir"]

print(f"‚úì Silver directory: {silver_dir}")

‚úì Silver directory: data/silver


In [5]:
# Initialize Spark session (Windows configuration)
import tempfile

# Spark scratch space goes to the system temporary directory
local_temp = tempfile.gettempdir()

# Driver and workers both use the interpreter running this kernel, and Spark
# is pointed at the same scratch directory through its environment variable.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
    SPARK_LOCAL_DIRS=local_temp,
)

spark = (
    SparkSession.builder
    .appName("CSV_to_Parquet")
    .master("local[*]")
    .config("spark.sql.adaptive.enabled", "false")
    # Keep all driver networking on the loopback interface
    .config("spark.driver.host", "localhost")
    .config("spark.driver.bindAddress", "localhost")
    .config("spark.ui.enabled", "false")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "2g")
    # Same directory as SPARK_LOCAL_DIRS above
    .config("spark.local.dir", local_temp)
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Only warnings and errors from Spark are logged from here on
spark.sparkContext.setLogLevel("WARN")

print(f"‚úì Spark version: {spark.version}")
print(f"‚úì Python: {sys.executable}")

‚úì Spark version: 3.5.3
‚úì Python: c:\Users\medma\Documents\2025-2026\Big Data\datathon_velomenaj\venv\Scripts\python.exe


## 2. Fonction de conversion

In [6]:
def convert_csv_to_parquet(csv_path, parquet_path, sep=";", schema=None):
    """
    Convert a CSV file to Parquet format using the notebook's `spark` session.

    The CSV is read with its first line as header, its row count and schema
    are printed, and the data is written to `parquet_path`, replacing any
    existing output at that location.

    Args:
        csv_path: Path to CSV file
        parquet_path: Output path for Parquet file
        sep: CSV separator (default: ";")
        schema: Optional explicit schema forwarded to `spark.read.csv`.
            When omitted (default), column types are inferred from the data,
            exactly as before. Pass a schema to keep code-like columns
            (identifiers, postal/INSEE codes) from being inferred as numbers.

    Returns:
        Tuple `(df, count)`: the DataFrame read from the CSV and its row count.
    """
    print(f"üìñ Reading CSV: {csv_path}")
    read_options = {"header": True, "sep": sep}
    if schema is None:
        # No schema supplied: fall back to type inference (original behaviour)
        read_options["inferSchema"] = True
    else:
        # A caller-supplied schema replaces inference entirely
        read_options["schema"] = schema
    df = spark.read.csv(csv_path, **read_options)

    # count() runs a Spark job; the value is also returned to the caller
    count = df.count()
    print(f"   ‚úì Loaded {count:,} rows, {len(df.columns)} columns")

    # Show schema
    print("   Schema:")
    for field in df.schema.fields:
        print(f"      - {field.name}: {field.dataType.simpleString()}")

    print(f"\nüíæ Writing Parquet: {parquet_path}")
    # "overwrite" replaces whatever already exists at parquet_path
    df.write.mode("overwrite").parquet(parquet_path)
    print("   ‚úì Saved successfully")

    return df, count

# Confirmation message: it prints only if the statements above in this cell ran without raising
print("‚úì Function ready")

‚úì Function ready


## 3. Conversion de silver_amenagements.csv

In [9]:
# Convert silver_amenagements.csv to Parquet
# NOTE(review): the input is resolved four directories up ("../../../../data/")
# while the output goes one directory up ("../<silver_dir>/"); confirm both
# relative roots are intended.
csv_path = "../../../../data/silver_amenagements.csv"
parquet_path = f"../{silver_dir}/silver_amenagements2"

# The DataFrame, its row count and parquet_path are reused by the preview and
# verification cells further down.
(df_amenagements, count) = convert_csv_to_parquet(
    csv_path=csv_path,
    parquet_path=parquet_path,
)

üìñ Reading CSV: ../../../../data/silver_amenagements.csv
   ‚úì Loaded 4,592 rows, 27 columns
   Schema:
      - nom: string
      - commune1: string
      - insee1: double
      - commune2: string
      - insee2: double
      - reseau: string
      - financementac: string
      - typeamenagement: string
      - typeamenagement2: string
      - positionnement: string
      - senscirculation: string
      - environnement: string
      - localisation: string
      - typologiepiste: string
      - revetementpiste: string
      - domanialite: string
      - reglementation: string
      - zonecirculationapaisee: string
      - anneelivraison: double
      - longueur: string
      - observation: string
      - validite: string
      - gid: int
      - centroid_lat: double
      - centroid_lon: double
      - is_mock: boolean
      - geocoded_at: timestamp

üíæ Writing Parquet: ../data/silver/silver_amenagements2
   ‚úì Saved successfully


In [10]:
# Preview the data: first 5 rows, each cell value cut at 50 characters
print("\nüìä Preview:")
df_amenagements.show(n=5, truncate=50)


üìä Preview:
+------------------------------------+----------------------+-------+--------+------+------------------+------------------+--------------------+----------------+-----------------------------+---------------+-------------------+------------+------------------+--------------------------------------------------+-----------+-------------------------+----------------------+--------------+--------+----------------------------------+-----------------------------------+----+------------+------------+-------+-------------------------+
|                                 nom|              commune1| insee1|commune2|insee2|            reseau|     financementac|     typeamenagement|typeamenagement2|               positionnement|senscirculation|      environnement|localisation|    typologiepiste|                                   revetementpiste|domanialite|           reglementation|zonecirculationapaisee|anneelivraison|longueur|                       observation|                       

## 4. Vérification du Parquet

In [11]:
# Re-read the Parquet to verify
df_verify = spark.read.parquet(parquet_path)

# Count once and reuse the value: each count() call launches its own Spark
# job, so calling it for both the message and the assertion scans the data twice.
verify_count = df_verify.count()

print(f"‚úì Parquet file loaded: {verify_count} rows")
print(f"‚úì Columns: {df_verify.columns}")

# Compare row counts against the count measured when the CSV was read
assert verify_count == count, "Row count mismatch!"
print("\nüéâ Verification passed!")

‚úì Parquet file loaded: 4592 rows
‚úì Columns: ['nom', 'commune1', 'insee1', 'commune2', 'insee2', 'reseau', 'financementac', 'typeamenagement', 'typeamenagement2', 'positionnement', 'senscirculation', 'environnement', 'localisation', 'typologiepiste', 'revetementpiste', 'domanialite', 'reglementation', 'zonecirculationapaisee', 'anneelivraison', 'longueur', 'observation', 'validite', 'gid', 'centroid_lat', 'centroid_lon', 'is_mock', 'geocoded_at']

üéâ Verification passed!


## 5. Conversion d'autres fichiers (optionnel)

Décommentez et modifiez selon vos besoins.

In [None]:
# === Convert other CSV files ===
# Each commented-out call below reads a CSV from the silver directory and
# writes the Parquet output under the same directory. Uncomment as needed.

# silver_sites
# convert_csv_to_parquet(
#     f"../{silver_dir}/silver_sites.csv",
#     f"../{silver_dir}/silver_sites"
# )

# silver_channels
# convert_csv_to_parquet(
#     f"../{silver_dir}/silver_channels.csv",
#     f"../{silver_dir}/silver_channels"
# )

# silver_measures
# convert_csv_to_parquet(
#     f"../{silver_dir}/silver_measures.csv",
#     f"../{silver_dir}/silver_measures"
# )

# Reminder shown when the cell runs with every call still commented out
print("‚ÑπÔ∏è  D√©commentez les lignes ci-dessus pour convertir d'autres fichiers")

## 6. Cleanup

In [None]:
# Stop the Spark session created earlier in this notebook; run this cell last
spark.stop()
print("‚úì Spark session stopped")