## 1. Setup

In [6]:
import yaml
import os
import pandas as pd
from datetime import date, timedelta
import pyarrow 

# Load the project configuration.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of relying on the locale-dependent default of open().
with open("../config/config.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Silver output directory, read from the `paths.silver_dir` entry of the config.
silver_dir = config["paths"]["silver_dir"]
print(f"âœ“ Silver output: {silver_dir}")

âœ“ Silver output: data/silver


## 2. Créer les DataFrames Pandas

In [7]:
# Mock of the silver_amenagements table: three cycling facilities, described
# column by column (each list holds the values for AMEN_001, AMEN_002, AMEN_003).
# NOTE(review): every centroid_lat / centroid_lon pair equals the first vertex
# of the matching geom_wkt line (WKT order is "lon lat"), not its midpoint —
# confirm this is intended for the mock.
amenagement_columns = {
    "amenagement_id": ["AMEN_001", "AMEN_002", "AMEN_003"],
    "annee_livraison": [2020, 2021, 2019],
    "type_amenagement": ["Piste cyclable", "Bande cyclable", "Voie verte"],
    "environnement": ["Urbain", "PÃ©riurbain", "Urbain"],
    "longueur_m": [500.0, 300.0, 800.0],
    "geom_wkt": [
        "LINESTRING(4.835 45.764, 4.836 45.765)",
        "LINESTRING(4.840 45.770, 4.841 45.771)",
        "LINESTRING(4.850 45.750, 4.851 45.751)",
    ],
    "centroid_lat": [45.764, 45.770, 45.750],
    "centroid_lon": [4.835, 4.840, 4.850],
    "commune": ["Lyon", "Villeurbanne", "Lyon"],
}
df_amenagements = pd.DataFrame(amenagement_columns)

# Report the row count, then dump the whole (tiny) frame as plain text.
print(f"âœ“ Created silver_amenagements ({len(df_amenagements)} rows)")
print(df_amenagements)

âœ“ Created silver_amenagements (3 rows)
  amenagement_id  annee_livraison type_amenagement environnement  longueur_m  \
0       AMEN_001             2020   Piste cyclable        Urbain       500.0   
1       AMEN_002             2021   Bande cyclable    PÃ©riurbain       300.0   
2       AMEN_003             2019       Voie verte        Urbain       800.0   

                                 geom_wkt  centroid_lat  centroid_lon  \
0  LINESTRING(4.835 45.764, 4.836 45.765)        45.764         4.835   
1  LINESTRING(4.840 45.770, 4.841 45.771)        45.770         4.840   
2  LINESTRING(4.850 45.750, 4.851 45.751)        45.750         4.850   

        commune  
0          Lyon  
1  Villeurbanne  
2          Lyon  


In [8]:
# Mock of the silver_sites table: three counting sites.
# Intended layout, per the author's notes (approximate, not verified here):
#   SITE_001: about 100 m from AMEN_001
#   SITE_002: about 80 m from AMEN_002
#   SITE_003: more than 500 m from every facility (outside the buffer)
site_columns = ["site_id", "lat", "lon", "commune"]
site_rows = [
    ("SITE_001", 45.7648, 4.8358, "Lyon"),
    ("SITE_002", 45.7706, 4.8408, "Villeurbanne"),
    ("SITE_003", 45.780, 4.860, "Villeurbanne"),
]
df_sites = pd.DataFrame(site_rows, columns=site_columns)

# Report the row count, then dump the frame as plain text.
print(f"âœ“ Created silver_sites ({len(df_sites)} rows)")
print(df_sites)

âœ“ Created silver_sites (3 rows)
    site_id      lat     lon       commune
0  SITE_001  45.7648  4.8358          Lyon
1  SITE_002  45.7706  4.8408  Villeurbanne
2  SITE_003  45.7800  4.8600  Villeurbanne


In [9]:
# Mock of the silver_channels table: five channels spread over the three sites
# (four "velo" channels and one "voiture" channel).
channel_columns = ["channel_id", "site_id", "mode", "sens"]
channel_rows = [
    ("CHAN_001", "SITE_001", "velo", "Nord"),
    ("CHAN_002", "SITE_001", "velo", "Sud"),
    ("CHAN_003", "SITE_002", "velo", "Est"),
    ("CHAN_004", "SITE_003", "velo", "Ouest"),
    ("CHAN_005", "SITE_002", "voiture", "Nord"),
]
df_channels = pd.DataFrame(channel_rows, columns=channel_columns)

# Report the row count, then dump the frame as plain text.
print(f"âœ“ Created silver_channels ({len(df_channels)} rows)")
print(df_channels)

âœ“ Created silver_channels (5 rows)
  channel_id   site_id     mode   sens
0   CHAN_001  SITE_001     velo   Nord
1   CHAN_002  SITE_001     velo    Sud
2   CHAN_003  SITE_002     velo    Est
3   CHAN_004  SITE_003     velo  Ouest
4   CHAN_005  SITE_002  voiture   Nord


In [10]:
# Mock of the silver_measures table: 30 consecutive daily readings starting on
# 2023-06-01 for four channels, every reading flagged as valid.
base_date = date(2023, 6, 1)
measures_data = []

# (channel_id, flux on day 0, daily increment). Over days 0..29 this yields:
#   CHAN_001: 150..208    CHAN_002: 100..129
#   CHAN_003: 250..337    CHAN_004: 75..104
# Author's note on CHAN_004: outside the buffer, not expected to be used downstream.
flux_profiles = (
    ("CHAN_001", 150, 2),
    ("CHAN_002", 100, 1),
    ("CHAN_003", 250, 3),
    ("CHAN_004", 75, 1),
)

for day in range(30):
    current_date = base_date + timedelta(days=day)
    # One record per channel and per day, in the channel order listed above.
    for channel_id, start_flux, daily_step in flux_profiles:
        measures_data.append(
            {
                "channel_id": channel_id,
                "date": current_date,
                "flux": start_flux + day * daily_step,
                "is_valid": True,
            }
        )

df_measures = pd.DataFrame(measures_data)

# Report the row count and preview the first ten records.
print(f"âœ“ Created silver_measures ({len(df_measures)} rows)")
print(df_measures.head(10))

âœ“ Created silver_measures (120 rows)
  channel_id        date  flux  is_valid
0   CHAN_001  2023-06-01   150      True
1   CHAN_002  2023-06-01   100      True
2   CHAN_003  2023-06-01   250      True
3   CHAN_004  2023-06-01    75      True
4   CHAN_001  2023-06-02   152      True
5   CHAN_002  2023-06-02   101      True
6   CHAN_003  2023-06-02   253      True
7   CHAN_004  2023-06-02    76      True
8   CHAN_001  2023-06-03   154      True
9   CHAN_002  2023-06-03   102      True


## 3. Sauvegarder en Parquet avec Pandas

In [11]:
# The Silver directory is resolved one level above the current working
# directory; create it if it does not exist yet.
output_path = f"../{silver_dir}"
os.makedirs(output_path, exist_ok=True)

# Write each mock table to "<output_path>/<table name>.parquet", dropping the
# pandas index, and confirm every file as it is written.
silver_tables = {
    "silver_amenagements": df_amenagements,
    "silver_sites": df_sites,
    "silver_channels": df_channels,
    "silver_measures": df_measures,
}
for table_name, frame in silver_tables.items():
    frame.to_parquet(f"{output_path}/{table_name}.parquet", index=False)
    print(f"âœ“ Saved {table_name}.parquet")

print(f"\nðŸŽ‰ Toutes les donnÃ©es Silver sont sauvegardÃ©es en Parquet dans {output_path}/")

âœ“ Saved silver_amenagements.parquet
âœ“ Saved silver_sites.parquet
âœ“ Saved silver_channels.parquet
âœ“ Saved silver_measures.parquet

ðŸŽ‰ Toutes les donnÃ©es Silver sont sauvegardÃ©es en Parquet dans ../data/silver/


## 4. Vérification

In [12]:
# Read every Parquet file back from output_path and report its dimensions and
# column names, with a blank line between tables.
print("=== VÃ©rification des fichiers Parquet ===\n")
for table in ("silver_amenagements", "silver_sites", "silver_channels", "silver_measures"):
    df_test = pd.read_parquet(f"{output_path}/{table}.parquet")
    n_rows, n_cols = df_test.shape
    print(f"{table}: {n_rows} rows, {n_cols} columns")
    print(f"  Columns: {df_test.columns.tolist()}")
    print()

=== VÃ©rification des fichiers Parquet ===

silver_amenagements: 3 rows, 9 columns
  Columns: ['amenagement_id', 'annee_livraison', 'type_amenagement', 'environnement', 'longueur_m', 'geom_wkt', 'centroid_lat', 'centroid_lon', 'commune']

silver_sites: 3 rows, 4 columns
  Columns: ['site_id', 'lat', 'lon', 'commune']

silver_channels: 5 rows, 4 columns
  Columns: ['channel_id', 'site_id', 'mode', 'sens']

silver_measures: 120 rows, 4 columns
  Columns: ['channel_id', 'date', 'flux', 'is_valid']



## 5. Structure des Fichiers Générés

```
data/silver/
├── silver_amenagements.parquet
├── silver_sites.parquet
├── silver_channels.parquet
└── silver_measures.parquet
```

**Prochaine étape :** Exécuter `02_spatial_usage_parquet.ipynb` pour le traitement du Module 2 avec PySpark.