## 1. Setup & Configuration

In [1]:
import yaml
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from math import radians, sin, cos, sqrt, atan2
import os

# Load the pipeline configuration. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's default locale encoding.
with open("../config/config.yml", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Extract the parameters this notebook uses
buffer_m = config["params"]["buffer_m"]           # upper bound applied to distance_m in the spatial join
bike_mode = config["filters"]["bike_mode_value"]  # value of the channels' "mode" column that is kept
silver_dir = config["paths"]["silver_dir"]        # output folder, resolved relative to ".." when saving
gold_dir = config["paths"]["gold_dir"]            # output folder, resolved relative to ".." when saving

print(f"✓ Buffer distance: {buffer_m}m")
print(f"✓ Bike mode filter: {bike_mode}")
print(f"✓ Silver data: {silver_dir}")
print(f"✓ Gold output: {gold_dir}")

âœ“ Buffer distance: 200m
âœ“ Bike mode filter: velo
âœ“ Silver data: data/silver
âœ“ Gold output: data/gold


## 2. Create Mock Silver Data

Using pandas DataFrames (same schema as PySpark version)

In [4]:
# Mock silver_amenagements (infrastructure): one row per amenagement, with its
# WKT line geometry and a precomputed centroid (centroid_lat / centroid_lon).
mock_amenagements = pd.DataFrame([
    {"amenagement_id": "AMEN_001", "annee_livraison": 2020, "type_amenagement": "Piste cyclable",
     "environnement": "Urbain", "longueur_m": 500.0,
     "geom_wkt": "LINESTRING(4.835 45.764, 4.836 45.765)",
     "centroid_lat": 45.764, "centroid_lon": 4.835, "commune": "Lyon"},
    {"amenagement_id": "AMEN_002", "annee_livraison": 2021, "type_amenagement": "Bande cyclable",
     "environnement": "Périurbain", "longueur_m": 300.0,
     "geom_wkt": "LINESTRING(4.840 45.770, 4.841 45.771)",
     "centroid_lat": 45.770, "centroid_lon": 4.840, "commune": "Villeurbanne"},
    {"amenagement_id": "AMEN_003", "annee_livraison": 2019, "type_amenagement": "Voie verte",
     "environnement": "Urbain", "longueur_m": 800.0,
     "geom_wkt": "LINESTRING(4.850 45.750, 4.851 45.751)",
     "centroid_lat": 45.750, "centroid_lon": 4.850, "commune": "Lyon"},
])

print(f"✓ Created mock silver_amenagements ({len(mock_amenagements)} rows)")
print(mock_amenagements)

âœ“ Created mock silver_amenagements (3 rows)
  amenagement_id  annee_livraison type_amenagement environnement  longueur_m  \
0       AMEN_001             2020   Piste cyclable        Urbain       500.0   
1       AMEN_002             2021   Bande cyclable    PÃ©riurbain       300.0   
2       AMEN_003             2019       Voie verte        Urbain       800.0   

                                 geom_wkt  centroid_lat  centroid_lon  \
0  LINESTRING(4.835 45.764, 4.836 45.765)        45.764         4.835   
1  LINESTRING(4.840 45.770, 4.841 45.771)        45.770         4.840   
2  LINESTRING(4.850 45.750, 4.851 45.751)        45.750         4.850   

        commune  
0          Lyon  
1  Villeurbanne  
2          Lyon  


In [5]:
# Mock silver_sites (counter locations). The first two sites sit exactly on the
# centroids of AMEN_001 and AMEN_002; the third matches no centroid.
mock_sites = pd.DataFrame([
    {"site_id": "SITE_001", "lat": 45.764, "lon": 4.835, "commune": "Lyon"},  # Near AMEN_001
    {"site_id": "SITE_002", "lat": 45.770, "lon": 4.840, "commune": "Villeurbanne"},  # Near AMEN_002
    {"site_id": "SITE_003", "lat": 45.780, "lon": 4.860, "commune": "Villeurbanne"},  # Far from all
])

print(f"✓ Created mock silver_sites ({len(mock_sites)} rows)")
print(mock_sites)

âœ“ Created mock silver_sites (3 rows)
    site_id     lat    lon       commune
0  SITE_001  45.764  4.835          Lyon
1  SITE_002  45.770  4.840  Villeurbanne
2  SITE_003  45.780  4.860  Villeurbanne


In [6]:
# Mock silver_channels (counter channels). Each channel belongs to one site;
# CHAN_005 is deliberately a non-bike channel so the mode filter has something to drop.
mock_channels = pd.DataFrame([
    {"channel_id": "CHAN_001", "site_id": "SITE_001", "mode": "velo", "sens": "Nord"},
    {"channel_id": "CHAN_002", "site_id": "SITE_001", "mode": "velo", "sens": "Sud"},
    {"channel_id": "CHAN_003", "site_id": "SITE_002", "mode": "velo", "sens": "Est"},
    {"channel_id": "CHAN_004", "site_id": "SITE_003", "mode": "velo", "sens": "Ouest"},
    {"channel_id": "CHAN_005", "site_id": "SITE_002", "mode": "voiture", "sens": "Nord"},  # Non-bike
])

print(f"✓ Created mock silver_channels ({len(mock_channels)} rows)")
print(mock_channels)

âœ“ Created mock silver_channels (5 rows)
  channel_id   site_id     mode   sens
0   CHAN_001  SITE_001     velo   Nord
1   CHAN_002  SITE_001     velo    Sud
2   CHAN_003  SITE_002     velo    Est
3   CHAN_004  SITE_003     velo  Ouest
4   CHAN_005  SITE_002  voiture   Nord


In [7]:
# Mock silver_measures (time-series counts): 30 consecutive days starting
# 2023-06-01, one row per (day, channel), all flagged valid.
base_date = datetime(2023, 6, 1)

# Linear ramp per channel: flux = start + daily_step * day, for day = 0..29.
# CHAN_005 (the non-bike channel) intentionally gets no measures.
channel_ramps = {
    "CHAN_001": (150, 2),  # 150 -> 208 bikes/day
    "CHAN_002": (100, 1),  # 100 -> 129 bikes/day
    "CHAN_003": (250, 3),  # 250 -> 337 bikes/day
    "CHAN_004": (75, 1),   # 75 -> 104 bikes/day
}

# Rows are emitted day by day, channels in the order declared above.
mock_data = [
    {
        "channel_id": channel_id,
        "date": (base_date + timedelta(days=day)).date(),
        "flux": start + daily_step * day,
        "is_valid": True,
    }
    for day in range(30)
    for channel_id, (start, daily_step) in channel_ramps.items()
]

mock_measures = pd.DataFrame(mock_data)

print(f"✓ Created mock silver_measures ({len(mock_measures)} rows)")
print(mock_measures.groupby("channel_id").agg({"flux": ["count", "sum"]}))

âœ“ Created mock silver_measures (120 rows)
            flux      
           count   sum
channel_id            
CHAN_001      30  5370
CHAN_002      30  3435
CHAN_003      30  8805
CHAN_004      30  2685


## 3. Spatial Join: Link Counters to Infrastructure

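Use the Haversine formula to compute the distance between each infrastructure centroid and each counter site, then keep the pairs that fall within the configured buffer distance.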

In [8]:
def haversine_distance(lat1, lon1, lat2, lon2):
    """Great-circle distance in meters between two points given in decimal degrees.

    Applies the haversine formula on a sphere of radius 6,371,000 m. Each
    argument may be a scalar or an array-like (list, NumPy array, pandas
    Series); array-likes are combined element-wise using NumPy broadcasting.

    Args:
        lat1, lon1: Latitude and longitude of the first point(s), in degrees.
        lat2, lon2: Latitude and longitude of the second point(s), in degrees.

    Returns:
        The distance(s) in meters: a float for scalar inputs, otherwise an
        array-like with one distance per input element.
    """
    R = 6371000  # Earth radius in meters

    lat1_rad, lon1_rad = np.radians(lat1), np.radians(lon1)
    lat2_rad, lon2_rad = np.radians(lat2), np.radians(lon2)

    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) invalid; clamp before the roots.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c

# Cross join amenagements and sites: every amenagement is paired with every site.
# NOTE(review): the distance below is measured from the amenagement centroid,
# not from the geom_wkt line — confirm this approximation is acceptable for
# long segments.
cross_join = mock_amenagements[["amenagement_id", "centroid_lat", "centroid_lon"]].merge(
    mock_sites[["site_id", "lat", "lon"]],
    how="cross"
)

# Centroid-to-site distance in meters. The column is built from zipped
# coordinate columns as an explicit float Series rather than a row-wise
# DataFrame.apply, so the result is a well-typed (possibly empty) column even
# when there are no amenagement-site pairs, and no per-row Series is created.
cross_join["distance_m"] = pd.Series(
    [
        haversine_distance(c_lat, c_lon, s_lat, s_lon)
        for c_lat, c_lon, s_lat, s_lon in zip(
            cross_join["centroid_lat"],
            cross_join["centroid_lon"],
            cross_join["lat"],
            cross_join["lon"],
        )
    ],
    index=cross_join.index,
    dtype="float64",
)

# Keep only the pairs whose distance does not exceed the configured buffer
df_within_buffer = cross_join[cross_join["distance_m"] <= buffer_m].copy()

print(f"✓ Found {len(df_within_buffer)} amenagement-site pairs within {buffer_m}m")
print(df_within_buffer)

âœ“ Found 2 amenagement-site pairs within 200m
  amenagement_id  centroid_lat  centroid_lon   site_id     lat    lon  \
0       AMEN_001        45.764         4.835  SITE_001  45.764  4.835   
4       AMEN_002        45.770         4.840  SITE_002  45.770  4.840   

   distance_m  
0         0.0  
4         0.0  


In [9]:
# Attach every channel of each in-buffer site (inner join on site_id), then
# keep only the channels whose mode equals the configured bike mode value.
df_amen_channels = df_within_buffer.merge(
    mock_channels,
    on="site_id",
    how="inner"
)
df_amen_channels = df_amen_channels[df_amen_channels["mode"] == bike_mode]

print(f"✓ Found {len(df_amen_channels)} amenagement-channel links (bike mode only)")
print(df_amen_channels)

âœ“ Found 3 amenagement-channel links (bike mode only)
  amenagement_id  centroid_lat  centroid_lon   site_id     lat    lon  \
0       AMEN_001        45.764         4.835  SITE_001  45.764  4.835   
1       AMEN_001        45.764         4.835  SITE_001  45.764  4.835   
2       AMEN_002        45.770         4.840  SITE_002  45.770  4.840   

   distance_m channel_id  mode  sens  
0         0.0   CHAN_001  velo  Nord  
1         0.0   CHAN_002  velo   Sud  
2         0.0   CHAN_003  velo   Est  


In [10]:
# Create gold_link_amenagement_channel: the link table reduced to its four
# output columns, with exact duplicate rows removed.
gold_link = df_amen_channels[["amenagement_id", "channel_id", "site_id", "distance_m"]].drop_duplicates()

print(f"✓ Created gold_link_amenagement_channel ({len(gold_link)} rows)")
print(gold_link)

âœ“ Created gold_link_amenagement_channel (3 rows)
  amenagement_id channel_id   site_id  distance_m
0       AMEN_001   CHAN_001  SITE_001         0.0
1       AMEN_001   CHAN_002  SITE_001         0.0
2       AMEN_002   CHAN_003  SITE_002         0.0


## 4. Aggregate Daily Flows per Infrastructure

In [17]:
# Join links with measures: only rows flagged is_valid are used, and the inner
# join drops channels that have no link (and links that have no measures).
df_flows = gold_link.merge(
    mock_measures[mock_measures["is_valid"]],
    on="channel_id",
    how="inner"
)

print(f"✓ Joined {len(df_flows)} measure records")
print(df_flows[["amenagement_id", "channel_id", "date", "flux"]].head(10))

âœ“ Joined 90 measure records
  amenagement_id channel_id        date  flux
0       AMEN_001   CHAN_001  2023-06-01   150
1       AMEN_001   CHAN_001  2023-06-02   152
2       AMEN_001   CHAN_001  2023-06-03   154
3       AMEN_001   CHAN_001  2023-06-04   156
4       AMEN_001   CHAN_001  2023-06-05   158
5       AMEN_001   CHAN_001  2023-06-06   160
6       AMEN_001   CHAN_001  2023-06-07   162
7       AMEN_001   CHAN_001  2023-06-08   164
8       AMEN_001   CHAN_001  2023-06-09   166
9       AMEN_001   CHAN_001  2023-06-10   168


In [18]:
# Aggregate by amenagement_id and date:
#   flux_estime = sum of flux over all linked channels for that day
#   n_channels  = number of distinct channels contributing to that sum
gold_flow_daily = df_flows.groupby(["amenagement_id", "date"]).agg(
    flux_estime=("flux", "sum"),
    n_channels=("channel_id", "nunique")
).reset_index().sort_values(["amenagement_id", "date"])

print(f"✓ Created gold_flow_amenagement_daily ({len(gold_flow_daily)} rows)")
print(gold_flow_daily.head(20))

âœ“ Created gold_flow_amenagement_daily (60 rows)
   amenagement_id        date  flux_estime  n_channels
0        AMEN_001  2023-06-01          250           2
1        AMEN_001  2023-06-02          253           2
2        AMEN_001  2023-06-03          256           2
3        AMEN_001  2023-06-04          259           2
4        AMEN_001  2023-06-05          262           2
5        AMEN_001  2023-06-06          265           2
6        AMEN_001  2023-06-07          268           2
7        AMEN_001  2023-06-08          271           2
8        AMEN_001  2023-06-09          274           2
9        AMEN_001  2023-06-10          277           2
10       AMEN_001  2023-06-11          280           2
11       AMEN_001  2023-06-12          283           2
12       AMEN_001  2023-06-13          286           2
13       AMEN_001  2023-06-14          289           2
14       AMEN_001  2023-06-15          292           2
15       AMEN_001  2023-06-16          295           2
16       AMEN_0

## 5. Data Quality Checks

In [None]:
# Check 1: No duplicate amenagement-channel links
duplicates = gold_link.groupby(["amenagement_id", "channel_id"]).size()
assert (duplicates == 1).all(), "FAILED: Found duplicate amenagement-channel links"
print("✓ No duplicate links")

# Check 2 (informational, no assertion): how many amenagements have flow data.
# Amenagements without any linked channel do not appear in gold_flow_daily.
amen_with_data = gold_flow_daily["amenagement_id"].nunique()
print(f"✓ {amen_with_data} amenagements with flow data")

# Check 3: flux_estime should be non-negative
assert (gold_flow_daily["flux_estime"] >= 0).all(), "FAILED: Found negative flux values"
print("✓ All flux values are non-negative")

# Check 4: n_channels should be >= 1
assert (gold_flow_daily["n_channels"] >= 1).all(), "FAILED: Found days with 0 channels"
print("✓ All days have at least 1 channel")

print("\n All quality checks passed!")

âœ“ No duplicate links
âœ“ 2 amenagements with flow data
âœ“ All flux values are non-negative
âœ“ All days have at least 1 channel

ðŸŽ‰ All quality checks passed!


## 6. Summary Statistics

In [20]:
# Per-amenagement roll-up of the daily flow table, highest average flow first.
summary = gold_flow_daily.groupby("amenagement_id").agg(
    total_days=pd.NamedAgg(column="date", aggfunc="count"),            # days with a flow row
    total_flux=pd.NamedAgg(column="flux_estime", aggfunc="sum"),       # sum of daily estimates
    avg_daily_flux=pd.NamedAgg(column="flux_estime", aggfunc="mean"),  # mean daily estimate
    max_channels=pd.NamedAgg(column="n_channels", aggfunc="max"),      # largest daily channel count
)
summary = summary.reset_index()
summary = summary.sort_values("avg_daily_flux", ascending=False)

print("Summary by infrastructure:")
print(summary)

Summary by infrastructure:
  amenagement_id  total_days  total_flux  avg_daily_flux  max_channels
0       AMEN_001          30        8805           293.5             2
1       AMEN_002          30        8805           293.5             1


In [21]:
# Preview the first ten daily rows belonging to AMEN_001
print("\nDaily flows for AMEN_001:")
is_amen_001 = gold_flow_daily["amenagement_id"] == "AMEN_001"
print(gold_flow_daily[is_amen_001].head(10))


Daily flows for AMEN_001:
  amenagement_id        date  flux_estime  n_channels
0       AMEN_001  2023-06-01          250           2
1       AMEN_001  2023-06-02          253           2
2       AMEN_001  2023-06-03          256           2
3       AMEN_001  2023-06-04          259           2
4       AMEN_001  2023-06-05          262           2
5       AMEN_001  2023-06-06          265           2
6       AMEN_001  2023-06-07          268           2
7       AMEN_001  2023-06-08          271           2
8       AMEN_001  2023-06-09          274           2
9       AMEN_001  2023-06-10          277           2


## 7. Save Outputs (CSV format)

In [22]:
# Resolve the output folders relative to the parent of the working directory
silver_out = os.path.join("..", silver_dir)
gold_out = os.path.join("..", gold_dir)

# Create output directories (no error if they already exist)
os.makedirs(silver_out, exist_ok=True)
os.makedirs(gold_out, exist_ok=True)

# Save Silver mock data
# NOTE(review): this writes MOCK tables into the configured silver directory and
# overwrites files of the same name — confirm that is intended once real
# Silver data is delivered there.
mock_amenagements.to_csv(os.path.join(silver_out, "silver_amenagements.csv"), index=False)
mock_sites.to_csv(os.path.join(silver_out, "silver_sites.csv"), index=False)
mock_channels.to_csv(os.path.join(silver_out, "silver_channels.csv"), index=False)
mock_measures.to_csv(os.path.join(silver_out, "silver_measures.csv"), index=False)

# Save Gold outputs
gold_link.to_csv(os.path.join(gold_out, "gold_link_amenagement_channel.csv"), index=False)
gold_flow_daily.to_csv(os.path.join(gold_out, "gold_flow_amenagement_daily.csv"), index=False)

print("✓ All files saved to CSV format")
print(f"   Silver: {silver_dir}/")
print(f"   Gold: {gold_dir}/")

âœ“ All files saved to CSV format
   Silver: data/silver/
   Gold: data/gold/


## 8. Next Steps

✅ **Module 2 Logic Complete** — All spatial joins and aggregations work!

**Migration to PySpark (when needed):**
```python
# Pandas → PySpark conversion is straightforward:
df_spark = spark.createDataFrame(df_pandas)

# Most operations map directly:
# pandas: df.merge()        → PySpark: df.join()
# pandas: df.groupby()      → PySpark: df.groupBy()
# pandas: df.apply()        → PySpark: @udf or built-in functions
```

**When Module 1 delivers real data:**
1. If data is small (<1GB): Continue with pandas
2. If data is large: Convert to PySpark using the logic you've developed here
3. Run on Linux or properly configured Spark environment

**Share with colleagues:**
- The [docs/module2_requirements.md](../docs/module2_requirements.md) specifies your data needs
- The CSV outputs show expected schema and sample data