"""
01_prepare_fsa.py
Teacher-style, lab-friendly script to prepare FSA polygons for RentAtlas.

What it does:
- reads the FSA polygon source from ../data/raw/fsa/Montreal_fsa.geojson (path relative to the working directory)
- loads it into a GeoDataFrame
- ensures a consistent CRS: reprojects to WGS84 (EPSG:4326) for the web (change target_crs if your lab prefers e.g. MTM8 for processing)
- writes ../data/processed/fsa_master.geojson (EPSG:4326) for the map to use

Run in your venv or convert the cells to a notebook.
"""


In [9]:
import pyproj
import sys
from pathlib import Path

In [None]:
import os
import glob
import geopandas as gpd

In [None]:
# Prepare the FSA polygons: load the source file, normalise its CRS, repair
# invalid geometries and export a GeoJSON for the web map.
# Both paths below are relative to the current working directory.
shapefile_path = Path("../data/raw/fsa/Montreal_fsa.geojson")

# Stop early with a readable message instead of a driver traceback.
if not shapefile_path.exists():
    print(f"ERROR: {shapefile_path} not found. Check path and try again.")
    print(f"Current working directory: {Path.cwd()}")
    sys.exit(1)

# 1. Read file (GeoPandas uses Fiona under the hood)
gdf = gpd.read_file(shapefile_path)

# 2. Quick inspection (safe, cheap)
print("CRS:", gdf.crs)
print("Shape:", gdf.shape)
print("Columns:", list(gdf.columns))
print(gdf.head())

# 3. Ensure a consistent CRS for your project (example: use EPSG:4326 or whatever your lab uses)
# Check lab notebooks for preferred CRS. For web/Leaflet, GeoJSON in WGS84 (EPSG:4326) is common.
target_crs = "EPSG:4326"
if gdf.crs is None:
    # Without a source CRS there is nothing to reproject from; the data is
    # exported with its coordinates untouched, so the result is only correct
    # if the input already was in the target CRS.
    print("Input has no CRS — please confirm what CRS this file should be in.")
elif gdf.crs.to_string() != target_crs:
    print(f"Reprojecting from {gdf.crs} to {target_crs} ...")
    gdf = gdf.to_crs(target_crs)

# 4. Geometry validity check and repair
# Only geometries that fail the validity check are rewritten with the
# buffer(0) trick; valid ones are left exactly as read. Missing geometries
# are excluded because is_valid reports them as not valid but there is
# nothing to repair.
invalid = ~gdf["geometry"].is_valid & gdf["geometry"].notna()
if invalid.any():
    print(f"Repairing {int(invalid.sum())} invalid geometries with buffer(0) ...")
    gdf.loc[invalid, "geometry"] = gdf.loc[invalid, "geometry"].buffer(0)

# 5. Export a processed GeoJSON for the web (trim columns if you want)
out_dir = Path("../data/processed")
out_dir.mkdir(parents=True, exist_ok=True)

# Keep only the columns you need for the web (reduce size).
# If there is no "FSA" column, every column is kept, with geometry moved last.
# NOTE(review): the sample input's code column appears to be "CFSAUID", so the
# trimming branch may never fire for this file — confirm which column the map needs.
if "FSA" in gdf.columns:
    keep_cols = ["FSA", "geometry"]
else:
    keep_cols = [c for c in gdf.columns if c != "geometry"] + ["geometry"]
# keep_cols is built from columns that exist, so no exception guard is needed;
# a failure here would be a real bug and should surface.
gdf = gdf[keep_cols]

out_file = out_dir / "fsa_master.geojson"
gdf.to_file(out_file, driver="GeoJSON")
print("Wrote:", out_file)


CRS: EPSG:4326
Shape: (98, 4)
Columns: ['CFSAUID', 'PRUID', 'PRNAME', 'geometry']
  CFSAUID PRUID           PRNAME  \
0     H0M    24  Quebec / Québec   
1     H1A    24  Quebec / Québec   
2     H3Z    24  Quebec / Québec   
3     H3Y    24  Quebec / Québec   
4     H1B    24  Quebec / Québec   

                                            geometry  
0  MULTIPOLYGON (((-74.5196 45.03463, -74.51939 4...  
1  MULTIPOLYGON (((-73.47668 45.70214, -73.47722 ...  
2  MULTIPOLYGON (((-73.58887 45.49044, -73.58634 ...  
3  MULTIPOLYGON (((-73.60089 45.49426, -73.60104 ...  
4  MULTIPOLYGON (((-73.50219 45.65192, -73.50231 ...  
Wrote: ../data/processed/fsa_master.geojson


In [17]:
# Print the first 10 lines of fsa_master.geojson to confirm the export
!head -n 10 ../data/processed/fsa_master.geojson

261474.07s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


{
"type": "FeatureCollection",
"name": "fsa_master",
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } },
"features": [
{ "type": "Feature", "properties": { "CFSAUID": "H1A", "PRUID": "24", "PRNAME": "Quebec / Québec", "FSA_CODE": "H1A", "crime_count": 4194.0, "parks_count": 93, "crime_score": 0.57781881604596286, "parks_score": 0.59619825136241444, "final_score": 0.43702288028484809, "population": null, "crime_rate_per_1000": null, "fsa_area_km2": 31.204847785552872, "parks_area_km2": 6.1172920382056706, "parks_area_per_sqkm": 0.19603659278344041, "transit_route_km": 273.59767762060073, "transit_km_per_sqkm": 8.7677940139567099, "transit_score": 0.13705157344616709, "parks_mean_distance_m": 330.06241548447326, "parks_area_score": 0.26873271442096724, "parks_prox_score": 0.92366378830386175 }, "geometry": { "type": "MultiPolygon", "coordinates": [ [ [ [ -73.476682328669852, 45.702144818881642 ], [ -73.477220069535363, 45.70173795851872 ], [ -73.47735336