# In [1]:
import os
import csv
import logging
import ee
import urllib.request
import numpy as np
import rasterio
import shutil
from pathlib import Path
from PIL import Image
import yaml
from dotenv import load_dotenv
from ee import ServiceAccountCredentials
# Pull variables from a local .env file into the process environment before
# any of the os.getenv lookups below run.
load_dotenv()
# ─────────────────────────── Load config.yaml ───────────────────────────
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

RAW_DATA_DIR        = Path(config["raw_data_dir"])
PROCESSED_DATA_DIR  = Path(config["processed_data_dir"])
INTERIM_DATA_DIR    = Path(config["interim_data_dir"])
RESULTS_DIR         = Path(config["results_dir"])
EMBEDDING_DIR       = Path(config["embedding_parquet_dir"])
AOI_CRS             = config["aoi_crs"]
METRIC_CRS          = config["metric_crs"]
AOI_BOX             = config["aoi_box"]

# Updated absolute paths
DTM_DIR = RAW_DATA_DIR / "datasets/nasa-amazon-lidar-2008-2018/Nasa_lidar_2008_to_2018_DTMs"
PATH_TO_COORDS_CSV = RAW_DATA_DIR / "datasets/nasa-amazon-lidar-2008-2018/cms_brazil_lidar_tile_inventory.csv"

# "predicted" folder inside results/
OUT_DIR = RESULTS_DIR / "predicted"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Other parameters
GSA_email = os.getenv("GSA_email")
KEY_PATH = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
if not KEY_PATH:
    # Same exception type as a plain os.environ[...] lookup, but the message
    # tells the user what to do instead of only echoing the variable name.
    raise KeyError(
        "GOOGLE_APPLICATION_CREDENTIALS is not set; define it in the environment "
        "or in .env so it points to the service-account key file"
    )

# At most this many inventory rows are kept per filename prefix.
MAX_PHOTOS_PER_PREFIX = 3
# Radius of the buffer drawn around each tile centre before taking its bounds.
BUFFER_RADIUS_METERS = 1500
# Last day of the imagery search window; also embedded in output file names.
DATE_TO_DOWNLOAD = "2025-05-01"
# Two (azimuth, altitude) light directions, in degrees, for the hillshades.
AZ1, ALT1 = 315, 45
AZ2, ALT2 = 45, 30

EE_COLLECTION_S1 = "COPERNICUS/S1_GRD"
VIS_PARAMS_S1 = {'bands': ['VV'], 'min': -25, 'max': 5}
EE_COLLECTION_S2 = "COPERNICUS/S2_SR_HARMONIZED"
VIS_PARAMS_S2 = {'bands': ['B4', 'B3', 'B2'], 'min': 0, 'max': 3000, 'gamma': 1.3}

# Fail fast: nothing below can run without the tile inventory.
if not PATH_TO_COORDS_CSV.exists():
    raise FileNotFoundError(f"CSV not found: {PATH_TO_COORDS_CSV}")

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s"
)

# Authenticate to Earth Engine with the service-account key.
creds = ServiceAccountCredentials(GSA_email, KEY_PATH)
ee.Initialize(credentials=creds, project="kaggle-ai-to-z")
# ─────────────────────────── Helpers ────────────────────────────
def get_best_s1(pt, start, end):
    """Pick a Sentinel-1 scene covering a point within a date window.

    Args:
        pt: Coordinates handed to ``ee.Geometry.Point``.
        start: Start of the window, forwarded to ``filterDate``.
        end: End of the window, forwarded to ``filterDate``.

    Returns:
        ``(img, geom)``. ``img`` is the first scene of ``EE_COLLECTION_S1``
        intersecting the point when sorted by ascending
        ``system:time_start``, or ``None`` if the filtered collection is
        empty. ``geom`` is the bounding box of the ``BUFFER_RADIUS_METERS``
        buffer around the point.
    """
    point = ee.Geometry.Point(pt)
    geom = point.buffer(BUFFER_RADIUS_METERS).bounds()
    scenes = (ee.ImageCollection(EE_COLLECTION_S1)
              .filterBounds(point)
              .filterDate(start, end)
              .sort('system:time_start'))
    # `.first()` yields a client-side proxy object that is truthy even when
    # the collection is empty, so callers testing `if img:` would never see
    # the "nothing found" case. Resolve the count server-side instead.
    if scenes.size().getInfo() == 0:
        return None, geom
    return scenes.first(), geom

def get_best_s2(pt, start, end, max_cloud=30):
    """Pick the least cloudy Sentinel-2 scene covering a point.

    Args:
        pt: Coordinates handed to ``ee.Geometry.Point``.
        start: Start of the window, forwarded to ``filterDate``.
        end: End of the window, forwarded to ``filterDate``.
        max_cloud: Scenes are kept only if their
            ``CLOUDY_PIXEL_PERCENTAGE`` is strictly below this value.

    Returns:
        ``(img, geom)``. ``img`` is the scene of ``EE_COLLECTION_S2`` with
        the lowest ``CLOUDY_PIXEL_PERCENTAGE`` among those passing the
        filters, or ``None`` if none pass. ``geom`` is the bounding box of
        the ``BUFFER_RADIUS_METERS`` buffer around the point.
    """
    point = ee.Geometry.Point(pt)
    geom = point.buffer(BUFFER_RADIUS_METERS).bounds()
    scenes = (ee.ImageCollection(EE_COLLECTION_S2)
              .filterBounds(point)
              .filterDate(start, end)
              .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', max_cloud))
              .sort('CLOUDY_PIXEL_PERCENTAGE'))
    # `.first()` yields a client-side proxy object that is truthy even when
    # the collection is empty, so callers testing `if img:` would never see
    # the "nothing found" case. Resolve the count server-side instead.
    if scenes.size().getInfo() == 0:
        return None, geom
    return scenes.first(), geom

def save_jpg(img, region, out_path, vis_params):
    """Download a visualised thumbnail of ``img`` and write it to disk.

    Args:
        img: Image object exposing ``visualize(**vis_params)``.
        region: Region passed as the thumbnail's ``region`` parameter.
        out_path: ``pathlib.Path`` the downloaded bytes are written to.
        vis_params: Keyword arguments forwarded to ``img.visualize``.
    """
    vis = img.visualize(**vis_params)
    url = vis.getThumbURL({'region': region, 'dimensions': 800, 'format': 'jpg'})
    # Close the HTTP response deterministically instead of leaving the
    # connection to the garbage collector.
    with urllib.request.urlopen(url) as response:
        out_path.write_bytes(response.read())

def save_tif(img, region, out_path, bands=None):
    """Download ``img`` as a GeoTIFF and write it to disk.

    The request asks for ``scale`` 10, CRS ``EPSG:4326`` and format
    ``GEO_TIFF``.

    Args:
        img: Image object exposing ``getDownloadURL(params)``.
        region: Region passed as the download's ``region`` parameter.
        out_path: ``pathlib.Path`` the downloaded bytes are written to.
        bands: Optional band list; when falsy (``None`` or empty) no
            ``bands`` entry is sent with the request.
    """
    params = {'scale': 10, 'region': region, 'format': 'GEO_TIFF', 'crs': 'EPSG:4326'}
    if bands:
        params['bands'] = bands
    url = img.getDownloadURL(params)
    # Close the HTTP response deterministically instead of leaving the
    # connection to the garbage collector.
    with urllib.request.urlopen(url) as response:
        out_path.write_bytes(response.read())

def hillshade(arr, az, alt):
    """Render a shaded-relief image of a 2-D elevation grid.

    Args:
        arr: 2-D array of elevations; converted to float32 before
            differentiation. Each axis needs at least three samples
            (required by ``edge_order=2``). NaN cells are allowed.
        az: Light-source azimuth in degrees.
        alt: Light-source altitude in degrees.

    Returns:
        ``uint8`` array with the shape of ``arr`` and values in 0..255.
        Cells whose illumination cannot be computed (a NaN elevation took
        part in their gradient) are 0.
    """
    az, alt = np.deg2rad([az, alt])
    # Default sample spacing of 1: slopes are in elevation units per pixel.
    dy, dx = np.gradient(arr.astype("float32"), edge_order=2)
    slope = np.arctan(np.hypot(dx, dy))
    aspect = np.arctan2(dy, -dx)
    hs = (np.sin(alt) * np.cos(slope) +
          np.cos(alt) * np.sin(slope) * np.cos(az - aspect))
    # Casting NaN to an unsigned integer is undefined and emits a
    # RuntimeWarning; pin undefined illumination to 0 before the cast.
    hs = np.nan_to_num(np.clip(hs, 0, 1), nan=0.0)
    return (hs * 255).astype("uint8")

# ─────────────────────────── Main ────────────────────────────────
def select_tiles(rows, max_per_prefix):
    """Choose up to ``max_per_prefix`` inventory rows per filename prefix.

    The prefix of a row is the first two ``_``-separated parts of its
    file stem.

    Args:
        rows: Iterable of mappings with ``filename``, ``min_lon``,
            ``max_lon``, ``min_lat`` and ``max_lat`` keys (for example a
            ``csv.DictReader``).
        max_per_prefix: Maximum number of rows kept for any one prefix.

    Returns:
        ``(tiles, prefix_counts)``. ``tiles`` lists ``(stem, [lon, lat])``
        pairs in input order, the point being the centre of the row's
        bounding box. ``prefix_counts`` maps each prefix to the number of
        rows kept for it.
    """
    tiles = []
    prefix_counts = {}
    for row in rows:
        full_stem = Path(row["filename"]).stem.replace(".laz", "")
        prefix = "_".join(full_stem.split("_")[:2])
        kept = prefix_counts.get(prefix, 0)
        if kept >= max_per_prefix:
            continue
        prefix_counts[prefix] = kept + 1

        minx, miny = float(row["min_lon"]), float(row["min_lat"])
        maxx, maxy = float(row["max_lon"]), float(row["max_lat"])
        tiles.append((full_stem, [(minx + maxx) / 2, (miny + maxy) / 2]))
    return tiles, prefix_counts


if __name__ == "__main__":
    with open(PATH_TO_COORDS_CSV, newline='', encoding='utf-8') as cf:
        matches, prefix_counts = select_tiles(csv.DictReader(cf), MAX_PHOTOS_PER_PREFIX)

    logging.info(
        f"Found {len(matches)} entries across "
        f"{len(prefix_counts)} prefixes (max {MAX_PHOTOS_PER_PREFIX} each)"
    )

    # The imagery search window is identical for every tile.
    start = "2024-01-01"
    end = f"{DATE_TO_DOWNLOAD}T23:59:59"

    for stem, pt in matches:
        logging.info(f"Processing tile {stem}")

        # Check for the DTM first so that tiles without one do not leave an
        # empty output directory behind.
        lidar_fp = DTM_DIR / f"{stem}.tif"
        if not lidar_fp.exists():
            logging.warning(f" LiDAR TIFF missing for {stem}")
            continue
        tile_dir = OUT_DIR / stem
        tile_dir.mkdir(exist_ok=True)
        shutil.copy(lidar_fp, tile_dir / f"{stem}_lidar.tif")

        # NOTE(review): the else branches below are only reachable if the
        # get_best_* helpers return a falsy image when nothing matched.
        s1_img, s1_reg = get_best_s1(pt, start, end)
        if s1_img:
            save_jpg(s1_img, s1_reg, tile_dir / f"{stem}_S1_{DATE_TO_DOWNLOAD}.jpg", VIS_PARAMS_S1)
            save_tif(s1_img, s1_reg, tile_dir / f"{stem}_S1_{DATE_TO_DOWNLOAD}.tif")
        else:
            logging.warning(f" No S1 for {stem}")

        s2_img, s2_reg = get_best_s2(pt, start, end)
        if s2_img:
            save_jpg(s2_img, s2_reg, tile_dir / f"{stem}_S2_{DATE_TO_DOWNLOAD}.jpg", VIS_PARAMS_S2)
            save_tif(s2_img, s2_reg, tile_dir / f"{stem}_S2_{DATE_TO_DOWNLOAD}.tif",
                     bands=['B4', 'B3', 'B2'])
        else:
            logging.warning(f" No S2 for {stem}")

        # Read band 1 of the DTM and blank out nodata cells before shading.
        with rasterio.open(lidar_fp) as src:
            arr = src.read(1).astype("float32")
            nod = src.nodata
            if nod is not None:
                arr[arr == nod] = np.nan
        # Two light directions rendered side by side in a single JPEG.
        hs1 = hillshade(arr, AZ1, ALT1)
        hs2 = hillshade(arr, AZ2, ALT2)
        comp = np.concatenate([hs1, hs2], axis=1)
        Image.fromarray(comp).save(tile_dir / f"{stem}_lidar_hillshade.jpg", quality=90)

    logging.info("Processing complete.")


# Output: KeyError: 'GOOGLE_APPLICATION_CREDENTIALS'