In [1]:
# # %% ------------------ Imports ------------------
# import os
# import glob
# import yaml
# import pickle
# import itertools
# import numpy as np
# import pandas as pd
# import polars as pl
# import geopandas as gpd
# from shapely.geometry import Point
# from sklearn.neighbors import BallTree
# from sklearn.decomposition import PCA
# from tqdm.auto import tqdm

# # %% ------------------ Functions ------------------
# def pick_point(row):
#     pt = row.get("geometry_point")
#     if isinstance(pt, Point):
#         return pt
#     return row.geometry.centroid

# def build_tile_dataframe(df, crs="EPSG:3857"):
#     """Converts input DataFrame to GeoDataFrame and ensures geometry_point exists."""
#     gdf = gpd.GeoDataFrame(df.copy(), geometry=df["geometry"], crs=crs)
#     gdf["geometry_point"] = df.apply(pick_point, axis=1)
#     return gdf

# def load_embedding_metadata(parquet_dir):
#     """Loads coordinates and IDs from all .parquet files in the given directory."""
#     parquets = glob.glob(os.path.join(parquet_dir, "*.parquet"))
#     if not parquets:
#         raise FileNotFoundError(f"No .parquet files found in {parquet_dir}")

#     coords, ids, offsets = [], [], []
#     offset = 0

#     for path in tqdm(parquets, desc="Reading coords from parquet"):
#         part = pl.read_parquet(path, columns=['unique_id','centre_lat','centre_lon']).to_pandas()
#         coords.append(np.vstack([part["centre_lat"], part["centre_lon"]]).T)
#         ids.append(part["unique_id"].tolist())
#         offsets.append((path, offset, offset + len(part)))
#         offset += len(part)

#     return np.vstack(coords), list(itertools.chain.from_iterable(ids)), offsets

# def match_tiles_to_embeddings(gdf, emb_coords, emb_ids):
#     """Finds the closest embedding point for each tile centroid."""
#     tile_xy = np.vstack([gdf.geometry_point.y.values, gdf.geometry_point.x.values]).T
#     tile_rad = np.radians(tile_xy)
#     tree = BallTree(np.radians(emb_coords), metric='haversine')
#     dist_rad, idx = tree.query(tile_rad, k=1)
#     dist_m = dist_rad[:, 0] * 6_371_000
#     gdf["match_id"] = [emb_ids[i] for i in idx[:, 0]]
#     gdf["dist_to_emb"] = dist_m
#     return gdf

# def load_required_embeddings(needed_ids, file_offsets, emb_ids_flat):
#     """Loads only those embedding vectors that match tile centroids."""
#     emb_vectors = {}
#     emb_cols = None

#     for path, start, end in tqdm(file_offsets, desc="Loading embeddings blocks"):
#         block_ids = emb_ids_flat[start:end]
#         want = needed_ids.intersection(block_ids)
#         if not want:
#             continue

#         part_pl = (
#             pl.read_parquet(path)
#               .filter(pl.col("unique_id").is_in(list(want)))
#               .select(["unique_id", "embedding"])
#         )
#         part = part_pl.to_pandas()
#         mat = np.vstack(part["embedding"].values)
#         cols = [f"emb_{i}" for i in range(mat.shape[1])]
#         if emb_cols is None:
#             emb_cols = cols

#         df_emb = pd.DataFrame(mat, columns=cols, index=part.index)
#         df_part = pd.concat([part[["unique_id"]], df_emb], axis=1)

#         for _, row in df_part.iterrows():
#             uid = row["unique_id"]
#             emb_vectors[uid] = {c: row[c] for c in cols}

#     return emb_vectors, emb_cols

# def attach_embeddings(gdf, emb_vectors, emb_cols):
#     """Maps embedding values to each tile based on matched embedding ID."""
#     for c in emb_cols:
#         gdf[c] = gdf["match_id"].map(lambda uid: emb_vectors.get(uid, {}).get(c, np.nan))
#     return gdf

# def add_embeddings_to_tiles(df, parquet_dir):
#     """Full pipeline: match tiles to embeddings and attach vectors."""
#     gdf = build_tile_dataframe(df)
#     emb_coords, emb_ids_flat, file_offsets = load_embedding_metadata(parquet_dir)
#     gdf = match_tiles_to_embeddings(gdf, emb_coords, emb_ids_flat)
#     needed_ids = set(gdf["match_id"])
#     emb_vectors, emb_cols = load_required_embeddings(needed_ids, file_offsets, emb_ids_flat)
#     gdf = attach_embeddings(gdf, emb_vectors, emb_cols)
#     return gdf, emb_cols

# # %% ------------------ Main ------------------
# if __name__ == "__main__":
#     # Load config.yaml
#     with open("config.yaml", "r") as f:
#         config = yaml.safe_load(f)

#     processed_path = config["processed_data_dir"]
#     parquet_dir    = config["embedding_parquet_dir"]
#     out_path       = os.path.join(processed_path, "all_tiles_features_with_emb.pkl")

#     # Load input data (tiles with soil features)
#     input_path = os.path.join(processed_path, "all_tiles_features_with_soil.pkl")
#     with open(input_path, "rb") as f:
#         df = pickle.load(f)

#     # Add embeddings
#     gdf_tiles, emb_cols = add_embeddings_to_tiles(df, parquet_dir)

#     # PCA on embeddings
#     gdf_tiles[emb_cols] = gdf_tiles[emb_cols].astype("float16")
#     gdf_tiles["has_geoglyph"] = gdf_tiles["has_geoglyph"].astype(int)
#     X = gdf_tiles[emb_cols].fillna(0).to_numpy()
#     pca = PCA(n_components=7, random_state=42)
#     pcs = pca.fit_transform(X)
#     for i in range(pcs.shape[1]):
#         gdf_tiles[f"PC{i+1}"] = pcs[:, i]
#     gdf_tiles = gdf_tiles.copy()  # Defragment the frame to improve performance

#     # Final column selection
#     base_cols = [
#         "tile_id", "n_geoglyphs", "has_geoglyph",
#         "mean_elev_m", "mean_slope_deg", "geometry_point",
#         "is_mountain", "dist_to_mountain_m", "dist_to_river_m",
#         "country", 
##"ord_flow", "upland_skm", 
#           "drainage_density_m",
#         "tile_area_km2", "drainage_density", "tri", "twi",
#         "curv_plan", "curv_prof", "geometry", "coordinates",
#         "civil", "type", "source", "bbox", "longitude", "latitude",
#         "region", "clay_0_5cm", "ph_h2o_0_5cm", "soc_0_5cm"
#     ]
#     pc_cols = [f"PC{i}" for i in range(1, 8)]
#     all_cols = base_cols + pc_cols
#     missing = [c for c in all_cols if c not in gdf_tiles.columns]
#     if missing:
#         raise KeyError(f"Missing expected columns in gdf_tiles: {missing}")

#     # Save result
#     tiles_reduced = gdf_tiles[all_cols].copy()
#     with open(out_path, "wb") as f:
#         pickle.dump(tiles_reduced, f)

#     print(f"[✓] Saved {tiles_reduced.shape[1]} columns × {tiles_reduced.shape[0]} rows to:\n  {out_path}")

In [2]:
# with open(input_path, "rb") as f:
#     tiles = pickle.load(f)

In [3]:
# %% ------------------ Imports ------------------
import os
import glob
import yaml
import pickle
import itertools
import numpy as np
import pandas as pd
import polars as pl
import geopandas as gpd
from shapely.geometry import Point
from sklearn.neighbors import BallTree
from sklearn.decomposition import PCA
from tqdm.auto import tqdm

# %% ------------------ Functions ------------------
def pick_point(row):
    """Return the representative point for a single tile row.

    The row's "geometry_point" entry is used when it is a shapely ``Point``;
    any other value (including a missing entry) falls back to the centroid
    of ``row.geometry``.
    """
    candidate = row.get("geometry_point")
    return candidate if isinstance(candidate, Point) else row.geometry.centroid

def build_tile_dataframe(df, crs="EPSG:3857"):
    """Wrap ``df`` in a GeoDataFrame and fill its "geometry_point" column.

    A copy of ``df`` becomes the GeoDataFrame, with ``df["geometry"]`` as the
    geometry and ``crs`` as its coordinate reference system. The
    "geometry_point" column is (re)computed row by row with ``pick_point``.
    """
    tiles = gpd.GeoDataFrame(df.copy(), geometry=df["geometry"], crs=crs)
    # Points are derived from the untouched input frame, not from the copy.
    tile_points = df.apply(pick_point, axis=1)
    tiles["geometry_point"] = tile_points
    return tiles

def load_embedding_metadata(parquet_dir):
    """Collect coordinates and IDs from every ``*.parquet`` file in a directory.

    Only the 'unique_id', 'centre_lat' and 'centre_lon' columns are read.

    Returns:
        A 3-tuple ``(coords, ids, offsets)``:
        - coords: array with one (centre_lat, centre_lon) row per record,
          stacked across files in the order the files were read;
        - ids: flat list of 'unique_id' values in the same row order;
        - offsets: list of ``(path, start, end)`` giving the half-open row
          range each file occupies in ``coords`` / ``ids``.

    Raises:
        FileNotFoundError: if the directory contains no ``.parquet`` files.
    """
    pattern = os.path.join(parquet_dir, "*.parquet")
    parquet_paths = glob.glob(pattern)
    if len(parquet_paths) == 0:
        raise FileNotFoundError(f"No .parquet files found in {parquet_dir}")

    coord_blocks = []
    flat_ids = []
    file_ranges = []
    cursor = 0  # running row count over the files read so far

    for path in tqdm(parquet_paths, desc="Reading coords from parquet"):
        frame = pl.read_parquet(path, columns=['unique_id','centre_lat','centre_lon']).to_pandas()
        n_rows = len(frame)
        coord_blocks.append(np.vstack([frame["centre_lat"], frame["centre_lon"]]).T)
        flat_ids.extend(frame["unique_id"].tolist())
        file_ranges.append((path, cursor, cursor + n_rows))
        cursor += n_rows

    return np.vstack(coord_blocks), flat_ids, file_ranges

def match_tiles_to_embeddings(gdf, emb_coords, emb_ids):
    """Find the nearest embedding point (haversine) for every tile point.

    Args:
        gdf: GeoDataFrame with a CRS and a "geometry_point" column.
        emb_coords: array of (lat, lon) rows in degrees, one per embedding.
        emb_ids: sequence of embedding IDs, positionally aligned with
            ``emb_coords``.

    Returns:
        A copy of ``gdf`` reprojected to EPSG:4326 with two added columns:
        "match_id" (ID of the nearest embedding) and "dist_to_emb"
        (great-circle distance to it, in metres). The stored
        "geometry_point" column itself is left unchanged.
    """
    earth_radius_m = 6_371_000

    source_crs = gdf.crs
    gdf = gdf.to_crs("EPSG:4326")

    # GeoDataFrame.to_crs only transforms the *active* geometry column, so
    # the points used for matching have to be reprojected explicitly;
    # otherwise projected coordinates would be fed to np.radians as if they
    # were degrees.
    points = gpd.GeoSeries(gdf["geometry_point"])
    if points.crs is None:
        # NOTE(review): points without their own CRS are assumed to be in the
        # frame's original CRS (pick_point falls back to centroids of the
        # frame geometry) — confirm against the input data.
        points = points.set_crs(source_crs)
    points = points.to_crs("EPSG:4326")

    # BallTree's haversine metric expects (lat, lon) in radians.
    tile_latlon = np.column_stack([points.y.to_numpy(), points.x.to_numpy()])
    tree = BallTree(np.radians(emb_coords), metric='haversine')
    dist_rad, idx = tree.query(np.radians(tile_latlon), k=1)

    gdf["match_id"] = [emb_ids[i] for i in idx[:, 0]]
    # Haversine distances come back in radians on the unit sphere.
    gdf["dist_to_emb"] = dist_rad[:, 0] * earth_radius_m
    return gdf

def load_required_embeddings(needed_ids, file_offsets, emb_ids_flat):
    """Load the embedding vectors for the requested IDs only.

    Args:
        needed_ids: set of embedding IDs to load.
        file_offsets: iterable of ``(path, start, end)``; ``start:end`` is the
            slice of ``emb_ids_flat`` holding the IDs stored in ``path``.
        emb_ids_flat: flat sequence of all embedding IDs.

    Returns:
        ``(emb_vectors, emb_cols)`` where ``emb_vectors`` maps each loaded ID
        to a dict ``{"emb_0": value, "emb_1": value, ...}`` and ``emb_cols``
        is the list of those column names taken from the first file that
        contributed rows (``None`` when nothing was loaded).
    """
    emb_vectors = {}
    emb_cols = None

    for path, start, end in tqdm(file_offsets, desc="Loading embeddings blocks"):
        want = needed_ids.intersection(emb_ids_flat[start:end])
        if not want:
            continue

        # Read only the two columns that are used instead of the whole file.
        part = (
            pl.read_parquet(path, columns=["unique_id", "embedding"])
              .filter(pl.col("unique_id").is_in(list(want)))
              .to_pandas()
        )
        if part.empty:
            # np.vstack would raise on an empty sequence.
            continue

        # assumes every "embedding" cell is a vector of the same length
        # (np.vstack requires it) — TODO confirm against the parquet schema
        mat = np.vstack(part["embedding"].values)
        cols = [f"emb_{i}" for i in range(mat.shape[1])]
        if emb_cols is None:
            emb_cols = cols

        # Build the per-ID dicts straight from the matrix: avoids the slow
        # row-wise DataFrame iteration and keeps the IDs in their own dtype.
        for uid, vector in zip(part["unique_id"].tolist(), mat.tolist()):
            emb_vectors[uid] = dict(zip(cols, vector))

    return emb_vectors, emb_cols

def attach_embeddings(gdf, emb_vectors, emb_cols):
    """Left-join the embedding values onto the tiles via "match_id".

    ``emb_vectors`` maps an embedding ID to a dict of column name -> value;
    each dict becomes one row of a lookup table indexed by ID, which is then
    merged onto ``gdf`` using its "match_id" column. ``emb_cols`` is accepted
    for interface compatibility but is not used here.
    """
    lookup_ids = []
    lookup_rows = []
    for uid, vector in tqdm(list(emb_vectors.items()), desc="Building embedding DataFrame"):
        lookup_ids.append(uid)
        lookup_rows.append(vector)

    lookup = pd.DataFrame(lookup_rows, index=lookup_ids)
    lookup.index.name = "match_id"
    return gdf.merge(lookup, how="left", left_on="match_id", right_index=True)

def add_embeddings_to_tiles(df, parquet_dir):
    """Run the full pipeline: match tiles to embeddings and attach the vectors.

    Steps: build the tile GeoDataFrame, read embedding coordinates/IDs from
    ``parquet_dir``, match every tile to its nearest embedding, load only the
    matched vectors, and merge them onto the tiles.

    Returns:
        ``(gdf, emb_cols)`` — the tiles with embedding columns attached and
        the list of embedding column names.
    """
    tiles = build_tile_dataframe(df)
    coords, flat_ids, offsets = load_embedding_metadata(parquet_dir)
    tiles = match_tiles_to_embeddings(tiles, coords, flat_ids)

    matched_ids = set(tiles["match_id"])
    vectors, emb_cols = load_required_embeddings(matched_ids, offsets, flat_ids)

    return attach_embeddings(tiles, vectors, emb_cols), emb_cols

# %% ------------------ Main ------------------
if __name__ == "__main__":
    # Input/output locations come from config.yaml in the working directory.
    with open("config.yaml", "r") as f:
        config = yaml.safe_load(f)

    processed_path = config["processed_data_dir"]
    parquet_dir    = config["embedding_parquet_dir"]
    input_path     = os.path.join(processed_path, "all_tiles_features_with_soil.pkl")
    out_path       = os.path.join(processed_path, "all_tiles_features_with_emb.pkl")

    # NOTE: unpickling can execute arbitrary code — only load files produced
    # by this project's own pipeline.
    with open(input_path, "rb") as f:
        df = pickle.load(f)

    gdf_tiles, emb_cols = add_embeddings_to_tiles(df, parquet_dir)
    if not emb_cols:
        # load_required_embeddings returns None when no vectors were loaded;
        # fail here with a clear message instead of an opaque indexing error.
        raise RuntimeError(f"No embeddings were loaded from {parquet_dir}")

    # Single source of truth for the number of principal components and the
    # names of the columns that hold them.
    n_components = 7
    pc_cols = [f"PC{i}" for i in range(1, n_components + 1)]

    gdf_tiles[emb_cols] = gdf_tiles[emb_cols].astype("float16")
    gdf_tiles["has_geoglyph"] = gdf_tiles["has_geoglyph"].astype(int)
    # Defragment the frame *before* inserting the PC columns. The recorded
    # run emitted one __setitem__ warning per inserted column — presumably
    # pandas fragmentation warnings; verify they are gone after this change.
    gdf_tiles = gdf_tiles.copy()

    # Tiles without a matched embedding enter the PCA as all-zero vectors.
    X = gdf_tiles[emb_cols].fillna(0).to_numpy()
    pca = PCA(n_components=n_components, random_state=42)
    pcs = pca.fit_transform(X)
    for name, component in zip(pc_cols, pcs.T):
        gdf_tiles[name] = component

    base_cols = [
        "tile_id", "n_geoglyphs", "has_geoglyph",
        "mean_elev_m", "mean_slope_deg", "geometry_point",
        "is_mountain", "dist_to_mountain_m", "dist_to_river_m",
        "country",
        # disabled: "ord_flow", "upland_skm",
        "drainage_density_m",
        "tile_area_km2", "drainage_density", "tri", "twi",
        "curv_plan", "curv_prof", "geometry", "coordinates",
        # disabled: "civil", "type",
        "source", "bbox", "longitude", "latitude",
        "region", "clay_0_5cm", "ph_h2o_0_5cm", "soc_0_5cm",
    ]
    all_cols = base_cols + pc_cols

    # Fail early with the full list of absent columns rather than on the
    # first one hit during selection.
    missing = [c for c in all_cols if c not in gdf_tiles.columns]
    if missing:
        raise KeyError(f"Missing expected columns in gdf_tiles: {missing}")

    tiles_reduced = gdf_tiles[all_cols].copy()
    with open(out_path, "wb") as f:
        pickle.dump(tiles_reduced, f)

    print(f"[✓] Saved {tiles_reduced.shape[1]} columns × {tiles_reduced.shape[0]} rows to:\n  {out_path}")


  from .autonotebook import tqdm as notebook_tqdm
Reading coords from parquet: 100%|██████████| 23/23 [00:02<00:00,  9.47it/s]
Loading embeddings blocks: 100%|██████████| 23/23 [02:00<00:00,  5.25s/it]
Building embedding DataFrame: 100%|██████████| 1859/1859 [00:00<00:00, 1940087.37it/s]
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)
  super().__setitem__(key, value)


KeyError: "Missing expected columns in gdf_tiles: ['civil', 'type']"