<a href="https://colab.research.google.com/github/RobinHamers/mombasa_solutions/blob/main/get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imports



In [3]:
%pip install -q -U geopandas contextily h3pandas shapely

In [None]:
import os
import pandas as pd
import geopandas as gpd
import sys
from functools import reduce
import contextily as ctx
import numpy as np
import h3pandas

In [None]:
# Maps a database key to the file name it is loaded from (relative to the
# data directory handed to ``get_data``).  Each key must appear only once:
# a repeated key in a dict literal silently overwrites the earlier entry.
DB_PTH_DCT = {
    'metrics': 'weo-data_dashboard_Heat-Risk-.zip',
    'medical_care': 'weo-data_dashboard_medical_care.geojson',
}

# Name of the column every database is merged on.
INDEX = column_to_merge_on = 'h3'

In [None]:
def collapse_duplicates(df, index_col):
    """Collapse rows sharing the same ``index_col`` value into a single row.

    Each remaining column is aggregated per group:

    * object-dtype columns: non-null values are converted to ``str``, empty
      strings are dropped, duplicates are removed while keeping first-seen
      order, and the survivors are joined with ``', '``.  A group with no
      non-null value yields ``np.nan``.
    * any other dtype: the first non-null value of the group, or ``pd.NA``
      when the group holds no non-null value.

    Args:
        df: Frame to collapse.
        index_col: Column (or grouping key accepted by ``DataFrame.groupby``)
            identifying the rows that belong together.

    Returns:
        A new frame with one row per group and ``index_col`` restored as a
        regular column.
    """

    def _merge_group(column):
        is_text = column.dtype == object
        if not is_text:
            present = column.dropna()
            return pd.NA if present.empty else present.iloc[0]

        texts = column.dropna().astype(str)
        if texts.empty:
            return np.nan
        # dict keys keep insertion order, giving an order-preserving de-dup.
        distinct = dict.fromkeys(text for text in texts if text)
        return ', '.join(distinct)

    grouped = df.groupby(index_col)
    return grouped.agg(_merge_group).reset_index()

def get_data(data_dir="./", db_pth_dct=DB_PTH_DCT, index=INDEX):
    """Load the configured databases, tag them with H3 cells and merge them.

    For every ``key -> filename`` entry of ``db_pth_dct`` the file is read
    from ``data_dir``:

    * ``.csv`` files are read with ``pandas.read_csv`` and stored unchanged.
    * ``.geojson`` / ``.gpkg`` / ``.zip`` files are read with
      ``geopandas.read_file``, reprojected to EPSG:4326 when needed, have
      their ``name`` column (``text`` for the ``'comments'`` key) renamed to
      the key, receive an H3 assignment that depends on the geometry type of
      the first row, and get an ``h3_int`` column holding the ``h3`` value
      parsed as a base-16 integer.  Any exception raised while processing
      such a file is reported on stderr and the database is stored as
      ``None``.
    * Any other extension is stored as ``None``.

    All loaded frames that contain the ``index`` column are outer-merged on
    it (overlapping columns from the right-hand frame get a ``_dup``
    suffix).  The merged frame is written to ``df_export.csv`` in the
    current working directory and a one-line summary per database is
    printed.

    Args:
        data_dir: Directory containing the database files.
        db_pth_dct: Mapping of database key to file name.  It is only read,
            never modified.
        index: Name of the column to merge on.

    Returns:
        ``merged_gdf.head()`` - the leading rows of the merged frame.

    Raises:
        TypeError: Propagated from ``functools.reduce`` when no loaded frame
            contains the ``index`` column, i.e. there is nothing to merge.
    """
    h3_res = 10
    db_gdfs = {}

    for key, filename in db_pth_dct.items():
        ext = os.path.splitext(filename)[1].lower()
        path = os.path.join(data_dir, filename)

        if ext == ".csv":
            db_gdfs[key] = pd.read_csv(path)
        elif ext in (".geojson", ".gpkg", ".zip"):
            try:
                gdf = gpd.read_file(path)

                # A GeoDataFrame has no ``name`` attribute, so the messages
                # identify the offending database by key and file name.
                if gdf.geometry.name != 'geometry':
                    raise ValueError(f"GeoDataFrame {key} ({filename}) does not have a 'geometry' column.")
                if gdf.crs is None:
                    raise ValueError(f"GeoDataFrame {key} ({filename}) does not have a CRS defined.")

                if gdf.crs.to_string() != "EPSG:4326":
                    gdf = gdf.to_crs("EPSG:4326")

                # Expose the descriptive column under the database key.
                label_col = 'text' if key == 'comments' else 'name'
                gdf = gdf.rename(columns={label_col: key})

                # The H3 strategy is picked from the first row's geometry
                # type only; mixed-geometry layers are not detected.
                geom_type = gdf.geometry.iloc[0].geom_type
                if geom_type == "Point":
                    # Points get their H3 cell assigned directly.
                    gdf = gdf.h3.geo_to_h3(resolution=h3_res, set_index=False)
                elif geom_type == "MultiPoint":
                    # MultiPoints are exploded into one row per point first.
                    gdf = gdf.explode(ignore_index=True)
                    gdf = gdf.h3.geo_to_h3(resolution=h3_res, set_index=False)
                elif geom_type == "Polygon":
                    # Fill at a finer resolution, then aggregate back to the
                    # parent cells, keeping the first value of each group.
                    # Add other columns to ``operation`` as needed,
                    # e.g. 'count': 'sum'.
                    gdf = (
                        gdf.h3.polyfill(h3_res + 4, explode=True)
                        .set_index('h3_polyfill')
                        .h3.h3_to_parent_aggregate(
                            h3_res,
                            operation={'emergency_assemble_areas': 'first'},
                        )
                    )
                    gdf = gdf.reset_index()
                else:
                    print(f"Unsupported geometry type {geom_type} for {key} in {filename}. Skipping H3 assignment.", file=sys.stderr)

                # NOTE(review): this reads an 'h3' column; confirm that the
                # h3pandas calls above (or the input data) provide it under
                # that exact name, otherwise the KeyError is caught below
                # and the database is dropped.
                gdf['h3_int'] = gdf['h3'].apply(lambda x: int(x, 16) if pd.notna(x) else None)
                db_gdfs[key] = gdf

            except Exception as e:
                # Best effort: one broken database must not abort the rest.
                db_gdfs[key] = None
                print(f"Error with {key} DB, {filename}: {e}", file=sys.stderr)
        else:
            db_gdfs[key] = None

    # Merge every loaded frame that carries the index column.
    mergeable = [
        df for df in db_gdfs.values()
        if isinstance(df, pd.DataFrame) and index in df.columns
    ]
    merged_gdf = reduce(
        lambda left, right: pd.merge(left, right, on=index, how='outer', suffixes=('', '_dup')),
        mergeable,
    )

    merged_gdf.to_csv("df_export.csv", index=False)

    for key, gdf in db_gdfs.items():
        if gdf is None:
            print(f"{key} DB is None or empty.")
        elif index not in gdf.columns:
            # E.g. a CSV without the index column: it was left out of the
            # merge, so there is nothing to count here.
            print(f"{key} DB has no '{index}' column; skipping uniqueness summary.")
        else:
            print(f"{key} unique polygons on total rows in DB: {gdf[index].nunique()}/{len(gdf)}.")

    return merged_gdf.head()


# Script entry point: load the databases from the Colab content directory
# and show the leading rows of the merged result.
if __name__ == "__main__":
    preview = get_data(data_dir="/content/")
    print("Head of the output = ", preview, sep="\n")

(1, 54)
number of unique h3 indices: 1
Filtered DataFrame shape: (171805, 54)
Added 1 missing rows from pois_df to selected_rows.
metrics unique polygons on total rows in DB: 315603/315603.
medical_care unique polygons on total rows in DB: 1/1.
Path of the output =    felt:feature  felt:has_geometry       felt:h3_index                  h3  \
0        237694               True  647365069018628096  624847070881808383   
1        237921               True  647365069043466240  624847070906646527   
2         15877               True  647365069046939648  624847070910119935   
3         15884               True  647365069047398400  624847070910578687   
4        237973               True  647365069047955456  624847070911135743   

   flood_risk  tree_count_sum  fire_risk_202501  fire_risk_202502  \
0         1.0             6.0               1.0               1.0   
1         2.0             8.0               1.0               1.0   
2         1.0             2.0               1.0           