<a href="https://colab.research.google.com/github/RobinHamers/mombasa_solutions/blob/main/get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imports



In [28]:
# Install/upgrade the geospatial dependencies into the kernel's environment (-q: quiet, -U: upgrade).
# NOTE(review): versions are unpinned, so a re-run may pull different releases — consider pinning.
%pip install -q -U geopandas contextily h3pandas shapely

In [29]:
import os
import pandas as pd
import geopandas as gpd
import sys
from functools import reduce
import contextily as ctx
import numpy as np
import h3pandas

In [39]:
# Layer key -> GeoJSON file. The paths are absolute, so they resolve to the
# same files whatever directory they are later joined with.
DB_PTH_DCT = dict(
    metrics_day='/content/LST-August-2024-Day.geojson',
    metrics_night='/content/LST-August-2024-Night-.geojson',
    metrics_canopy_cover='/content/Mombasa-Canopy-Cover-.geojson',
    metrics_heat_retention='/content/Heat-Retention-Risk-202408-.geojson',
    # medical_care='weo-data_dashboard_medical_care.geojson',  # layer currently disabled
)

# Column shared by every layer; both names refer to the same value.
column_to_merge_on = 'h3'
INDEX = column_to_merge_on

In [40]:
# Read every configured layer once and keep each frame addressable by its key,
# rather than rebinding a single name per read (which keeps only the last frame).
layer_previews = {key: gpd.read_file(path) for key, path in DB_PTH_DCT.items()}

# `df` is the heat-retention layer, i.e. the frame previewed below.
df = layer_previews['metrics_heat_retention']

# Displaying the first few rows to check if it loaded correctly
df.head()

Unnamed: 0,felt:feature,h3,metric,date,label,geometry
0,1,623665692029255679,0.0,2024-08-01,heat_retention_risk_max,POINT Z (39.73854 -4.14051 0)
1,2,623665692029288447,0.0,2024-08-01,heat_retention_risk_max,POINT Z (39.73737 -4.14006 0)
2,3,623665692029321215,0.0,2024-08-01,heat_retention_risk_max,POINT Z (39.73949 -4.13973 0)
3,4,623665692029353983,0.0,2024-08-01,heat_retention_risk_max,POINT Z (39.73832 -4.13928 0)
4,5,623665692029386751,0.0,2024-08-01,heat_retention_risk_max,POINT Z (39.73877 -4.14174 0)


In [35]:
def collapse_duplicates(df, index_col):
    """Collapse rows sharing the same ``index_col`` value into a single row.

    Object-dtype columns become a ', '-joined string of their distinct
    non-empty values in first-seen order (``np.nan`` when the group has no
    non-null value). Any other column keeps its first non-null value
    (``pd.NA`` when there is none).

    Args:
        df: Frame to collapse.
        index_col: Column (or columns) to group by.

    Returns:
        A new frame with one row per group and ``index_col`` restored as a
        regular column.
    """
    def merge_values(series):
        present = series.dropna()

        # Non-text columns: keep the first value that is actually present.
        if series.dtype != object:
            if present.empty:
                return pd.NA
            return present.iloc[0]

        if present.empty:
            return np.nan

        # dict.fromkeys drops repeats while preserving first-seen order;
        # empty strings are skipped.
        texts = present.astype(str)
        distinct = dict.fromkeys(text for text in texts if text)
        return ', '.join(distinct)

    return df.groupby(index_col).agg(merge_values).reset_index()

def _h3_to_int(value):
    """Convert one H3 cell id to a Python int; missing values become None."""
    if pd.isna(value):
        return None
    if isinstance(value, (int, np.integer)):
        # Already numeric: int(value, 16) raises TypeError for non-strings.
        return int(value)
    # NOTE(review): strings are parsed as hexadecimal. The sample run shows
    # decimal-looking ids (e.g. 623665692029255679); if those arrive as strings,
    # base-16 parsing yields a different number — confirm the intended encoding.
    return int(value, 16)


def _load_geo_layer(path, key, filename, resolution=10):
    """Read one geospatial file, normalise it to EPSG:4326 and attach H3 columns."""
    gdf = gpd.read_file(path)

    if gdf.geometry.name != 'geometry':
        raise ValueError(f"GeoDataFrame {key} ({filename}) does not have a 'geometry' column.")
    if gdf.crs is None:
        raise ValueError(f"GeoDataFrame {key} ({filename}) does not have a CRS defined.")
    if gdf.crs.to_string() != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    # The layer's label column is renamed after the layer key:
    # 'text' for the comments layer, 'name' for every other layer.
    label_column = 'text' if key == 'comments' else 'name'
    gdf = gdf.rename(columns={label_column: key})

    # The first geometry decides how the whole layer is mapped onto H3 cells.
    geom_type = gdf.geometry.iloc[0].geom_type
    if geom_type == "Point":
        # h3pandas adds its own cell column (appears as `h3_10` in the run output).
        gdf = gdf.h3.geo_to_h3(resolution=resolution, set_index=False)
    elif geom_type == "MultiPoint":
        # Split every MultiPoint into its individual points, then index each point.
        gdf = gdf.explode(ignore_index=True)
        gdf = gdf.h3.geo_to_h3(resolution=resolution, set_index=False)
    elif geom_type == "Polygon":
        # Fill at a finer resolution (+4), then aggregate back up to the target cells.
        # NOTE(review): the aggregated column name is hard-coded — confirm it
        # applies to every polygon layer.
        gdf = (
            gdf.h3.polyfill(resolution + 4, explode=True)
            .set_index('h3_polyfill')
            .h3.h3_to_parent_aggregate(resolution, operation={'emergency_assemble_areas': 'first'})
        )
        gdf = gdf.reset_index()
    else:
        print(f"Unsupported geometry type {geom_type} for {key} in {filename}. Skipping H3 assignment.", file=sys.stderr)

    # Relies on an 'h3' column being present in the layer; a missing column
    # raises KeyError, which the caller reports and treats as a failed layer.
    gdf['h3_int'] = gdf['h3'].apply(_h3_to_int)
    return gdf


def _merge_layers(frames, index):
    """Outer-join ``frames`` on ``index``, giving every later layer its own suffix."""
    merged = frames[0]
    for position, frame in enumerate(frames[1:], start=1):
        # Reusing one suffix for every merge yields duplicate '*_dup' column
        # names from the third layer on, so later layers get '_dup2', '_dup3', ...
        suffix = '_dup' if position == 1 else f'_dup{position}'
        merged = pd.merge(merged, frame, on=index, how='outer', suffixes=('', suffix))
    return merged


def get_data(data_dir="./", db_pth_dct=DB_PTH_DCT, index=INDEX):
    """Load every configured layer, merge them on ``index`` and export the result.

    CSV files are read with pandas. ``.geojson`` / ``.gpkg`` / ``.zip`` files are
    read with geopandas, reprojected to EPSG:4326 and given H3 columns; a layer
    that fails to load is reported on stderr and stored as ``None``. Files with
    any other extension are stored as ``None``.

    Side effects: writes the merged table to ``df_export.csv`` in the current
    working directory and prints a per-layer summary.

    Args:
        data_dir: Directory joined in front of each filename. An absolute
            filename makes ``os.path.join`` discard ``data_dir``.
        db_pth_dct: Mapping of layer key -> filename.
        index: Column every layer is merged on.

    Returns:
        The first rows (``.head()``) of the merged frame.

    Raises:
        ValueError: If no layer was loaded with an ``index`` column.
    """
    db_gdfs = {}

    for key, filename in db_pth_dct.items():
        ext = os.path.splitext(filename)[1].lower()
        path = os.path.join(data_dir, filename)
        if ext == ".csv":
            db_gdfs[key] = pd.read_csv(path)
        elif ext in (".geojson", ".gpkg", ".zip"):
            try:
                db_gdfs[key] = _load_geo_layer(path, key, filename)
            except Exception as e:
                # Best effort: one broken layer must not abort the other layers.
                db_gdfs[key] = None
                print(f"Error with {key} DB, {filename}: {e}", file=sys.stderr)
        else:
            db_gdfs[key] = None

    # Only frames that loaded and carry the merge column can take part in the merge.
    mergeable = [
        frame for frame in db_gdfs.values()
        if isinstance(frame, pd.DataFrame) and index in frame.columns
    ]
    if not mergeable:
        raise ValueError(f"No layer with a '{index}' column could be loaded; nothing to merge.")
    merged_gdf = _merge_layers(mergeable, index)

    merged_gdf.to_csv("df_export.csv", index=False)

    for key, gdf in db_gdfs.items():
        if gdf is None:
            print(f"{key} DB is None or empty.")
        elif index not in gdf.columns:
            print(f"{key} DB has no '{index}' column; it was left out of the merge.")
        else:
            print(f"{key} unique polygons on total rows in DB: {gdf[index].nunique()}/{len(gdf)}.")

    return merged_gdf.head()


if __name__ == "__main__":
    # Run the pipeline against the Colab upload directory and print the preview rows.
    head = get_data(data_dir="/content/")
    print("Head of the output = ", head, sep="\n")

Error with metrics_impervious_surfaces DB, /content/Impervious-surfaces.geojson: Failed to read GeoJSON data; At line 723239, character 65: Unterminated string


metrics_day unique polygons on total rows in DB: 29392/29392.
metrics_night unique polygons on total rows in DB: 30199/30199.
metrics_canopy_cover unique polygons on total rows in DB: 39518/39518.
metrics_heat_retention unique polygons on total rows in DB: 37881/37881.
metrics_impervious_surfaces DB is None or empty.
Head of the output = 
   felt:feature                  h3  metric date label geometry h3_10 h3_int  \
0           NaN  623665692029255679     NaN  NaT   NaN     None   NaN    NaN   
1           NaN  623665692029288447     NaN  NaT   NaN     None   NaN    NaN   
2           NaN  623665692029321215     NaN  NaT   NaN     None   NaN    NaN   
3           NaN  623665692029353983     NaN  NaT   NaN     None   NaN    NaN   
4           NaN  623665692029386751     NaN  NaT   NaN     None   NaN    NaN   

  felt:feature_dup metric_dup  ...                   geometry_dup  \
0              NaN        NaN  ...  POINT Z (39.73854 -4.14051 0)   
1              NaN        NaN  ...  POIN