<a href="https://colab.research.google.com/github/RobinHamers/mombasa_solutions/blob/main/get_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imports



In [56]:
%pip install -q -U geopandas contextily h3pandas shapely

In [57]:
import os
import pandas as pd
import geopandas as gpd
import sys
from functools import reduce
import contextily as ctx
import numpy as np
import h3pandas

In [61]:
# Logical dataset name -> file holding that dataset. Insertion order is the
# order in which get_data() loads (and later merges) the sources.
DB_PTH_DCT = dict(
    metrics_day='/content/LST-August-2024-Day.geojson',
    metrics_night='/content/LST-August-2024-Night-.geojson',  # Corrected path
    metrics_risk='/content/Heat-Retention-Risk-202408-.geojson',
    metrics_cover='/content/Mombasa-Canopy-Cover-.geojson',
    metrics_POI='/content/Points-of-interest.geojson',
    metrics_average='/content/average heat risk and tree count(whole area).csv',
    metrics_count='/content/Tree-Location-Top.csv',
    # Disabled source, kept for reference:
    # medical_care='weo-data_dashboard_medical_care.geojson',
)

# Name of the column every source is keyed on when the frames are merged.
# Both names are kept because each is bound by the original notebook.
column_to_merge_on = 'h3'
INDEX = column_to_merge_on

In [59]:
import geopandas as gpd

# Smoke test: read every input once, in order, so that a missing or unreadable
# file fails right here. `df` is rebound on each pass, so after the loop it
# holds only the last file read.
for _source_path in (
    "/content/LST-August-2024-Day.geojson",
    "/content//LST-August-2024-Night-.geojson",
    "/content//Mombasa-Canopy-Cover-.geojson",
    "/content//Heat-Retention-Risk-202408-.geojson",
    "/content/Points-of-interest.geojson",
    "/content/average heat risk and tree count(whole area).csv",
    "/content/Tree-Location-Top.csv",
):
    df = gpd.read_file(_source_path)

# Show the first rows of the last file as a visual check that loading worked.
df.head()

Unnamed: 0,felt:feature,DN,TH,COR,fid
0,1,0,6,0,-9.22337203685478e+18
1,2,1,7,0,-9.22337203685478e+18
2,3,2,12,0,-9.22337203685478e+18
3,4,3,13,0,-9.22337203685478e+18
4,5,4,6,0,-9.22337203685478e+18


In [None]:
def collapse_duplicates(df, index_col):
    """Collapse rows that share the same ``index_col`` value into one row.

    Text columns (``object`` dtype or pandas' dedicated string dtype) are
    reduced to their distinct non-empty values, in first-seen order, joined
    with ``', '``. Every other column keeps its first non-null value.

    Args:
        df: Frame that may contain several rows per ``index_col`` value.
        index_col: Column name (or list of names) to group by.

    Returns:
        A new DataFrame with one row per group and ``index_col`` restored as a
        regular column. A text column with no values in a group yields
        ``np.nan``; any other column yields ``pd.NA``.
    """
    def is_text(series):
        # Checking only for ``object`` misses columns stored with pandas'
        # string extension dtype, which would then keep just their first value.
        return series.dtype == object or isinstance(series.dtype, pd.StringDtype)

    def collapse_strings(series):
        present = series.dropna()
        if not is_text(series):
            return present.iloc[0] if not present.empty else pd.NA
        if present.empty:
            return np.nan
        # dict.fromkeys de-duplicates while preserving first-seen order;
        # empty strings are skipped so they never add a stray separator.
        distinct = dict.fromkeys(text for text in present.astype(str) if text)
        return ', '.join(distinct)

    return df.groupby(index_col).agg(collapse_strings).reset_index()

# H3 resolution used for point-like sources (points, multipoints, POI centroids).
_POINT_RESOLUTION = 10
# H3 resolution used when filling polygons with cells.
# NOTE(review): this differs from _POINT_RESOLUTION, and cell ids of different
# resolutions never compare equal, so polygon-derived rows cannot line up with
# point-derived rows in the merge. Confirm the mismatch is intended.
_POLYFILL_RESOLUTION = 10 + 4


def _index_points(points, resolution, index):
    """Return *points* with the H3 cell of each point stored in column *index*."""
    known = set(points.columns)
    indexed = points.h3.geo_to_h3(resolution=resolution, set_index=False)
    # h3pandas adds the cell ids under its own column name; rename that column
    # so that every source ends up keyed on the same *index* column.
    added = [col for col in indexed.columns if col not in known]
    if index not in indexed.columns and added:
        indexed = indexed.rename(columns={added[0]: index})
    return indexed


def _index_poi_polygons(gdf, resolution, index):
    """Index polygon features by the H3 cell of their centroid, one row per cell."""
    # The h3 accessor lives on (Geo)DataFrames, not on a GeoSeries, so the
    # centroids are put back into a frame before indexing. geopandas may warn
    # that centroids computed in a geographic CRS are approximate.
    points = gdf.set_geometry(gdf.geometry.centroid)
    indexed = _index_points(points, resolution, index)
    value_cols = [col for col in indexed.columns if col not in ('geometry', index)]
    if value_cols:
        # Several features can fall in one cell; keep the first value of each column.
        return indexed[[index] + value_cols].groupby(index).agg('first').reset_index()
    return pd.DataFrame(indexed[index].unique(), columns=[index])


def _index_polygons(gdf, resolution, index):
    """Fill polygons with H3 cells and return one row per cell, keyed on *index*."""
    # Split multi-part geometries so each row holds a single polygon.
    parts = gdf.explode(ignore_index=True)
    cells = parts.h3.polyfill(resolution, explode=True)
    value_cols = [col for col in cells.columns if col not in ('h3_polyfill', 'geometry')]
    if value_cols:
        # A cell covered by several polygons keeps the first value of each column.
        per_cell = cells.groupby('h3_polyfill').agg({col: 'first' for col in value_cols}).reset_index()
    else:
        per_cell = pd.DataFrame(cells['h3_polyfill'].unique(), columns=['h3_polyfill'])
    return per_cell.rename(columns={'h3_polyfill': index})


def _load_geo_source(key, filename, path, index):
    """Read one vector file and return a frame keyed on *index*.

    Returns None (after logging to stderr) when the file has no usable
    geometry or a geometry type that cannot be indexed.

    Raises:
        ValueError: if the geometry column is not named 'geometry' or the
            file has no CRS.
    """
    gdf = gpd.read_file(path)

    if gdf.geometry.name != 'geometry':
        raise ValueError(f"GeoDataFrame {key} does not have a 'geometry' column.")
    if gdf.crs is None:
        raise ValueError(f"GeoDataFrame {key} does not have a CRS defined.")
    if gdf.crs.to_string() != "EPSG:4326":
        gdf = gdf.to_crs("EPSG:4326")

    # The label column takes the dataset key as its name so that labels from
    # different sources stay distinguishable after the merge.
    label_col = 'text' if key == 'comments' else 'name'
    gdf = gdf.rename(columns={label_col: key})

    # Drop rows without a usable geometry so the type detection below and the
    # H3 helpers never see a missing or empty geometry.
    gdf = gdf[gdf.geometry.notna() & ~gdf.geometry.is_empty]
    if gdf.empty:
        print(f"Skipping H3 assignment for {key} in {filename} due to empty or invalid geometries.", file=sys.stderr)
        return None

    # The first feature decides how the whole layer is treated.
    geom_type = gdf.geometry.iloc[0].geom_type

    if geom_type == "Point":
        frame = _index_points(gdf, _POINT_RESOLUTION, index)
    elif geom_type == "MultiPoint":
        frame = _index_points(gdf.explode(ignore_index=True), _POINT_RESOLUTION, index)
    elif geom_type in ("Polygon", "MultiPolygon"):
        if key == 'metrics_POI':
            # Points of interest drawn as polygons are treated as points.
            frame = _index_poi_polygons(gdf, _POINT_RESOLUTION, index)
        else:
            frame = _index_polygons(gdf, _POLYFILL_RESOLUTION, index)
    else:
        print(f"Unsupported geometry type {geom_type} for {key} in {filename}. Skipping H3 assignment.", file=sys.stderr)
        return None

    if index in frame.columns:
        # Integer form of the hexadecimal cell id.
        frame['h3_int'] = frame[index].apply(lambda x: int(x, 16) if pd.notna(x) else None)
    return frame


def _merge_on_index(frames, index):
    """Outer-merge *frames* on *index*, keeping every column name unique.

    A column of a later frame that clashes with an existing one gets the
    suffix '_dup'; further clashes get '_dup2', '_dup3', and so on. A single
    fixed suffix would repeat the same name once three frames share a column.
    """
    merged = frames[0]
    for right in frames[1:]:
        taken = set(merged.columns)
        renames = {}
        for col in right.columns:
            if col == index or col not in taken:
                continue
            candidate, attempt = f"{col}_dup", 2
            while candidate in taken or candidate in right.columns:
                candidate = f"{col}_dup{attempt}"
                attempt += 1
            renames[col] = candidate
            taken.add(candidate)
        # Rename on a plain DataFrame view so that renaming a geometry column
        # does not depend on GeoDataFrame-specific rename behaviour.
        merged = pd.merge(merged, pd.DataFrame(right).rename(columns=renames), on=index, how='outer')
    return merged


def get_data(data_dir="./", db_pth_dct=DB_PTH_DCT, index=INDEX):
    """Load every configured source, index it by H3 cell and merge the results.

    CSV files are read as plain DataFrames. Vector files (.geojson, .gpkg,
    .zip) are reprojected to EPSG:4326 and converted to one row per H3 cell.
    All frames that contain the *index* column are outer-merged on it and the
    merged table is written to ``df_export.csv`` in the working directory.
    A one-line summary per source is printed to stdout.

    Args:
        data_dir: Directory the file names are resolved against. An absolute
            file name overrides it, as with ``os.path.join``.
        db_pth_dct: Mapping of dataset key to file name.
        index: Name of the H3 cell column used as the merge key.

    Returns:
        The first five rows of the merged table, or an empty DataFrame with
        only the *index* column when no source could be indexed.
    """
    db_gdfs = {}

    for key, filename in db_pth_dct.items():
        ext = os.path.splitext(filename)[1].lower()
        path = os.path.join(data_dir, filename)
        if ext == ".csv":
            # CSVs are stored as read. They only take part in the merge if
            # they already carry the *index* column.
            db_gdfs[key] = pd.read_csv(path)
        elif ext in (".geojson", ".gpkg", ".zip"):
            try:
                db_gdfs[key] = _load_geo_source(key, filename, path, index)
            except Exception as e:
                # Best effort by design: one broken source must not stop the
                # others from loading, so the failure is reported and skipped.
                db_gdfs[key] = None
                print(f"Error with {key} DB, {filename}: {e}", file=sys.stderr)
        else:
            db_gdfs[key] = None

    dataframes_to_merge = [
        frame for frame in db_gdfs.values()
        if isinstance(frame, pd.DataFrame) and index in frame.columns
    ]

    if not dataframes_to_merge:
        print(f"No dataframes with '{index}' column to merge.", file=sys.stderr)
        return pd.DataFrame(columns=[index])

    merged_gdf = _merge_on_index(dataframes_to_merge, index)
    merged_gdf.to_csv("df_export.csv", index=False)

    for key, gdf in db_gdfs.items():
        if gdf is None:
            print(f"{key} DB is None or empty or failed H3 indexing.")
        elif index in gdf.columns:
            print(f"{key} unique polygons on total rows in DB: {gdf[index].nunique()}/{len(gdf)}.")
        else:
            print(f"{key} DB has no '{index}' column for unique polygon count.")

    return merged_gdf.head()


if __name__ == "__main__":
    # Run the whole pipeline against the Colab upload directory and show the
    # preview rows that get_data() returns.
    head = get_data(data_dir="/content/")
    print("Head of the output = ", head, sep="\n")