From https://planetarycomputer.microsoft.com/dataset/cil-gdpcir-cc0#Ensemble-example

In [1]:
import planetary_computer
import pystac_client

import xarray as xr
import numpy as np
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm.auto import tqdm

import os
import re
import glob
import shutil
import zipfile
from pathlib import Path

import difflib
from scipy.spatial import KDTree

import matplotlib.pyplot as plt
import geopandas as gpd
import regionmask
import cartopy.crs as ccrs

from netCDF4 import Dataset

from carbonplan import styles  # noqa: F401
import intake
import cmip6_downscaling


In [16]:
catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1/",
    modifier=planetary_computer.sign_inplace,
)
collection = catalog.get_collection("cil-gdpcir-cc-by")
item = collection.get_item("cil-gdpcir-NUIST-NESM3-ssp585-r1i1p1f1-day")
item.assets
search = catalog.search(
    collections=["cil-gdpcir-cc-by"],
    query={"cmip6:source_id": {"eq": "GFDL-CM4"}, "cmip6:experiment_id": {"eq": "ssp245"}},
)
items = search.get_all_items()
len(items)

asset = item.assets["pr"]
item = items[0]
item
ds = xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
ds



Unnamed: 0,Array,Chunk
Bytes,242.48 GiB,360.90 MiB
Shape,"(31390, 720, 1440)","(365, 360, 360)"
Dask graph,688 chunks in 2 graph layers,688 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 242.48 GiB 360.90 MiB Shape (31390, 720, 1440) (365, 360, 360) Dask graph 688 chunks in 2 graph layers Data type float64 numpy.ndarray",1440  720  31390,

Unnamed: 0,Array,Chunk
Bytes,242.48 GiB,360.90 MiB
Shape,"(31390, 720, 1440)","(365, 360, 360)"
Dask graph,688 chunks in 2 graph layers,688 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


Load and organise data

In [4]:
import xarray as xr
import pandas as pd
from pystac_client import Client
from planetary_computer import sign_inplace
from tqdm import tqdm

# Open the catalog
catalog = Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1/",
    modifier=sign_inplace,
)

# Get the collections
scenarios = ["ssp585"]  # Change as needed
variable_id = "pr"  # Precipitation variable

for scenario in scenarios:
    search = catalog.search(
        collections=["cil-gdpcir-cc0", "cil-gdpcir-cc-by"],
        query={"cmip6:experiment_id": {"eq": scenario}},
    )
    ensemble = search.item_collection()
    print(f"Number of items found: {len(ensemble)}")

    # Read and process each dataset
    datasets_by_model = []
    for item in tqdm(ensemble):
        asset = item.assets[variable_id]
        datasets_by_model.append(
            xr.open_dataset(asset.href, **asset.extra_fields["xarray:open_kwargs"])
        )

    # Combine datasets by model
    all_datasets = xr.concat(
        datasets_by_model,
        dim=pd.Index([ds.attrs["source_id"] for ds in datasets_by_model], name="model"),
        combine_attrs="drop_conflicts",
    )

    # Define the spatial and temporal bounds
    lon_bounds = slice(32.67161823, 35.91841716)
    lat_bounds = slice(-17.12627881, -9.36366167)
    time_range = pd.date_range("2061-01-01", "2071-01-01", freq="Y")

    # Process each year
    output_dir = "/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data_CIL/"
    yearly_files = []
    for year in time_range.year:
        yearly_subset = all_datasets.pr.sel(
            lon=lon_bounds,
            lat=lat_bounds,
            time=slice(f"{year}-01-01", f"{year}-12-31"),
        )
        yearly_file = f"{output_dir}/CIL_subset_{scenario}_{year}.nc"
        yearly_subset.to_netcdf(yearly_file)
        yearly_files.append(yearly_file)
        print(f"Saved yearly data for {year} to {yearly_file}")

    # Combine all yearly files into one NetCDF file
    combined_output = f"{output_dir}/CIL_subsetted_all_model_{scenario}.nc"
    combined_dataset = xr.open_mfdataset(yearly_files, combine="by_coords")
    #combined_dataset.to_netcdf(combined_output)
    print(f"Saved combined dataset to {combined_output}")

Number of items found: 22


  0%|          | 0/22 [00:01<?, ?it/s]


KeyboardInterrupt: 

Find lowest, median, and highest value model across all lat/long and across all time points

# Combine with grids for facilities

In [2]:
ANC = True
Inpatient = False
multiplier = 1 # no need for multiplier
years = range(2025, 2071) # final date is 1st Jan 2100
month_lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] * len(years)
window_size = 5

if ANC:
    reporting_data = pd.read_csv(
        "/Users/rem76/Desktop/Climate_change_health/Data/monthly_reporting_ANC_by_smaller_facility_lm.csv")
elif Inpatient:
    reporting_data = pd.read_csv(
        "/Users/rem76/Desktop/Climate_change_health/Data/monthly_reporting_Inpatient_by_smaller_facility_lm.csv")
general_facilities = gpd.read_file("/Users/rem76/Desktop/Climate_change_health/Data/facilities_with_districts.shp")

facilities_with_lat_long = pd.read_csv(
    "/Users/rem76/Desktop/Climate_change_health/Data/facilities_with_lat_long_region.csv")

  facilities_with_lat_long = pd.read_csv(


In [3]:
def unzip_all_in_directory(directory):
    """
    Unzips all .zip files in the specified directory, extracting each into a separate folder.

    Parameters:
        directory (str): The path to the folder containing the .zip files.
    """
    for filename in os.listdir(directory):
        if filename.endswith('.zip'):
            file_path = os.path.join(directory, filename)
            extract_dir = os.path.join(directory, filename[:-4])
            os.makedirs(extract_dir, exist_ok=True)

            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_dir)
            except zipfile.BadZipFile:
                print(f"Skipped {filename}: not a valid zip file.")

def get_facility_lat_long(reporting_facility, facilities_df, cutoff=0.90, n_matches=3):
    """
    Function to find the closest matching facility name and return its latitude and longitude.

    Parameters:
    - reporting_facility: The facility name for which latitude and longitude are needed.
    - facilities_df : DataFrame containing facility names ('Fname') and their corresponding latitudes ('A109__Latitude') and longitudes ('A109__Longitude').
    - cutoff: The minimum similarity score for a match. Default is 0.90.
    - n_matches: The maximum number of matches to consider. Default is 3.

    Returns: match_name, lat_for_facility, long_for_facility

    """
    matching_facility_name = difflib.get_close_matches(reporting_facility, facilities_df['Fname'], n=n_matches,
                                                       cutoff=cutoff)

    if matching_facility_name:
        match_name = matching_facility_name[0]  # Access the string directly
        lat_for_facility = facilities_df.loc[facilities_df['Fname'] == match_name, "A109__Latitude"].iloc[0]
        long_for_facility = facilities_df.loc[facilities_df['Fname'] == match_name, "A109__Longitude"].iloc[0]
        return match_name, lat_for_facility, long_for_facility
    else:
        return np.nan, np.nan, np.nan

def extract_nc_files_from_unzipped_folders(directory):
    """
    Searches for .nc files in the specified directory and all its subfolders,
    and copies them to the output directory, maintaining the folder structure.

    Parameters:
        directory (str): The path to the folder containing the unzipped folders.
    """
    output_directory = os.path.join(directory, 'nc_files')
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for root, _, files in os.walk(directory):
        # Skip the output directory to prevent recursive copying
        if root == output_directory:
            continue

        for filename in files:
            if filename.endswith('.nc'):
                source_file_path = os.path.join(root, filename)
                destination_file_path = os.path.join(output_directory, filename)

                # Only copy if the file does not already exist in the output directory
                if not os.path.exists(destination_file_path):
                    shutil.copy2(source_file_path, output_directory)

In [13]:
base_dir = "/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data_CIL/"
scenarios = ["ssp245", "ssp585"]
scenarios = ["ssp126"]
scenarios = ["ssp585"]

window_size = 5

data_by_model_and_grid = {}

def calculate_cumulative_metrics(precip_data, window_size):
    """
    Calculate monthly totals and 5-day maximums for precipitation data.
    """
    # Monthly total
    monthly_total = np.sum(precip_data)

    # 5-day maximum using rolling window
    if len(precip_data) >= window_size:
        rolling_sums = np.cumsum(precip_data)
        rolling_sums[window_size:] -= rolling_sums[:-window_size]
        max_5_day = np.max(rolling_sums[window_size - 1:])
    else:
        max_5_day = np.sum(precip_data)  # Handle case where data is shorter than window size

    return monthly_total, max_5_day
for scenario in scenarios:
    print(f"Processing scenario: {scenario}")
    scenario_directory = os.path.join(base_dir, scenario)
    file_path_downscaled = f"/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data_CIL/"
    output_file = f"CIL_combined_{scenario}_2025_2070.nc"
    file_pattern = os.path.join(file_path_downscaled, "CIL_subset_ssp245_*.nc")
    data_all_models = xr.open_mfdataset(file_pattern, combine='nested', concat_dim="time")
    data_all_models.compute()
    files = sorted(Path(file_path_downscaled).glob(f"CIL_subset_{scenario}_*.nc"))

    pr_aggregated = data_all_models.mean(dim=["lat", "lon", "time"], skipna=True)
    min_model_object = pr_aggregated['pr'].idxmin(dim="model")
    min_model = min_model_object.values.item()

    overall_mean = pr_aggregated['pr'].mean(dim="model")
    abs_diff = abs(pr_aggregated['pr'] - overall_mean)
    mean_model_object = abs_diff.idxmin(dim="model")
    mean_model = mean_model_object.values.item()

    max_model_object = pr_aggregated['pr'].idxmax(dim="model")
    max_model = max_model_object.values.item()

    models_of_interest = [min_model, mean_model, max_model]
    print("Models of interest", models_of_interest)

    #Initialize cumulative storage for models of interest
    cumulative_weather_dfs = {
        model: {"monthly": pd.DataFrame(), "window": pd.DataFrame()}
        for model in models_of_interest
    }

    for file_path in files:
        print(f"Processing file: {file_path.name}")
        data_all_models = xr.open_dataset(file_path)

        for model in models_of_interest:
            if model not in data_all_models["model"].values:
                print(f"Model {model} not found in file {file_path.name}, skipping.")
                continue

            print(f"Processing model: {model}")
            data_per_model = data_all_models.sel(model=model)

            # Prepare grid data structure
            lat_data = data_per_model.variables['lat'][:]
            lon_data = data_per_model.variables['lon'][:]
            lon_grid, lat_grid = np.meshgrid(lon_data, lat_data)
            centroids = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))

            grid_precip_map = {}

            for year in np.unique(data_per_model['time.year']):
                for month in range(1, 13):  # 1 to 12 for each month
                    print(f"Processing year {year}, month {month}")

                    # Extract precipitation data for this month of this year
                    month_data = data_per_model.sel(
                        time=data_per_model.time.dt.year == year
                    ).sel(time=data_per_model.time.dt.month == month)

                    # Skip if no data
                    if month_data.time.size == 0:
                        continue

                    # Get daily precipitation values for grids
                    for grid, (i, j) in enumerate(np.ndindex(len(lat_data), len(lon_data))):
                        precip_data_for_grid = month_data.isel(lat=i, lon=j).pr.values

                        if grid not in grid_precip_map:
                            grid_precip_map[grid] = {"monthly": {}, "window": {}}

                        # Calculate metrics
                        monthly, window = calculate_cumulative_metrics(precip_data_for_grid, window_size)

                        grid_precip_map[grid]["monthly"][(year, month)] = monthly
                        grid_precip_map[grid]["window"][(year, month)] = window

            # Map facilities to grids and assign metrics
            kd_tree = KDTree(centroids)

            for reporting_facility in reporting_data.columns:
                match_name, lat, lon = get_facility_lat_long(reporting_facility, facilities_with_lat_long)
                if not np.isnan(lat) and not np.isnan(lon):
                    facility_location = np.array([[lat, lon]])
                    dist, closest_grid_index = kd_tree.query(facility_location)
                    closest_grid_index = closest_grid_index[0]

                    for (year, month), metrics in grid_precip_map[closest_grid_index]["monthly"].items():
                        cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
                    for (year, month), metrics in grid_precip_map[closest_grid_index]["window"].items():
                        cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
    model_categories = {
    min_model: "lowest",
    mean_model: "mean",
    max_model: "highest"
}

# Save cumulative results
    for model, weather_dfs in cumulative_weather_dfs.items():
        category = model_categories[model]  # Get the category for the model
        for metric_type, df in weather_dfs.items():
            # Use the category in the output file name
            output_file = Path(scenario_directory) / f"{category}_{metric_type}_prediction_weather_by_facility.csv"
            df.to_csv(output_file, index=True)
            print(f"Saved {metric_type} data for {category} model to {output_file}")



Processing scenario: ssp585
Models of interest ['HadGEM3-GC31-LL', 'UKESM1-0-LL', 'INM-CM5-0']
Processing file: CIL_subset_ssp585_2024.nc
Processing model: HadGEM3-GC31-LL
Processing year 2024, month 1
Processing year 2024, month 2
Processing year 2024, month 3
Processing year 2024, month 4
Processing year 2024, month 5
Processing year 2024, month 6
Processing year 2024, month 7
Processing year 2024, month 8
Processing year 2024, month 9
Processing year 2024, month 10
Processing year 2024, month 11
Processing year 2024, month 12


  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].lo

Processing model: UKESM1-0-LL
Processing year 2024, month 1
Processing year 2024, month 2
Processing year 2024, month 3
Processing year 2024, month 4
Processing year 2024, month 5
Processing year 2024, month 6
Processing year 2024, month 7
Processing year 2024, month 8
Processing year 2024, month 9
Processing year 2024, month 10
Processing year 2024, month 11
Processing year 2024, month 12


  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].lo

Processing model: INM-CM5-0
Processing year 2024, month 1
Processing year 2024, month 2
Processing year 2024, month 3
Processing year 2024, month 4
Processing year 2024, month 5
Processing year 2024, month 6
Processing year 2024, month 7
Processing year 2024, month 8
Processing year 2024, month 9
Processing year 2024, month 10
Processing year 2024, month 11
Processing year 2024, month 12


  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["window"].loc[f"{year}-{month}", reporting_facility] = metrics
  cumulative_weather_dfs[model]["monthly"].lo

Processing file: CIL_subset_ssp585_2025.nc
Processing model: HadGEM3-GC31-LL
Processing year 2025, month 1
Processing year 2025, month 2
Processing year 2025, month 3
Processing year 2025, month 4
Processing year 2025, month 5
Processing year 2025, month 6
Processing year 2025, month 7
Processing year 2025, month 8
Processing year 2025, month 9
Processing year 2025, month 10
Processing year 2025, month 11
Processing year 2025, month 12
Processing model: UKESM1-0-LL
Processing year 2025, month 1
Processing year 2025, month 2
Processing year 2025, month 3
Processing year 2025, month 4
Processing year 2025, month 5
Processing year 2025, month 6
Processing year 2025, month 7
Processing year 2025, month 8
Processing year 2025, month 9
Processing year 2025, month 10
Processing year 2025, month 11
Processing year 2025, month 12
Processing model: INM-CM5-0
Processing year 2025, month 1
Processing year 2025, month 2
Processing year 2025, month 3
Processing year 2025, month 4
Processing year 202