In [1]:
import os
import re
import glob
import shutil
import zipfile
from pathlib import Path
import difflib

import numpy as np
import pandas as pd
import xarray as xr
from netCDF4 import Dataset

import geopandas as gpd
import regionmask
import cartopy.crs as ccrs
from scipy.spatial import KDTree

import matplotlib.pyplot as plt

from carbonplan import styles  # noqa: F401
import intake
import cmip6_downscaling

xr.set_options(keep_attrs=True)


<xarray.core.options.set_options at 0x17f56d1d0>

In [2]:
cat = intake.open_esm_datastore(
    "https://rice1.osn.mghpcc.org/carbonplan/cp-cmip/version1/catalog/osn-rechunked-global-downscaled-cmip6.json"
)

In [3]:
cat_subset = cat.search(
    experiment_id="ssp245",
    variable_id="pr",
    timescale = 'day'
)

In [4]:
dsets = cat_subset.to_dataset_dict()
dsets


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.timescale.method'


{'ScenarioMIP.CCCma.CanESM5.ssp245.day.GARD-SV': <xarray.Dataset> Size: 129GB
 Dimensions:    (lat: 721, lon: 1440, member_id: 1, time: 31046)
 Coordinates:
   * lat        (lat) float32 3kB -90.0 -89.75 -89.5 -89.25 ... 89.5 89.75 90.0
   * lon        (lon) float32 6kB -180.0 -179.8 -179.5 ... 179.2 179.5 179.8
   * time       (time) datetime64[ns] 248kB 2015-01-01 2015-01-02 ... 2099-12-31
   * member_id  (member_id) object 8B 'r1i1p1f1'
 Data variables:
     pr         (member_id, time, lat, lon) float32 129GB dask.array<chunksize=(1, 2600, 72, 144), meta=np.ndarray>
 Attributes: (12/30)
     Conventions:                                 CF-1.8
     activity_id:                                 ScenarioMIP
     cmip6_downscaling_contact:                   hello@carbonplan.org
     cmip6_downscaling_explainer:                 https://carbonplan.org/resea...
     cmip6_downscaling_institution:               CarbonPlan
     cmip6_downscaling_license:                   CC-BY-4.0
     ... 

Load datasets into notebook - NB NorESM2 has a different algorithm

In [5]:
CanESM5_ssp245 = dsets['ScenarioMIP.CCCma.CanESM5.ssp245.day.GARD-SV']
MRI_ESM2_0_ssp245 = dsets['ScenarioMIP.MRI.MRI-ESM2-0.ssp245.day.GARD-MV']
MRI_ESM1_2_HR_ssp245 = dsets['ScenarioMIP.DKRZ.MPI-ESM1-2-HR.ssp245.day.GARD-SV']

NorESM2_LM_ssp245 = dsets['ScenarioMIP.NCC.NorESM2-LM.ssp245.day.MACA']

Subset

In [6]:
malawi_region = {'lat': slice(-17.12627881, -9.36366167), 'lon': slice(32.67161823,35.91841716)}

CanESM5_ssp245 = CanESM5_ssp245.sel(**malawi_region)
MRI_ESM2_0_ssp245 = MRI_ESM2_0_ssp245.sel(**malawi_region)
MRI_ESM1_2_HR_ssp245 = MRI_ESM1_2_HR_ssp245.sel(**malawi_region)
NorESM2_LM_ssp245 = NorESM2_LM_ssp245.sel(**malawi_region)

In [7]:
lower_date = '2025-01-01'
upper_date = '2027-01-01'
CanESM5_ssp245 = CanESM5_ssp245.sel(time = slice(lower_date, upper_date))
MRI_ESM2_0_ssp245 = MRI_ESM2_0_ssp245.sel(time = slice(lower_date, upper_date))
MRI_ESM1_2_HR_ssp245 = MRI_ESM1_2_HR_ssp245.sel(time = slice(lower_date, upper_date))
NorESM2_LM_ssp245 = NorESM2_LM_ssp245.sel(time = slice(lower_date, upper_date))


In [15]:
CanESM5_ssp245_xr = xr.Dataset(
    data_vars=dict(
        pr=(["time", "lat","lon"],np.asarray(np.squeeze(CanESM5_ssp245.pr.data))),
    ),
    coords=dict(
        time=CanESM5_ssp245.time.data,
        lat=CanESM5_ssp245.lat.data,
        lon=CanESM5_ssp245.lon.data,
    ),
    attrs=dict(description="Weather related data."),)


MRI_ESM2_0_ssp245_xr = xr.Dataset(
    data_vars=dict(
        pr=(["time", "lat","lon"],np.asarray(np.squeeze(MRI_ESM2_0_ssp245.pr.data))),
    ),
    coords=dict(
        time=MRI_ESM2_0_ssp245.time.data,
        lat=MRI_ESM2_0_ssp245.lat.data,
        lon=MRI_ESM2_0_ssp245.lon.data,
    ),
    attrs=dict(description="Weather related data."),)

MRI_ESM1_2_HR_ssp245_xr = xr.Dataset(
    data_vars=dict(
        pr=(["time", "lat","lon"],np.asarray(np.squeeze(MRI_ESM1_2_HR_ssp245.pr.data))),
    ),
    coords=dict(
        time=MRI_ESM1_2_HR_ssp245.time.data,
        lat=MRI_ESM1_2_HR_ssp245.lat.data,
        lon=MRI_ESM1_2_HR_ssp245.lon.data,
    ),
    attrs=dict(description="Weather related data."),)

NorESM2_LM_ssp245_xr = xr.Dataset(
    data_vars=dict(
        pr=(["time", "lat","lon"],np.asarray(np.squeeze(NorESM2_LM_ssp245.pr.data))),
    ),
    coords=dict(
        time=NorESM2_LM_ssp245.time.data,
        lat=NorESM2_LM_ssp245.lat.data,
        lon=NorESM2_LM_ssp245.lon.data,
    ),
    attrs=dict(description="Weather related data."),
)




Save region

In [18]:
CanESM5_ssp245_xr.to_netcdf("/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data/ssp2_4_5/CanESM5_ssp245.nc")

MRI_ESM2_0_ssp245_xr.to_netcdf("/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data/ssp2_4_5/MRI_ESM2_0_ssp245.nc")


MRI_ESM1_2_HR_ssp245_xr.to_netcdf("/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data/ssp2_4_5/MRI_ESM1_2_HR_ssp245.nc")

NorESM2_LM_ssp245_xr.to_netcdf("/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data/ssp2_4_5/NorESM2_LM_ssp245.nc")


In [18]:
data_per_model  = xr.open_dataset(file)
data_per_model

# NOW MATCH TO CLINICS

In [19]:
ANC = True
Inpatient = False
monthly_cumulative = False
multiplier = 86400
years = range(2025, 2027)
month_lengths = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] * len(years)
if monthly_cumulative:
    window_size = np.nan
else:
    window_size = 5

if ANC:
    reporting_data = pd.read_csv(
        "/Users/rem76/Desktop/Climate_change_health/Data/monthly_reporting_ANC_by_smaller_facility_lm.csv")
elif Inpatient:
    reporting_data = pd.read_csv(
        "/Users/rem76/Desktop/Climate_change_health/Data/monthly_reporting_Inpatient_by_smaller_facility_lm.csv")
general_facilities = gpd.read_file("/Users/rem76/Desktop/Climate_change_health/Data/facilities_with_districts.shp")

facilities_with_lat_long = pd.read_csv(
    "/Users/rem76/Desktop/Climate_change_health/Data/facilities_with_lat_long_region.csv")


  facilities_with_lat_long = pd.read_csv(


In [13]:
def unzip_all_in_directory(directory):
    """
    Unzips all .zip files in the specified directory, extracting each into a separate folder.

    Parameters:
        directory (str): The path to the folder containing the .zip files.
    """
    for filename in os.listdir(directory):
        if filename.endswith('.zip'):
            file_path = os.path.join(directory, filename)
            extract_dir = os.path.join(directory, filename[:-4])
            os.makedirs(extract_dir, exist_ok=True)

            try:
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(extract_dir)
            except zipfile.BadZipFile:
                print(f"Skipped {filename}: not a valid zip file.")

def get_facility_lat_long(reporting_facility, facilities_df, cutoff=0.90, n_matches=3):
    """
    Function to find the closest matching facility name and return its latitude and longitude.

    Parameters:
    - reporting_facility: The facility name for which latitude and longitude are needed.
    - facilities_df : DataFrame containing facility names ('Fname') and their corresponding latitudes ('A109__Latitude') and longitudes ('A109__Longitude').
    - cutoff: The minimum similarity score for a match. Default is 0.90.
    - n_matches: The maximum number of matches to consider. Default is 3.

    Returns: match_name, lat_for_facility, long_for_facility

    """
    matching_facility_name = difflib.get_close_matches(reporting_facility, facilities_df['Fname'], n=n_matches,
                                                       cutoff=cutoff)

    if matching_facility_name:
        match_name = matching_facility_name[0]  # Access the string directly
        lat_for_facility = facilities_df.loc[facilities_df['Fname'] == match_name, "A109__Latitude"].iloc[0]
        long_for_facility = facilities_df.loc[facilities_df['Fname'] == match_name, "A109__Longitude"].iloc[0]
        return match_name, lat_for_facility, long_for_facility
    else:
        return np.nan, np.nan, np.nan

def extract_nc_files_from_unzipped_folders(directory):
    """
    Searches for .nc files in the specified directory and all its subfolders,
    and copies them to the output directory, maintaining the folder structure.

    Parameters:
        directory (str): The path to the folder containing the unzipped folders.
    """
    output_directory = os.path.join(directory, 'nc_files')
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    for root, _, files in os.walk(directory):
        # Skip the output directory to prevent recursive copying
        if root == output_directory:
            continue

        for filename in files:
            if filename.endswith('.nc'):
                source_file_path = os.path.join(root, filename)
                destination_file_path = os.path.join(output_directory, filename)

                # Only copy if the file does not already exist in the output directory
                if not os.path.exists(destination_file_path):
                    shutil.copy2(source_file_path, output_directory)

In [16]:
precip_data_for_grid.pr.data

array([[ 2.4092836,  8.715088 ,  9.038933 , ...,  6.459653 , 11.19525  ,
        19.130037 ]], dtype=float32)

In [21]:
base_dir = "/Users/rem76/Desktop/Climate_change_health/Data/Precipitation_data/Downscaled_CMIP6_data/"
nc_file_directory = os.path.join(base_dir, 'nc_files')
# NB these are daily 
scenarios = [ "ssp2_4_5"] # don't have ssp19 scenario
years = range(2025, 2027)
year_lengths = [365, 366, 365, 366] * int(len(years)/4)

data_by_model_and_grid = {}
for scenario in scenarios:
    print(scenario)
    scenario_directory = os.path.join(base_dir, scenario)
    grid_centroids = {}
    
    cumulative_sum_by_models = {}
    for file in glob.glob(os.path.join(scenario_directory, "*.nc")):
        model = re.search(r'.*/(.*?)_ssp\d+', file).group(1)
        data_per_model  = xr.open_dataset(file)
        pr_data = data_per_model.variables['pr'][:]  # in kg m-2 s-1 = mm s-1 x 86400 to get to day
        lat_data = data_per_model.variables['lat'][:]
        lon_data = data_per_model.variables['lon'][:]
        lon_grid, lat_grid = np.meshgrid(lon_data, lat_data)
        centroids = np.column_stack((lat_grid.ravel(), lon_grid.ravel()))

        # Store centroids
        grid_centroids[model] = centroids
        grid_dictionary = {}
        grid = 0
        for i in lat_data:
            for j in lon_data:
                precip_data_for_grid = data_per_model.sel(lat = i, lon = j, method= "nearest") # across all time points
                grid_dictionary[grid] = precip_data_for_grid.pr.data
                grid += 1
        data_by_model_and_grid[model] = grid_dictionary

        pr_data_avg_area_model = pr_data.mean(dim=['lat', 'lon'])
        cumulative_sum_window_for_model = []
        begin_day = 0
        for year_idx, year_length in enumerate(year_lengths):
            days_for_grid = pr_data_avg_area_model[begin_day:begin_day + year_length]
            cumulative_sums = sum(days_for_grid)
            if isinstance(cumulative_sums, int):
                cumulative_sum_window_for_model.append(cumulative_sums)
            else:
                cumulative_sum_window_for_model.append(cumulative_sums.values)
            begin_day += year_length
        cumulative_sum_by_models[model] = np.mean(cumulative_sum_window_for_model)
    highest_model = max(cumulative_sum_by_models, key=lambda k: cumulative_sum_by_models[k])
    lowest_model = min(cumulative_sum_by_models, key=lambda k: cumulative_sum_by_models[k])
    sorted_models = sorted(cumulative_sum_by_models, key=lambda k: cumulative_sum_by_models[k])
    median_index = len(sorted_models) // 2
    if len(sorted_models) % 2 == 0:
        median_index -= 1
    median_model = sorted_models[median_index]
    models_of_interest = [lowest_model, median_model, highest_model]
    print("Models of interest", models_of_interest)
    
    facilities_with_location = []
    # see which facilities have reporting data and data on latitude and longitude
    median_model_by_facility_window = {}
    lowest_model_by_facility_window = {}
    highest_model_by_facility_window = {}
    median_model_by_facility_monthly = {}
    lowest_model_by_facility_monthly = {}
    highest_model_by_facility_monthly = {}
    cumulative_sum_window = {}
    cumulative_sum_monthly = {}
    for reporting_facility in reporting_data.columns:
            grid_precipitation_for_facility = {}
            match_name, lat_for_facility, long_for_facility = get_facility_lat_long(reporting_facility, facilities_with_lat_long)
            if not np.isnan(long_for_facility) and not np.isnan(lat_for_facility):
                    facility_location = np.array([lat_for_facility, long_for_facility])
                    kd_trees_by_model = {}
                    for model in models_of_interest:
                            centroids = grid_centroids[model]
                            kd_tree = KDTree(centroids)
                            distance, closest_grid_index = kd_tree.query(facility_location)
                            grid_precipitation_for_facility[model] = data_by_model_and_grid[model][closest_grid_index].data
                            cumulative_sum_monthly[reporting_facility] = []
                            cumulative_sum_window[reporting_facility] = []
                            begin_day = 0
                            for month_idx, month_length in enumerate(month_lengths):
                                days_for_grid_monthly = grid_precipitation_for_facility[model][begin_day:begin_day + month_length]
                                cumulative_sums_monthly = [
                                        sum(grid_precipitation_for_facility[model][begin_day:begin_day + month_length])
                                    ]
                                max_cumulative_sums_monthly = max(cumulative_sums_monthly)
                                cumulative_sum_monthly[reporting_facility].append(max_cumulative_sums_monthly)
                                begin_day += month_length
                            if model == models_of_interest[0]:
                                lowest_model_by_facility_monthly[reporting_facility] = cumulative_sum_monthly
                            if model == models_of_interest[1]:
                                median_model_by_facility_monthly[reporting_facility] = cumulative_sum_monthly
                            else:
                                 highest_model_by_facility_monthly[reporting_facility] = cumulative_sum_monthly

                                
                            begin_day = 0
                            for month_idx, month_length in enumerate(month_lengths):
                                days_for_grid_window = grid_precipitation_for_facility[model][begin_day:begin_day + month_length]

                                cumulative_sums_window = [
                                    sum(days_for_grid_window[day:day + window_size])
                                    for day in range(month_length - window_size + 1)
                                ]

                                max_cumulative_sums_window = max(cumulative_sums_window)   
                                cumulative_sum_window[reporting_facility].append(max_cumulative_sums_window)
                                begin_day += month_length
                            if model == models_of_interest[0]:
                                lowest_model_by_facility_window[reporting_facility] = cumulative_sum_window
                            if model == models_of_interest[1]:
                                median_model_by_facility_window[reporting_facility] = cumulative_sum_window
                            else:
                                 highest_model_by_facility_window[reporting_facility] = cumulative_sum_window
    weather_df_lowest_window = pd.DataFrame.from_dict(lowest_model_by_facility_window, orient='index').T
    weather_df_median_window = pd.DataFrame.from_dict(median_model_by_facility_window, orient='index').T
    weather_df_highest_window = pd.DataFrame.from_dict(highest_model_by_facility_window, orient='index').T
    
    weather_df_lowest_monthly = pd.DataFrame.from_dict(lowest_model_by_facility_monthly, orient='index').T
    weather_df_median_monthly = pd.DataFrame.from_dict(median_model_by_facility_monthly, orient='index').T
    weather_df_highest_monthly = pd.DataFrame.from_dict(highest_model_by_facility_monthly, orient='index').T
    # 
    # if ANC:
    #     weather_df_lowest_window.to_csv(Path(scenario_directory) / "median_model_daily_prediction_weather_by_facility_KDBall_ANC.csv", index=False)
    #     weather_df_median_window.to_csv(Path(scenario_directory) / "lowest_model_daily_prediction_weather_by_facility_KDBall_ANC.csv", index=False)
    #     weather_df_highest_window.to_csv(Path(scenario_directory) / "highest_model_daily_prediction_weather_by_facility_KDBall_ANC.csv", index=False)
    #     
    #     weather_df_lowest_monthly.to_csv(Path(scenario_directory) / "median_model_monthly_prediction_weather_by_facility_KDBall_ANC.csv", index=False)
    #     weather_df_median_monthly.to_csv(Path(scenario_directory) / "lowest_model_monthly_prediction_weather_by_facility_KDBall_ANC.csv", index=False)
    #     weather_df_highest_monthly.to_csv(Path(scenario_directory) / "highest_model_monthly_prediction_weather_by_facility_KDBall_ANC.csv", index=False)

ssp2_4_5


  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,
  return _methods._mean(a, axis=axis, dtype=dtype,


Models of interest ['MRI_ESM2_0', 'NorESM2_LM', 'MRI_ESM2_0']


In [14]:
print(len(data_by_model_and_grid[model][closest_grid_index][0]))

27393


In [23]:
highest_model_by_facility_monthly