In [None]:
# Load the jupyter_black extension (Black-based formatting of code cells).
%load_ext jupyter_black
# Load autoreload and set mode 2 so imported modules are reloaded before each
# cell executes; edits to local modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import tqdm
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import geopandas as gpd
from datetime import datetime
import rioxarray as rxr
from azure.storage.blob import ContainerClient
import xarray as xr
import matplotlib.pyplot as plt

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Without this check a missing variable only surfaces later as an opaque
    ``TypeError`` (from ``Path(None)`` or ``str + None``) that does not say
    which variable is absent.

    Raises:
        RuntimeError: if ``name`` is not defined in the environment.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name} is not set; "
            "define it in the shell or in the .env file."
        )
    return value


# Province names to select; matched against the ADM1_PT column of the
# boundary file further down.
ADMS = ["Sofala", "Inhambane", "Nampula", "Zambezia"]
AA_DATA_DIR = Path(_require_env("AA_DATA_DIR"))
AA_DATA_DIR_NEW = Path(_require_env("AA_DATA_DIR_NEW"))

# Azure blob storage: the SAS token is appended to each URL as its query string.
DEV_BLOB_SAS = _require_env("DSCI_AZ_SAS_DEV")
DEV_BLOB_NAME = "imb0chd0dev"
DEV_BLOB_URL = f"https://{DEV_BLOB_NAME}.blob.core.windows.net/"
DEV_BLOB_PROJ_URL = DEV_BLOB_URL + "projects" + "?" + DEV_BLOB_SAS
GLOBAL_CONTAINER_NAME = "global"
DEV_BLOB_GLB_URL = DEV_BLOB_URL + GLOBAL_CONTAINER_NAME + "?" + DEV_BLOB_SAS

# Client for the "global" container, used below to list the IMERG blobs.
dev_glb_container_client = ContainerClient.from_container_url(DEV_BLOB_GLB_URL)

In [None]:
# Location of the admin-1 boundary shapefile under the shared data directory.
adm1_path = AA_DATA_DIR.joinpath(
    "public", "raw", "moz", "cod_ab", "moz_admbnda_adm1_ine_20190607.shp"
)
gdf_adm1 = gpd.read_file(adm1_path)

# Keep only the rows whose ADM1_PT value is one of the names in ADMS.
is_selected = gdf_adm1["ADM1_PT"].isin(ADMS)
gdf_sel = gdf_adm1[is_selected]

# Bounding box of the selected rows; used later to subset each raster.
minx, miny, maxx, maxy = gdf_sel.total_bounds

In [None]:
# Names of every blob in the container whose path starts with "imerg/v6/".
imerg_blobs = dev_glb_container_client.list_blobs(name_starts_with="imerg/v6/")
blob_names = [blob.name for blob in imerg_blobs]
# Second name bound to the same list object.
existing_files = blob_names

In [None]:
das = []
# Every blob is read from the same container, so the URL prefix is loop-invariant.
cog_url_prefix = f"https://{DEV_BLOB_NAME}.blob.core.windows.net/global/"

for blob_name in tqdm.tqdm(blob_names):
    cog_url = f"{cog_url_prefix}{blob_name}?{DEV_BLOB_SAS}"

    # Open the raster and keep only the bounding box of the selected provinces.
    # NOTE(review): label-based slice(miny, maxy) only selects data when the y
    # coordinate is ascending; confirm against the stored rasters.
    da_in = rxr.open_rasterio(cog_url, masked=True).sel(
        x=slice(minx, maxx), y=slice(miny, maxy)
    )

    # The 10 characters before the first "." in the blob name are parsed as the date.
    date_in = pd.to_datetime(blob_name.split(".")[0][-10:])
    da_in = da_in.assign_coords(date=date_in)

    # Persisting to reduce the number of downstream Dask layers
    das.append(da_in.persist())

In [None]:
# Stack the per-file rasters along a "date" dimension (each element of `das`
# carries a scalar "date" coordinate). join="override" reuses the first
# object's indexes for the other dimensions instead of aligning them, which
# assumes every raster shares the same grid (TODO confirm).
# combine_attrs="drop" discards the per-file attributes on the result.
ds = xr.concat(das, dim="date", join="override", combine_attrs="drop")

In [None]:
# Restrict the stacked rasters to the selected province geometries.
# First record the CRS (EPSG:4326) and name the spatial dimensions so the
# rioxarray accessor can interpret the array.
ds = ds.rio.write_crs(4326).rio.set_spatial_dims(x_dim="x", y_dim="y")

# Mask everything outside the selected geometries.
ds_clip = ds.rio.clip(gdf_sel.geometry)

In [None]:
results = []

# Aggregate precipitation per province and per day.
#
# The clip mask for a province does not depend on the date, so each province
# is clipped once across the whole stack and reduced over x/y for all dates in
# one pass, instead of re-clipping for every (day, province) pair.
#
# NOTE(review): `ds_clip` was already clipped to the selected provinces
# without `all_touched`, so `all_touched=True` below cannot bring back pixels
# that the earlier clip masked out along the outer border. Confirm which
# behaviour is intended.
dates = ds_clip.date.values

province_totals = []
for _, row in gdf_sel.iterrows():
    da_province = ds_clip.rio.clip([row["geometry"]], all_touched=True)

    # Spatial sum for every date at once. "date" is moved to the front so
    # column 0 of the remaining dimension is the same element the per-day
    # `.values[0]` used to pick.
    totals = da_province.sum(dim=["x", "y"]).transpose("date", ...).values
    province_totals.append((row["ADM1_PT"], totals[:, 0]))

# Emit records date-major / province-minor, i.e. in the same order as the
# original nested loops.
for i_day, day in enumerate(dates):
    for admin_name, totals in province_totals:
        results.append(
            {
                "ADM1": admin_name,
                "date": pd.to_datetime(day),
                # int() truncates the fractional part of the sum, as before.
                "total_precipitation": int(totals[i_day]),
            }
        )

In [None]:
# One row per (date, province) record collected in `results`.
df_precipitation = pd.DataFrame(results)

In [None]:
# Sanity check: one line per province showing its total over time.
fig, ax = plt.subplots(figsize=(12, 6))

# One series per ADM1 value.
for adm1, group in df_precipitation.groupby("ADM1"):
    ax.plot(group["date"], group["total_precipitation"], label=adm1)

ax.set_xlabel("Date")
ax.set_ylabel("Precipitation")
ax.set_title("Total daily precipitation per Province in Mozambique")
ax.legend(title="ADM1")
fig.tight_layout()
ax.grid(False)
plt.show()

In [None]:
# As a further check, plot the clipped rasters for some specific dates.
# Observationally, these maps are consistent with the aggregated totals plotted above.
# ds_clip.plot(x="x", y="y", col="date", col_wrap=5)

In [None]:
# Write the per-province daily totals to the processed-data folder.
output_path = (
    AA_DATA_DIR / "public" / "processed" / "moz" / "daily_imerg_precip_adm1_sel.csv"
)
df_precipitation.to_csv(output_path)