# Notebook for processing IMERG Rainfall data in Blob

This notebook processes the IMERG rainfall data stored on Blob for each cyclone since Favio. For every landfall it extracts the daily median on-land rainfall from 2 days before to 5 days after landfall, within buffers of 50 km to 500 km (in 50 km steps) around the landfall location.

In [54]:
# jupyter_black formats code cells; autoreload mode 2 reloads imported
# modules before each cell execution so edits to local modules are picked up.
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [55]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import geopandas as gpd
import rioxarray as rxr
from azure.storage.blob import ContainerClient
import numpy as np
import warnings
from shapely.geometry import Point
from rasterio.features import geometry_mask
from scipy.interpolate import interp1d
from pyproj import CRS

# Load environment variables (data directories, SAS token) via python-dotenv.
load_dotenv()

# ADM1_PT values of the provinces the analysis is restricted to.
ADMS = ["Sofala", "Inhambane", "Nampula", "Zambezia"]

# Local data roots; both environment variables must be set.
AA_DATA_DIR = Path(os.environ.get("AA_DATA_DIR"))
AA_DATA_DIR_NEW = Path(os.environ.get("AA_DATA_DIR_NEW"))

# Azure Blob storage account and the SAS token used to authenticate.
DEV_BLOB_SAS = os.environ.get("DSCI_AZ_SAS_DEV")
DEV_BLOB_NAME = "imb0chd0dev"
DEV_BLOB_URL = "https://" + DEV_BLOB_NAME + ".blob.core.windows.net/"
GLOBAL_CONTAINER_NAME = "global"

# Container URLs are "<account url><container>?<sas token>".
DEV_BLOB_PROJ_URL = "?".join([DEV_BLOB_URL + "projects", DEV_BLOB_SAS])
DEV_BLOB_GLB_URL = "?".join(
    [DEV_BLOB_URL + GLOBAL_CONTAINER_NAME, DEV_BLOB_SAS]
)

# Client for the "global" container.
dev_glb_container_client = ContainerClient.from_container_url(DEV_BLOB_GLB_URL)

In [56]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings raised by pandas,
# geopandas or rasterio in later cells are hidden as well.
warnings.filterwarnings("ignore")

In [57]:
# Folder with the Mozambique COD-AB administrative boundary shapefiles.
cod_ab_dir = AA_DATA_DIR / "public" / "raw" / "moz" / "cod_ab"
adm1_path = cod_ab_dir / "moz_admbnda_adm1_ine_20190607.shp"
adm2_path = cod_ab_dir / "moz_admbnda_adm2_ine_20190607.shp"

# IBTrACS track points shapefile (read in a later cell).
ibtracs_path = (
    AA_DATA_DIR
    / "public"
    / "raw"
    / "glb"
    / "ibtracs"
    / "IBTrACS.SI.list.v04r01.points/IBTrACS.SI.list.v04r01.points.shp"
)

# Admin level 1 boundaries, full set and the provinces listed in ADMS.
gdf_adm1 = gpd.read_file(adm1_path)
gdf_sel = gdf_adm1.loc[gdf_adm1["ADM1_PT"].isin(ADMS)]

# Admin level 2 boundaries restricted to the same provinces.
gdf_adm2 = gpd.read_file(adm2_path)
gdf_sel_adm2 = gdf_adm2.loc[gdf_adm2["ADM1_PT"].isin(ADMS)]

# Bounding box of the selected provinces.
minx, miny, maxx, maxy = gdf_sel.total_bounds

In [58]:
# Disabled: would list the names of all blobs under "imerg/v6/" in the global
# container. The result is not referenced by the other cells shown here.
# blob_names = existing_files = [
#    x.name for x in dev_glb_container_client.list_blobs(name_starts_with="imerg/v6/")
# ]

In [59]:
# Read the IBTrACS track points shapefile into a GeoDataFrame.
gdf_ibtracs = gpd.read_file(ibtracs_path)

In [60]:
# Selected provinces buffered by 250 / 111 degrees.
# NOTE(review): this buffered geometry is not used by the join below, which
# runs against the unbuffered provinces in gdf_sel.
selected_adm1 = gdf_adm1[gdf_adm1.ADM1_PT.isin(ADMS)]
gdf_adm1_sel_buff = selected_adm1.buffer(250 / 111)

# Keep only track points with ISO_TIME on or after 2003-03-11.
on_or_after_start = gdf_ibtracs["ISO_TIME"] >= "2003-03-11"
gdf_ibtracs_time = gdf_ibtracs[on_or_after_start]

# Names of the storms with at least one track point intersecting a selected
# province.
points_over_sel = gpd.sjoin(
    gdf_ibtracs_time, gdf_sel, how="inner", predicate="intersects"
)
landfall_cyclones = points_over_sel["NAME"].unique()

In [61]:
# Display the selected storm names.
landfall_cyclones

array(['FAVIO', 'JOKWE', 'IZILDA', 'DANDO', 'IRINA', 'HARUNA', 'DELIWE',
       'GUITO', 'HELLEN', 'CHEDZA', 'DINEO', 'DESMOND', 'IDAI', 'KENNETH',
       'CHALANE', 'ELOISE', 'GUAMBE', 'ANA', 'GOMBE', 'JASMINE', 'FREDDY',
       'FILIPO'], dtype=object)

In [115]:
# Accumulators filled by the per-cyclone loop: one rainfall frame per
# landfall, and the landfall rows of each track.
combined_df, landfall_locs = [], []

# Buffer radii: 50 to 500 in steps of 50.
radii = [50 * step for step in range(1, 11)]
radii

[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

In [116]:
# Divisor used to turn a buffer radius in km into degrees.
# NOTE(review): buffering in degrees is only approximate, because a degree of
# longitude is shorter than a degree of latitude away from the equator.
KM_PER_DEGREE = 110.574


def _densify_track(track, freq="30min"):
    """Return ``track`` resampled onto a regular ``freq`` time grid.

    Numeric columns with at least two known values are linearly interpolated
    in time (extrapolating where necessary); all other columns are
    forward-filled. ``ISO_TIME`` is returned as a datetime column.
    """
    # Work on a copy so the shared IBTrACS frame is never modified.
    track = track.copy()
    track["ISO_TIME"] = pd.to_datetime(track["ISO_TIME"])
    track = track.set_index("ISO_TIME").resample(freq).asfreq()

    timeline = track.index.astype(np.int64)
    for column in track.columns:
        if pd.api.types.is_numeric_dtype(track[column]):
            known = track[column].dropna()
            if len(known) > 1:
                interp_func = interp1d(
                    known.index.astype(np.int64),
                    known.values,
                    kind="linear",
                    fill_value="extrapolate",
                )
                track[column] = interp_func(timeline)
        else:
            track[column] = track[column].ffill()

    return track.reset_index()


def _flag_land_points(track, land):
    """Rebuild point geometries from LON/LAT and flag points within ``land``.

    Adds the columns ``date``, ``ADM1_PT``, ``actual_within_land`` and
    ``point_location`` ("Within" / "Outside") and returns a GeoDataFrame.
    """
    track["geometry"] = track.apply(
        lambda row: Point(row["LON"], row["LAT"]), axis=1
    )
    track = gpd.GeoDataFrame(track, geometry="geometry")
    track["date"] = pd.to_datetime(track["ISO_TIME"]).dt.date

    joined = gpd.sjoin(track, land, how="left", predicate="within")
    track["ADM1_PT"] = joined["ADM1_PT"]
    track["actual_within_land"] = joined["index_right"].notna()
    track["point_location"] = np.where(
        track["actual_within_land"], "Within", "Outside"
    )
    return track


def _find_entry_times(track, lookback=4):
    """Return the times at which the track moves from outside to within land.

    A crossing only counts when longitude did not increase between any two
    consecutive points among the ``lookback`` points preceding it (the storm
    was heading west). If all crossings fall on the same calendar date, only
    the first one is kept.
    """
    on_land = track["actual_within_land"].to_numpy()
    lon = track["LON"].to_numpy()

    crossings = []
    for i in range(lookback, len(track)):
        if on_land[i] and not on_land[i - 1]:
            window = lon[i - lookback : i]
            heading_west = not np.any(window[:-1] < window[1:])
            if heading_west:
                crossings.append(track["ISO_TIME"].iloc[i])

    entry_times = pd.to_datetime(crossings)
    entry_dates = pd.Series(entry_times).dt.date.unique().tolist()
    if len(entry_dates) == 1:
        entry_times = [entry_times[0]]
    return entry_times


def _on_land_buffers(point, radii_km, land):
    """Map each radius to the part of its buffer around ``point`` inside ``land``.

    The value is ``None`` when the buffer does not overlap ``land`` at all.
    """
    buffers = {}
    for radius in radii_km:
        gdf_buffer = gpd.GeoDataFrame(
            {"geometry": [point]}, crs=CRS("EPSG:4326")
        )
        gdf_buffer["geometry"] = gdf_buffer.buffer(radius / KM_PER_DEGREE)
        gdf_buffer = gpd.overlay(gdf_buffer, land, how="intersection")
        buffers[radius] = (
            gdf_buffer.unary_union if gdf_buffer.shape[0] > 0 else None
        )
    return buffers


def _open_daily_imerg(target_date):
    """Open the daily IMERG raster for ``target_date``; ``None`` if unavailable."""
    blob_name = (
        f"imerg/v6/imerg-daily-late-{target_date.strftime('%Y-%m-%d')}.tif"
    )
    cog_url = (
        f"{DEV_BLOB_URL}{GLOBAL_CONTAINER_NAME}/{blob_name}?{DEV_BLOB_SAS}"
    )
    try:
        da = rxr.open_rasterio(cog_url, masked=True)
        da = da.persist()
        if da.rio.crs is None:
            da.rio.write_crs("EPSG:4326", inplace=True)
    except Exception as err:
        # Best effort: a missing or unreadable raster skips that day. Only the
        # blob name and exception type are reported because the URL (and
        # possibly the error message) carries the SAS token.
        print(f"  skipped {blob_name}: {type(err).__name__}")
        return None
    return da


def _median_in_polygon(da, polygon):
    """Median of the non-NaN cells of ``da`` inside ``polygon`` (NaN if none)."""
    if polygon is None:
        return np.nan
    mask = geometry_mask(
        [polygon],
        transform=da.rio.transform(),
        invert=True,
        out_shape=da.rio.shape,
    )
    masked_values = da.where(mask).values
    values = masked_values[~np.isnan(masked_values)]
    return np.median(values) if values.size > 0 else np.nan


# Start from empty accumulators so re-running this cell does not duplicate rows.
combined_df = []
landfall_locs = []

for cyc in landfall_cyclones:
    # NOTE(review): tracks are selected by NAME only; if a name were reused by
    # another storm in the period, both tracks would be merged here. Confirm,
    # or select on a unique storm identifier instead.
    cyc_df = _densify_track(gdf_ibtracs_time[gdf_ibtracs_time["NAME"] == cyc])
    cyc_df = _flag_land_points(cyc_df, gdf_adm1)

    # Storms whose resampled track never falls within land are skipped.
    if not cyc_df["actual_within_land"].any():
        continue

    entry_times = _find_entry_times(cyc_df)
    cyc_df.loc[cyc_df["ISO_TIME"].isin(entry_times), "point_location"] = (
        "Landfall"
    )
    landfall_locs.append(cyc_df[cyc_df["point_location"] == "Landfall"])
    print(cyc)

    for landfall in entry_times:
        # Use the location of THIS landfall; a storm may make landfall more
        # than once at different places.
        lf_row = cyc_df.loc[cyc_df["ISO_TIME"] == landfall].iloc[0]
        landfall_point = Point(lf_row["LON"], lf_row["LAT"])

        # The on-land buffers depend only on the landfall point and radius,
        # so they are computed once rather than once per day.
        buffers = _on_land_buffers(landfall_point, radii, gdf_sel)

        storm_df_list = []
        for time_step in range(-2, 6):
            date = landfall + pd.Timedelta(days=time_step)
            da_in = _open_daily_imerg(date.normalize())
            if da_in is None:
                continue

            for radius in radii:
                storm_df_list.append(
                    {
                        "storm": cyc,
                        "date": date,
                        "time_step": time_step,
                        "radius": radius,
                        "median_precip": _median_in_polygon(
                            da_in, buffers[radius]
                        ),
                    }
                )

        combined_df.append(pd.DataFrame(storm_df_list))

# Combine all DataFrames into a single DataFrame
rain_df = pd.concat(combined_df, ignore_index=True)
landfall_df = pd.concat(landfall_locs, ignore_index=True)

FAVIO
JOKWE
IZILDA
DANDO
IRINA
HARUNA
DELIWE
GUITO
HELLEN
CHEDZA
DINEO
DESMOND
IDAI
KENNETH
CHALANE
ELOISE
GUAMBE
ANA
GOMBE
JASMINE
FREDDY
FILIPO


In [118]:
# Inspect the rows computed for cyclone KENNETH.
rain_df.loc[rain_df["storm"].eq("KENNETH")]

Unnamed: 0,storm,date,time_step,radius,median_precip
960,KENNETH,2019-04-23 15:00:00,-2,50,
961,KENNETH,2019-04-23 15:00:00,-2,100,
962,KENNETH,2019-04-23 15:00:00,-2,150,0.000912
963,KENNETH,2019-04-23 15:00:00,-2,200,0.005736
964,KENNETH,2019-04-23 15:00:00,-2,250,0.001962
...,...,...,...,...,...
1035,KENNETH,2019-04-30 15:00:00,5,300,5.049721
1036,KENNETH,2019-04-30 15:00:00,5,350,5.077960
1037,KENNETH,2019-04-30 15:00:00,5,400,4.585343
1038,KENNETH,2019-04-30 15:00:00,5,450,4.242187


In [119]:
# Both outputs are written to the processed Mozambique folder.
processed_moz_dir = AA_DATA_DIR / "public" / "processed" / "moz"
rain_csv_path = processed_moz_dir / "daily_imerg_cyclone_landfall_fixed.csv"
landfall_csv_path = processed_moz_dir / "landfall_time_location_fixed.csv"

rain_df.to_csv(rain_csv_path)
landfall_df.to_csv(landfall_csv_path)