# Notebook reviewing CHIRPS-GEFS and IMERG around cyclone landfall

In [27]:
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import os
import pandas as pd
import geopandas as gpd
from pathlib import Path
import rasterio
from datetime import datetime
from rasterstats import zonal_stats
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import re
from src.constants import *
from src.datasources import codab, imerg

In [29]:
adm1_path = (
    Path(AA_DATA_DIR)
    / "public"
    / "raw"
    / "moz"
    / "cod_ab"
    / "moz_admbnda_adm1_ine_20190607.shp"
)
gdf_adm1 = gpd.read_file(adm1_path)
total_bbox = gdf_adm1.total_bounds

In [30]:
landfall_df = pd.read_csv(
    Path(AA_DATA_DIR)
    / "public"
    / "processed"
    / "moz"
    / "landfall_time_location_fixed.csv"
)
date_cols = ["year", "month", "day"]
landfall_df[date_cols] = landfall_df[date_cols].astype(int)
landfall_df["landfall_date"] = pd.to_datetime(
    landfall_df[["year", "month", "day"]]
)

In [31]:
drive_folder = Path(AA_DATA_DIR_NEW) / "public" / "raw" / "moz" / "chirps-gefs"
output_folder = "ds-aa-moz-cyclones/raw/chirps-gefs"
STORAGE_ACCOUNT_NAME = "imb0chd0dev"
CONTAINER_NAME = "projects"
SAS_TOKEN = os.getenv("DSCI_AZ_BLOB_DEV_SAS")

In [32]:
storms = [
    storm
    for storm in os.listdir(drive_folder)
    if os.path.isdir(drive_folder / storm)
]

In [None]:
data = []
for storm_name in storms:
    print(storm_name)
    storm_path = drive_folder / storm_name

    # List release dates (subfolders within storm folder)
    release_dates = [
        release_date
        for release_date in os.listdir(storm_path)
        if os.path.isdir(storm_path / release_date)
    ]
    # Get landfall date for this storm
    landfall_row = landfall_df[landfall_df["NAME"] == storm_name]
    if landfall_row.empty:
        continue
    landfall_date = pd.to_datetime(
        landfall_row.iloc[0]["date"], format="%d/%m/%Y"
    )

    for release_date_str in release_dates:
        print(release_date_str)
        release_date = datetime.strptime(release_date_str, "%Y-%m-%d")
        release_path = storm_path / release_date_str

        # List forecast files (inside release date folder)
        forecast_files = [
            f for f in os.listdir(release_path) if f.endswith(".tif")
        ]

        for forecast_file in forecast_files:
            forecast_date_str = Path(forecast_file).stem
            print(forecast_date_str)
            match = re.search(r"data_(\d{8})_cropped", forecast_date_str)
            if match:
                forecast_date = datetime.strptime(match.group(1), "%Y%m%d")
            else:
                continue  # Skip files without a valid date

            lead_time = (forecast_date - release_date).days
            days_around_landfall_release = (release_date - landfall_date).days
            days_around_landfall_forecast = (
                forecast_date - landfall_date
            ).days

            # Read TIFF file
            tiff_path = release_path / forecast_file
            with rasterio.open(tiff_path) as src:
                array = src.read(1)  # Read first band
                transform = src.transform
                stats = zonal_stats(
                    gdf_adm1,
                    array,
                    stats=[
                        "mean",
                        "median",
                        "max",
                        "min",
                        "percentile_10",
                        "percentile_25",
                        "percentile_75",
                        "percentile_90",
                    ],
                    affine=transform,
                )

                median_value = pd.DataFrame(stats)
                admin1_name = median_value.merge(
                    gdf_adm1[["ADM1_PT", "ADM1_PCODE"]],
                    left_index=True,
                    right_index=True,
                )
                admin1_name["storm"] = storm_name
                admin1_name["release_date"] = release_date_str
                admin1_name["forecast_date"] = match.group(1)
                admin1_name["leadtime"] = lead_time
                admin1_name["days_around_landfall_release"] = (
                    days_around_landfall_release
                )
                admin1_name["days_around_landfall_forecast"] = (
                    days_around_landfall_forecast
                )

            data.extend(admin1_name.to_dict(orient="records"))

FAVIO
2007-02-17
data_20070223_cropped
data_20070219_cropped
data_20070220_cropped
data_20070222_cropped
data_20070217_cropped
data_20070225_cropped
data_20070218_cropped
data_20070224_cropped
data_20070221_cropped
2007-02-18
data_20070218_cropped
data_20070219_cropped
data_20070220_cropped
data_20070224_cropped
data_20070221_cropped
data_20070225_cropped
data_20070222_cropped
data_20070223_cropped
2007-02-19
data_20070219_cropped
data_20070221_cropped
data_20070220_cropped
data_20070222_cropped
data_20070224_cropped
data_20070225_cropped
data_20070223_cropped
2007-02-20
data_20070224_cropped
data_20070220_cropped
data_20070222_cropped
data_20070223_cropped
data_20070225_cropped
data_20070221_cropped
2007-02-21
data_20070222_cropped
data_20070224_cropped
data_20070223_cropped
data_20070225_cropped
data_20070221_cropped
2007-02-22
data_20070222_cropped
data_20070223_cropped
data_20070224_cropped
data_20070225_cropped
JOKWE
2008-03-03
data_20080310_cropped
data_20080311_cropped
data_2008

In [34]:
# Convert to DataFrame
df = pd.DataFrame(data)
df.head(13)

Unnamed: 0,min,max,mean,median,percentile_10,percentile_25,percentile_75,percentile_90,ADM1_PT,ADM1_PCODE,storm,release_date,forecast_date,leadtime,days_around_landfall_release,days_around_landfall_forecast
0,2.666953,23.110664,8.616564,7.432296,4.810526,5.936393,9.759614,15.375449,Cabo Delgado,MZ01,FAVIO,2007-02-17,20070223,6,-5,1
1,1.88276,13.682445,6.856328,6.826742,4.493655,5.520406,8.159672,9.148391,Gaza,MZ02,FAVIO,2007-02-17,20070223,6,-5,1
2,1.953866,19.771961,6.934037,5.75576,3.925403,4.541694,8.377123,12.248544,Inhambane,MZ03,FAVIO,2007-02-17,20070223,6,-5,1
3,4.826375,38.401936,18.87434,18.880661,11.243801,13.79278,22.868195,27.029818,Manica,MZ04,FAVIO,2007-02-17,20070223,6,-5,1
4,0.0,6.682936,1.320434,0.6297,0.232363,0.333623,2.046637,3.717987,Maputo,MZ05,FAVIO,2007-02-17,20070223,6,-5,1
5,0.153346,1.179141,0.520529,0.42779,0.292959,0.386663,0.5242,0.968928,Maputo City,MZ06,FAVIO,2007-02-17,20070223,6,-5,1
6,0.24528,27.181517,11.870646,11.242546,6.610762,8.725834,15.200041,17.871178,Nampula,MZ07,FAVIO,2007-02-17,20070223,6,-5,1
7,0.327455,16.6614,6.134117,4.716778,1.903681,3.239087,9.277134,12.50295,Niassa,MZ08,FAVIO,2007-02-17,20070223,6,-5,1
8,7.329636,29.266773,15.054762,14.707427,11.220572,13.154787,16.75316,19.283707,Sofala,MZ09,FAVIO,2007-02-17,20070223,6,-5,1
9,4.279633,47.143757,16.303974,15.994373,11.051746,13.295543,18.345663,20.953066,Tete,MZ10,FAVIO,2007-02-17,20070223,6,-5,1


In [35]:
df["release_date"] = pd.to_datetime(df["release_date"], format="%Y-%m-%d")
df["forecast_date"] = pd.to_datetime(df["forecast_date"], format="%Y%m%d")

In [36]:
# Convert 'lead_time' to integer, if not already
df["leadtime"] = df["leadtime"].astype(int)

# Create a custom sort order for storm names based on the original 'storms' list
storm_order = {storm_name: idx for idx, storm_name in enumerate(storms)}

# Sort the DataFrame
df_sorted = df.sort_values(
    by=["storm", "release_date", "forecast_date", "leadtime"],
    key=lambda col: col.map(storm_order) if col.name == "storm" else col,
)

In [37]:
output_drive_folder = (
    Path(AA_DATA_DIR_NEW) / "public" / "processed" / "moz" / "chirps-gefs"
)
df.to_csv(
    output_drive_folder / "chirps_gefs_storm_rainfall_forecast.csv",
    index=False,
)