In [2]:
%load_ext jupyter_black
%load_ext autoreload
%autoreload 2

In [172]:
import os
import pandas as pd
from dotenv import load_dotenv
from pathlib import Path
import geopandas as gpd
from azure.storage.blob import ContainerClient
import matplotlib.pyplot as plt
import warnings
import plotly.express as px

# Raised as part of interpolating with Pandas
warnings.simplefilter(action="ignore", category=FutureWarning)

load_dotenv()

ADMS = ["Sofala", "Inhambane", "Nampula", "Zambezia"]
THRESHOLD_SPEED_OPT1 = 89  # severe topical storm
THRESHOLD_SPEED_OPT2 = 118  # typhoon

AA_DATA_DIR = Path(os.getenv("AA_DATA_DIR"))
AA_DATA_DIR_NEW = Path(os.getenv("AA_DATA_DIR_NEW"))

DEV_BLOB_SAS = os.getenv("DSCI_AZ_SAS_DEV")
DEV_BLOB_NAME = "imb0chd0dev"
DEV_BLOB_URL = f"https://{DEV_BLOB_NAME}.blob.core.windows.net/"
DEV_BLOB_PROJ_URL = DEV_BLOB_URL + "projects" + "?" + DEV_BLOB_SAS
GLOBAL_CONTAINER_NAME = "global"
DEV_BLOB_GLB_URL = DEV_BLOB_URL + GLOBAL_CONTAINER_NAME + "?" + DEV_BLOB_SAS

dev_glb_container_client = ContainerClient.from_container_url(DEV_BLOB_GLB_URL)

## 1. Loading in raw data

In [4]:
emdat_path = (
    AA_DATA_DIR_NEW
    / "private"
    / "processed"
    / "glb"
    / "emdat"
    / "emdat-tropicalcyclone-2000-2022-processed-sids.csv"
)

ibtracs_path = (
    Path(AA_DATA_DIR)
    / "public"
    / "raw"
    / "glb"
    / "ibtracs"
    / "IBTrACS.SI.list.v04r01.points/IBTrACS.SI.list.v04r01.points.shp"
)

adm1_path = (
    AA_DATA_DIR
    / "public"
    / "raw"
    / "moz"
    / "cod_ab"
    / "moz_admbnda_adm1_ine_20190607.shp"
)

In [138]:
df_emdat = pd.read_csv(emdat_path)
gdf_adm1 = gpd.read_file(adm1_path)
gdf_ibtracs = gpd.read_file(ibtracs_path)

gdf_adm1_sel = gdf_adm1[gdf_adm1.ADM1_PT.isin(ADMS)]
df_emdat = df_emdat[df_emdat.iso3 == "MOZ"]  # Just want the impacts to Mozambique

Clean up EM-DAT data to get list of impactful cyclones that hit landfall in our AOI in Mozambique. Combine with IBTrACS data.

In [6]:
df_emdat_sel = df_emdat[df_emdat.iso3 == "MOZ"]

gdf_hist = gdf_ibtracs.merge(df_emdat_sel, left_on="SID", right_on="sid", how="right")

gdf_hist = gdf_hist[
    ["SID", "NAME", "Total Affected", "ISO_TIME", "REU_WIND", "geometry"]
].sort_values(by="Total Affected")

df_impact_summary = (
    gdf_hist.groupby(["SID", "NAME"]).max("Total Affected").reset_index()
)

In [7]:
# TODO: Split up by province
df_impact_summary

Unnamed: 0,SID,NAME,Total Affected,REU_WIND
0,2000032S11116,ELINE:LEONE,,100.0
1,2000083S17102,HUDAH,304.0,120.0
2,2003056S21042,JAPHET,23010.0,95.0
3,2007043S11071,FAVIO,162770.0,105.0
4,2008062S10064,JOKWE,220013.0,105.0
5,2009082S16039,IZILDA,7103.0,60.0
6,2012010S24049,DANDO,40042.0,45.0
7,2012018S16041,FUNSO,65000.0,110.0
8,2012056S13057,IRINA,4958.0,52.0
9,2017043S19040,DINEO,750102.0,75.0


In [139]:
# combining US Wind and REUnion for those with no wind speed
gdf_ibtracs["REU_USA_WIND"] = gdf_ibtracs["REU_WIND"].fillna(
    gdf_ibtracs["USA_WIND"] * 0.88
)

df_ibtracs = gdf_ibtracs[["SID", "NAME", "REU_USA_WIND", "ISO_TIME", "LAT", "LON"]]
df_ibtracs.loc[:, "ISO_TIME"] = pd.to_datetime(df_ibtracs["ISO_TIME"])
df_ibtracs = df_ibtracs.dropna(subset="REU_USA_WIND")

# Interpolate the IBTrACS points to 30min frequency
gdfs = []
for sid, group in df_ibtracs.groupby("SID"):
    df_interp = (
        group.set_index("ISO_TIME").resample("30min").interpolate().reset_index()
    )
    df_interp["SID"] = sid
    df_interp["NAME"] = df_interp.iloc[0].NAME
    gdf_interp = gpd.GeoDataFrame(
        data=df_interp,
        geometry=gpd.points_from_xy(df_interp["LON"], df_interp["LAT"]),
        crs=4326,
    )
    gdfs.append(gdf_interp)

gdf_ibtracs_interp = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

In [140]:
# Filter out the points that don't make landfall with the coastal provinces in Mozambique
gdf_ibtracs_interp_sel = gpd.sjoin(
    gdf_ibtracs_interp, gdf_adm1_sel, how="inner", predicate="intersects"
)[
    [
        "ISO_TIME",
        "SID",
        "NAME",
        "REU_USA_WIND",
        "LAT",
        "LON",
        "geometry",
        "ADM1_PT",
        "ADM1_PCODE",
    ]
]

In [162]:
# Now group and join to get one row per event, with relevant variables added in

df_ibtracs_sum = (
    gdf_ibtracs_interp_sel.groupby(["SID", "NAME"])
    .agg({"REU_USA_WIND": "max", "ISO_TIME": "min"})
    .reset_index()
)

# TODO: Why are there some NA ADM1 values?
df_cyclone_summary = (
    df_ibtracs_sum.merge(
        gdf_ibtracs_interp_sel[["SID", "NAME", "ADM1_PT", "REU_USA_WIND", "ISO_TIME"]],
        on=["SID", "NAME", "REU_USA_WIND", "ISO_TIME"],
        how="left",
    )
    .merge(
        df_emdat[["sid", "Total Affected"]], left_on="SID", right_on="sid", how="left"
    )
    .drop(axis=1, columns=["sid"])
    .rename(
        columns={
            "REU_USA_WIND": "MAX_SPEED_ON_LAND",
            "ADM1_PT": "ADM1_OF_MAX",
            "ISO_TIME": "TIME_OF_MAX",
            "Total Affected": "TOTAL_AFFECTED",
        }
    )
)

# Note that these classifications are based on JUST speeds while the cyclone is over our AOI
# So may not necessarily reflect the global classification of the cyclone
df_cyclone_summary["SEVERE_TROPICAL_STORM"] = (
    df["MAX_SPEED_ON_LAND"] >= THRESHOLD_SPEED_OPT1
) & (df["MAX_SPEED_ON_LAND"] < THRESHOLD_SPEED_OPT2)
df_cyclone_summary["TYPHOON"] = df["MAX_SPEED_ON_LAND"] >= THRESHOLD_SPEED_OPT2

# Also create a column with just the day, for matching against daily precipitation data
df_cyclone_summary["DAY_OF_MAX"] = pd.to_datetime(
    df_cyclone_summary["TIME_OF_MAX"].dt.strftime("%Y-%m-%d")
)

In [163]:
df_cyclone_summary

Unnamed: 0,SID,NAME,MAX_SPEED_ON_LAND,TIME_OF_MAX,ADM1_OF_MAX,TOTAL_AFFECTED,SEVERE_TROPICAL_STORM,TYPHOON,DAY_OF_MAX
0,1978347S20041,ANGELE,72.0,1978-12-19 01:00:00,Nampula,,False,False,1978-12-19
1,1981351S12060,BENEDICTE,58.208333,1981-12-24 03:30:00,Zambezia,,False,False,1981-12-24
2,1982030S15065,ELECTRE,26.4,1982-02-05 18:00:00,Zambezia,,False,False,1982-02-05
3,1984016S15073,DOMOINA,44.0,1984-01-28 08:00:00,Inhambane,,False,False,1984-01-28
4,1986005S15043,BEROBIA,42.0,1986-01-09 19:30:00,Sofala,,False,False,1986-01-09
5,1987325S06050,UNNAMED,22.0,1987-11-25 22:00:00,Nampula,,False,False,1987-11-25
6,1988055S10051,FILAO,51.0,1988-03-01 21:30:00,Zambezia,,False,False,1988-03-01
7,1994076S14078,NADIA,66.44,1994-03-24 17:30:00,Nampula,,False,False,1994-03-24
8,1995020S20038,FODAH,13.2,1995-01-21 00:00:00,Zambezia,,False,False,1995-01-21
9,1996001S08075,BONITA,64.24,1996-01-13 20:30:00,,,False,False,1996-01-13


Now load in the precipitation data from the processed IMERG files

In [156]:
imerg_path = (
    Path(AA_DATA_DIR) / "public" / "processed" / "moz" / "daily_imerg_precip_adm1.csv"
)

# TODO: Take the 2-day rolling sum?
df_imerg = pd.read_csv(imerg_path)
df_imerg = df_imerg[df_imerg.ADM1.isin(ADMS)]
df_imerg["date"] = pd.to_datetime(df_imerg["date"])

Create a simple plot to sanity check

In [225]:
fig = px.line(
    df_imerg_sel,
    x="date",
    y="total_precipitation",
    facet_row="ADM1",
    title="IMERG Observed Daily Precipitation Across Selected Provinces in Mozambique",
    template="simple_white",
)
fig.update_yaxes(tickangle=45)
fig.show()

In [193]:
df_cyclone_precip = df_cyclone_summary.merge(
    df_imerg, left_on="DAY_OF_MAX", right_on="date", how="left"
)

df_cyclone_precip = (
    df_cyclone_precip.drop(columns=["Unnamed: 0", "date"])
    .rename(columns={"total_precipitation": "TOTAL_PRECIPITATION"})
    .dropna(subset="TOTAL_PRECIPITATION") # Since we don't have precipitation before 2003
)

For cyclone events, what trends are there in the observed daily total precipitation per province and the maximum windspeed over land in Mozambique? We can break this down by province, however the max wind speed is independent of province and is the max across the total overall area. We can also color points by the estimates of impact (total affected population in the country) for each cyclone. Note that many of the cyclones from IBTrACS don't have EM-DAT estimates.

Some observations:
- We do see that higher impact cyclones **do** often have higher max wind speeds (Idai and Dineo, for example)
- Observed daily precipitation is not necessarily a good indicator of the cyclone hitting. Cyclone ANA had a 

Next steps:
- Need to do some more investigation on approaches for aggregating or summing up precipitation from IMERG. Could be: calculating 2 or 3 day rolling sums
- Might also be worth calculating return periods for precipitation values to identify what might be notable outlier observations

In [221]:
fig = px.scatter(
    df_cyclone_precip,
    x="TOTAL_PRECIPITATION",
    y="MAX_SPEED_ON_LAND",
    color="TOTAL_AFFECTED",
    facet_col="ADM1",
    template="simple_white",
    color_continuous_scale=px.colors.sequential.Reds[2:7],
    hover_data=["NAME", "DAY_OF_MAX"],
    title="Daily Precipitation vs Max Wind Speed for Cyclones Making Landfall in Mozambique",
)
fig.update_traces(marker_size=10)
fig.show()