### GET GT/NRT TRIGGERS DATAFRAMES AND MERGE BASED ON DISTRICT VULNERABILITY

In [None]:
# Move two directories up so that the relative `data/...` paths and the local
# imports (`config`, `helper_fns`, `triggers`) used below resolve - presumably
# the project root, TODO confirm.
# NOTE: the path is relative, so run this cell only once per kernel session;
# re-running it climbs two more levels.
%cd ../../

In [2]:
import click
import logging

logging.basicConfig(level="INFO")

import warnings

warnings.simplefilter(action="ignore")

from numba import jit
from dask.distributed import Client

import os
import glob
import numpy as np
import xarray as xr
import pandas as pd
from scipy.optimize import brute
from sklearn.metrics import confusion_matrix

from config.params import Params

from helper_fns import (
    triggers_da_to_df,
    merge_un_biased_probs,
)

from hip.analysis.analyses.drought import (
    get_accumulation_periods,
    concat_obs_levels,
)

INFO:root:Set disk cache path.


In [3]:
from triggers import read_aggregated_obs, read_aggregated_probs, get_window_district, filter_triggers_by_window

In [13]:
# Local dask cluster (default settings) for the reads below
client = Client()

params = Params(iso='MOZ', index='DRYSPELL')

# Placeholder monthly series spanning the season; it is only passed to
# get_accumulation_periods. Its time axis must hold exactly 8 month-end dates
# to match np.arange(1, 9).
season_months = pd.date_range(
    f"{params.start_season}/1/1990",
    f"{params.end_season + 1}/28/1991",
    freq="M",
)
rfh = xr.DataArray(
    np.arange(1, 9),
    coords=dict(time=(["time"], season_months)),
)
periods = get_accumulation_periods(
    rfh, 0, 0, params.min_index_period, params.max_index_period
)

# Observations: attach a lead time per index (looked up from the period label,
# i.e. the last word of the index name) and a vulnerability class per district.
obs = read_aggregated_obs(
    f"data/{params.iso}/outputs/zarr/obs/2022_blended",
    params,
)
obs_lead_times = [
    periods[label.split(" ")[-1]][0] for label in obs.index.values
]
obs_vulnerability = [
    params.districts_vulnerability[name] for name in obs.district.values
]
obs = obs.assign_coords(
    lead_time=("index", obs_lead_times),
    vulnerability=("district", obs_vulnerability),
)
logging.info(
    f"Completed reading of aggregated observations for the whole {params.iso} country"
)

# Probabilities: one merged array per index, stacked along "index".
# NOTE(review): probs_ds is passed as both of the first two arguments of
# merge_un_biased_probs - confirm this is intended.
probs_ds = read_aggregated_probs(
    f"data/{params.iso}/outputs/zarr/2022_blended",
    params,
)
per_index_probs = [
    merge_un_biased_probs(probs_ds, probs_ds, params, label.split(" ")[-1])
    for label in probs_ds.index.values
]
probs = xr.concat(per_index_probs, dim="index")
logging.info(
    f"Completed reading of aggregated probabilities for the whole {params.iso} country"
)

# Filter year/time dimension: temporary before harmonization with analytical script
# NOTE(review): both `year` and `time` are selected with the same year values;
# this requires obs to expose both coordinates - confirm.
forecast_years = probs.year.values
obs = obs.sel(year=forecast_years).load()
obs = obs.sel(time=forecast_years).load()

INFO:distributed.scheduler:State start


INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:55485
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:55484/status
INFO:distributed.scheduler:Registering Worker plugin shuffle
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:55491'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:55488'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:55489'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:55490'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:55505', name: 3, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:55505
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:55510
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:55506', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:55506
INFO:distri

In [14]:
# Open the trigger and score zarr stores for the configured index and year
trigs_path = f"data/MOZ/outputs/Plots/triggers_{params.index}_{params.year}_blended_NRT.zarr"
score_path = f"data/MOZ/outputs/Plots/score_{params.index}_{params.year}_blended_NRT.zarr"

trigs = xr.open_zarr(trigs_path)
score = xr.open_zarr(score_path)

In [15]:
# Reset cells of xarray of no interest as nan:
# keep only the cells for which at least one year of probabilities exists.
has_forecasts = probs.prob.count("year") != 0
trigs = trigs.where(has_forecasts, np.nan)
score = score.where(has_forecasts, np.nan)

In [16]:
# Format trigs and score into a dataframe, dropping incomplete rows
trigs_df = triggers_da_to_df(trigs, score).dropna()

# Remove row when trigger not found (penalty): only strictly negative HR is kept
trigger_found = trigs_df.HR.astype(float) < 0
trigs_df = trigs_df.loc[trigger_found]

In [17]:
# Add window information depending on district.
# The period label is the last word of the index name (e.g. "NDJ" in "DRYSPELL NDJ").
trigs_df["Window"] = [
    get_window_district("MOZ", index_label.split(" ")[-1], district_name)
    for index_label, district_name in zip(trigs_df["index"], trigs_df["district"])
]

# Filter per lead time: within each (category, district, Window, lead_time)
# group keep the two rows with the smallest HR, ties resolved by index then issue.
grouped_by_lead_time = trigs_df.sort_values("HR").groupby(
    ["category", "district", "Window", "lead_time"],
    as_index=False,
    sort=False,
)
best_per_lead_time = []
for _, lead_time_group in grouped_by_lead_time:
    ranked = lead_time_group.sort_values(["index", "issue"]).sort_values(
        "HR", kind="stable"
    )
    best_per_lead_time.append(ranked.head(2))
df_leadtime = pd.concat(best_per_lead_time)

In [18]:
# Trick to align couples of issue months inside apply_ufunc
issue_months = np.uint8(params.issue)

# All issue months except the last one (use start/end season here)
probs_ready = probs.sel(issue=issue_months[:-1]).load()

# All issue months except the first one, relabelled one month earlier
# (January wraps around to 12) so that both arrays share the same issue labels.
probs_set = probs.sel(issue=issue_months[1:]).load()
probs_set["issue"] = [
    12 if month == 1 else month - 1 for month in probs_set.issue.values
]

In [19]:
# Keep two pairs of triggers per window of activation, using the
# lead-time-filtered candidates, the two aligned probability arrays and the observations
df_window = filter_triggers_by_window(df_leadtime, probs_ready, probs_set, obs)

In [20]:
# Inspect the triggers retained per window (long frames are truncated in the display)
df_window

Unnamed: 0,index,category,district,issue,trigger,trigger_value,lead_time,HR,type,Window,FR
3432,DRYSPELL NDJ,Leve,Caia,7,trigger1,0.13,11,-0.833333,NRT,Window1,0.181818
3433,DRYSPELL NDJ,Leve,Caia,7,trigger2,0.22,11,-0.833333,NRT,Window1,0.181818
269,DRYSPELL DJ,Leve,Caia,9,trigger2,0.21,12,-0.777778,NRT,Window1,0.200000
268,DRYSPELL DJ,Leve,Caia,9,trigger1,0.29,12,-0.777778,NRT,Window1,0.200000
998,DRYSPELL FM,Leve,Caia,11,trigger1,0.35,2,-0.800000,NRT,Window2,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
1246,DRYSPELL FM,Moderado,Massingir,9,trigger1,0.16,2,-0.714286,NRT,Window2,0.250000
3753,DRYSPELL NDJ,Severo,Massingir,7,trigger2,0.00,11,-1.000000,NRT,Window1,0.000000
3752,DRYSPELL NDJ,Severo,Massingir,7,trigger1,0.38,11,-1.000000,NRT,Window1,0.000000
653,DRYSPELL DJ,Severo,Massingir,9,trigger2,0.33,12,-1.000000,NRT,Window1,0.000000


In [21]:
# Save the window-filtered NRT triggers for the current index.
# The index name is lower-cased in the file name because the merge step of this
# notebook reads the per-index files back as `...python.spi...` and
# `...python.dryspell...`; writing `DRYSPELL` would not match on a
# case-sensitive filesystem.
df_window.to_csv(
    f"data/MOZ/outputs/Plots/triggers.aa.python.{params.index.lower()}.{params.year}.blended.NRT.csv",
    index=False,
)

In [22]:
# GT trigger tables, one per index (SPI then DRYSPELL), stacked into a single frame
spigt, drygt = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"data/MOZ/outputs/Plots/triggers.aa.python.spi.{params.year}.blended.GT.csv",
        f"data/MOZ/outputs/Plots/triggers.aa.python.dryspell.{params.year}.blended.GT.csv",
    )
)
trigs_gt = pd.concat([spigt, drygt])

# NRT trigger tables, same layout
spinrt, drynrt = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"data/MOZ/outputs/Plots/triggers.aa.python.spi.{params.year}.blended.NRT.csv",
        f"data/MOZ/outputs/Plots/triggers.aa.python.dryspell.{params.year}.blended.NRT.csv",
    )
)
trigs_nrt = pd.concat([spinrt, drynrt])

In [15]:
# Keep SPI by default and DRYSPELL when not available.
# Sorting the index names in descending order puts "SPI ..." rows ahead of
# "DRYSPELL ..." rows, so the first four rows of each
# (district, category, Window) group favour SPI.
gt_groups = trigs_gt.groupby(['district', 'category', 'Window'])
gt_merged = pd.concat(
    [
        group_rows.sort_values('index', ascending=False).head(4)
        for _, group_rows in gt_groups
    ]
)

In [23]:
# Keep SPI by default and DRYSPELL when not available.
# Sorting the index names in descending order puts "SPI ..." rows ahead of
# "DRYSPELL ..." rows, so the first four rows of each
# (district, category, Window) group favour SPI.
nrt_groups = trigs_nrt.groupby(['district', 'category', 'Window'])
nrt_merged = pd.concat(
    [
        group_rows.sort_values('index', ascending=False).head(4)
        for _, group_rows in nrt_groups
    ]
)

In [16]:
# Persist the merged SPI/DRYSPELL GT triggers (row index not written)
gt_merged_path = f"data/MOZ/outputs/Plots/triggers.aa.python.spi.dryspell.{params.year}.blended.GT.csv"
gt_merged.to_csv(gt_merged_path, index=False)

In [None]:
# Persist the merged SPI/DRYSPELL NRT triggers (row index not written)
nrt_merged_path = f"data/MOZ/outputs/Plots/triggers.aa.python.spi.dryspell.{params.year}.blended.NRT.csv"
nrt_merged.to_csv(nrt_merged_path, index=False)

INFO:distributed.core:Event loop was unresponsive in Nanny for 1016.20s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Nanny for 1016.21s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Scheduler for 1016.21s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Nanny for 1016.21s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. This can cause timeouts and instability.
INFO:distributed.core:Event loop was unresponsive in Scheduler for 1016.32s.  This is often caused by long-running GIL-holding functions or moving large chunks of data. Thi

Filter triggers based on district vulnerability

In [6]:
# Re-create the parameters and reload the merged tables from disk
params = Params(iso='MOZ', index='DRYSPELL')

gt_merged, nrt_merged = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"data/MOZ/outputs/Plots/triggers.aa.python.spi.dryspell.{params.year}.blended.GT.csv",
        f"data/MOZ/outputs/Plots/triggers.aa.python.spi.dryspell.{params.year}.blended.NRT.csv",
    )
)

In [14]:
# For every district, take its rows from the table matching its vulnerability
# class: 'GT' districts read from gt_merged, every other class from nrt_merged.
# The per-district slices are collected first and concatenated once, instead of
# growing a DataFrame inside the loop.
district_frames = []
for district, vulnerability in params.districts_vulnerability.items():
    source = gt_merged if vulnerability == 'GT' else nrt_merged
    district_frames.append(source.loc[source.district == district])

# pd.concat raises on an empty list, so fall back to an empty frame in that case
triggers_full = pd.concat(district_frames) if district_frames else pd.DataFrame()

In [16]:
# Write the final per-district trigger table (row index not written)
triggers_full_path = f"data/MOZ/outputs/Plots/triggers.aa.python.spi.dryspell.{params.year}.blended.csv"
triggers_full.to_csv(triggers_full_path, index=False)