# Drought Event Network

a. stein 7.28.2022

Okay. So following `explore/drought_tracking.ipynb` I did `quality_control/tset_drought_event_plot.ipynb` and found that while identifying blobs and connecting them over time works ... the id system is a nightmare and not very smooth to use, especially as I realized during testing that I needed one extra set of parentheses around a split to make the ids unique. This makes sorting them a further pain, and there isn't a great way to trace history despite each id being unique (writing something to then process the sorting is really annoying). So, let's try making a nodal network to keep track of it instead, in combination with networkx.

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import xarray as xr
import rioxarray
import rasterio as rio
import pandas as pd
import geopandas as gpd

import matplotlib.dates as mdates

from tqdm.autonotebook import tqdm

import sys
sys.path.append('../../')
import ndrought.wrangle as wrangle
import ndrought.compare as compare
import ndrought.plotting as ndplot

import skimage

from skimage.color import rgb2gray
from skimage.measure import regionprops_table

Populating the interactive namespace from numpy and matplotlib


  


In [8]:
class EventNode():
    """One drought blob at one time step, linked forward to later blobs.

    Attributes
    ----------
    time
        Time stamp/index of the blob, stored as given.
    area
        Area of the blob, stored as given.
    coords
        Iterable of cell coordinates making up the blob. Each
        coordinate is converted to a tuple when checking overlap.
    event_code
        Identifier for this node; used when printing the futures
        of another node.
    future : list
        Nodes that have been attached to this one through
        `append_future` (directly or via `check_connects`).
    """

    def __init__(self, time, area, coords, event_code):
        self.event_code = event_code
        self.coords = coords
        self.area = area
        self.time = time
        # filled in later as connections to other nodes are found
        self.future: list["EventNode"] = []

    def __str__(self):
        # only the codes of the futures are shown, not the nodes themselves
        future_events = [node.event_code for node in self.future]
        return f'time: {self.time}, area: {self.area}, futures: {future_events}'

    def __iter__(self):
        """Yield this node, then each directly attached future node.

        The walk is one level deep: futures of futures are not visited.
        """
        yield self
        yield from self.future

    def append_future(self, other):
        """Attach `other` as a future of this node."""
        self.future.append(other)

    def check_connects(self, other, auto_connect=True):
        """Report whether this node shares at least one cell with `other`.

        Parameters
        ----------
        other : EventNode
            Node to compare coordinates against.
        auto_connect : bool
            When True and an overlap exists, `other` is appended
            to this node's futures.

        Returns
        -------
        bool
            True if any coordinate is common to both nodes.
        """
        own_cells = {tuple(coord) for coord in self.coords}
        other_cells = {tuple(coord) for coord in other.coords}

        if own_cells.isdisjoint(other_cells):
            return False

        if auto_connect:
            self.append_future(other)
        return True

Okay, I think the above is a fairly good starting point for this class. I need to do the following:
- try constructing a string of `EventNode`s and see if they match a test version
- figure out how to transition from nodes to timeseries
- figure out how to hold all the nodes for a timeseries together, given they can start and end and may not all be connected

In [3]:
def identify_drought_blob(vals:np.ndarray):
    """Using sci-kit image, identify drought event blobs.

    Parameters
    ----------
    vals: np.ndarray
        Spatial values for drought data categorized
        according to the USDM scheme for a single
        time step. The array is only read, never
        modified.

    Returns
    -------
    pd.DataFrame
        One row per drought blob, found using
        connectivity 2 from skimage.measure.label.
        Blobs are binary definitions of drought: a
        cell is in drought when its value is at
        least 1, and NaN cells are treated as not in
        drought. Columns are 'area', 'coords' (the
        coordinates of all cells contained within
        the blob) and 'drought_id', which is filled
        with NaN as a placeholder.
    """

    # Build the binary drought mask on a fresh array so the caller's data
    # is left untouched. Comparisons against NaN are False, which puts NaN
    # cells in the "not in drought" class.
    in_drought = np.asarray(vals) >= 1

    # connectivity 2 will consider diagonals as connected
    blobs = skimage.measure.label(in_drought, connectivity=2)

    properties = ['area', 'coords']
    df = pd.DataFrame(regionprops_table(blobs, properties=properties))
    # placeholder column, one NaN per blob
    df['drought_id'] = np.full(len(df), np.nan)

    return df


def connect_blobs_over_time(df_1:pd.DataFrame, df_2:pd.DataFrame):
    """Identify blobs shared between time frames.

    Two blobs are considered shared when they have at least
    one cell coordinate in common.

    Parameters
    ---------
    df_1 : pd.DataFrame
        Blob dataframe at first time index. Must have a
        'coords' column holding, per blob, an iterable of
        cell coordinates.
    df_2 : pd.DataFrame
        Blob dataframe at second time index, with the same
        'coords' column.

    Returns
    -------
    list
        Positional indices into each dataframe denoting which
        blobs are shared, where each tuple in the list is a
        connection. The first index of each tuple corresponds
        to df_1, while the second index corresponds to df_2.
        Pairs are ordered by df_1 index, then df_2 index.
    """

    df_1_blobs = df_1.coords.values
    if len(df_1_blobs) == 0:
        # nothing at the first time index to connect from
        return []

    # Build each df_2 coordinate set once up front rather than
    # once per df_1 blob.
    df_2_coord_sets = [
        set(tuple(coord) for coord in df_2_coords)
        for df_2_coords in df_2.coords.values
    ]

    blob_pairs = []

    for idx_1, df_1_coords in enumerate(df_1_blobs):
        df_1_coords_set = set(tuple(coord) for coord in df_1_coords)
        for idx_2, df_2_coords_set in enumerate(df_2_coord_sets):
            # isdisjoint stops at the first shared cell
            if not df_1_coords_set.isdisjoint(df_2_coords_set):
                blob_pairs.append((idx_1, idx_2))

    return blob_pairs

def propagate_drought_id(df_1=None, df_2=None, connections=None, new_blob_num=1):
    """Assign a drought id to every blob in df_2 from its links to df_1.

    Parameters
    ----------
    df_1 : pd.DataFrame, optional
        Blob dataframe at the earlier time index. Its 'drought_id'
        and 'area' columns are read, and only when `connections`
        is non-empty.
    df_2 : pd.DataFrame
        Blob dataframe at the later time index. Its 'drought_id'
        column is written in place, one id per row in row order.
    connections : list, optional
        Pairs of positional indices, (df_1 index, df_2 index), as
        returned by connect_blobs_over_time. None or an empty list
        means every blob in df_2 is treated as new.
    new_blob_num : int
        Number to use for the next blob that has no origin in df_1.

    Returns
    -------
    tuple
        (df_2, new_blob_num): the same df_2 object that was passed
        in, and the next unused blob number.

    Notes
    -----
    Ids are built as follows:

    - one origin that connects only to this blob: the origin's id
      is carried over unchanged
    - no origin: the id is str(new_blob_num), which then increments
    - origin that connects to several blobs (a split): the id is
      '(origin_id)-k', with k counting up from 1 across the blobs
      the origin split into
    - several origins (a merge): the origins' ids are ordered by
      descending 'area'; the first becomes the base (unless a split
      already set one) and every later one is appended as '.(id)'

    NOTE(review): when a blob has more than one split origin, only
    the id built from the last one is kept, and in a split+merge the
    largest origin's id is never appended even if it is not the split
    origin. This matches the previous behavior and is left as is.
    """

    if connections is None:
        connections = []

    drought_ids = []

    if len(connections) > 0:

        # index the connections once: how many blobs each origin feeds,
        # and which origins feed each df_2 blob (in connection order)
        origin_counts = dict()
        origins_by_blob = dict()
        for connect in connections:
            origin_counts[connect[0]] = origin_counts.get(connect[0], 0) + 1
            origins_by_blob.setdefault(connect[1], []).append(connect[0])

        # need to keep track of splits among multiple
        # blobs (since they are 1-to-many and we are
        # iterating through linearly); maps origin -> next split number
        split_origins = dict()

        for i in range(len(df_2)):
            drought_id = ""

            # ALL CONNECTIONS
            # where this blob is coming from
            connects_origins = origins_by_blob.get(i, [])

            # SPLITS
            # an origin connected to more than one thing is a split
            split_connections = [
                origin for origin in dict.fromkeys(connects_origins)
                if origin_counts[origin] > 1
            ]
            for split_origin in split_connections:
                # first time we see this split it starts counting at 1
                current_split_num = split_origins.setdefault(split_origin, 1)
                split_origin_id = df_1['drought_id'].values[split_origin]

                drought_id = f'({split_origin_id})-{current_split_num}'

                # iterate for the next blob it splits into
                split_origins[split_origin] += 1

            # MERGES
            # we have a merge if more than 1 blob
            # goes into this one
            if len(connects_origins) > 1:
                merged_blob_ids = df_1.iloc[connects_origins].sort_values('area', ascending=False)['drought_id'].values
                # if a split did not already start the id,
                # the largest origin provides the base
                if drought_id == "":
                    drought_id = merged_blob_ids[0]
                for merged_id in merged_blob_ids[1:]:
                    drought_id = f'{drought_id}.({merged_id})'

            # NO SPLIT NO MERGE
            if len(connects_origins) == 1 and len(split_connections) == 0:
                drought_id = df_1.iloc[connects_origins[0]]['drought_id']

            # CONNECTIONS EXIST, BUT NEW BLOB
            if len(connects_origins) == 0:
                drought_id = f'{new_blob_num}'
                new_blob_num += 1

            drought_ids.append(drought_id)

    else:
        # there were no connections, all id's start from scratch
        for _ in range(len(df_2)):
            drought_ids.append(f'{new_blob_num}')
            new_blob_num += 1

    if len(drought_ids) > 0:
        # Write all ids at once as an object column. Setting strings row
        # by row into a float NaN placeholder column relies on silent
        # upcasting, which pandas has deprecated.
        df_2['drought_id'] = pd.Series(drought_ids, index=df_2.index, dtype=object)

    return df_2, new_blob_num

def encode_drought_events(data:np.ndarray):
    """Detect and encode drought events.

    Blobs are identified independently for each time step, then
    ids are propagated forward one step at a time by connecting
    each step's blobs to the already-encoded blobs of the step
    before it.

    Parameters
    ----------
    data: np.ndarray
        Expecting first index to be temporal while second
        and third are spatial.

    Returns
    -------
    pd.DataFrame
        A multi-indexed dataframe with time as the first level
        and drought_id as the second level, where time is the
        integer position along the first axis of `data`. The
        columns are 'area' and 'coords' as computed by sci-kit
        image, plus a 'drought_id' column repeating the second
        index level.

    """
    blob_dfs = [
        identify_drought_blob(data[t, :, :])
        for t in tqdm(np.arange(data.shape[0]), desc='Identifying Blobs')
    ]

    # the first time step has nothing before it, so all of its blobs are new
    first_df, new_blob_num = propagate_drought_id(df_2=blob_dfs[0])
    first_df['time'] = 0
    encoded_blob_dfs = [first_df]

    for step in tqdm(np.arange(1, len(blob_dfs)), desc='Encoding Blobs'):
        earlier = encoded_blob_dfs[step - 1]
        later = blob_dfs[step]

        links = connect_blobs_over_time(earlier, later)
        later_encoded, new_blob_num = propagate_drought_id(earlier, later, links, new_blob_num)
        later_encoded['time'] = step
        encoded_blob_dfs.append(later_encoded)

    kept_columns = ['time', 'drought_id', 'area', 'coords']
    all_blobs_df = pd.concat(
        [df[kept_columns] for df in encoded_blob_dfs],
        ignore_index=True,
    )
    all_blobs_df = all_blobs_df.set_index(['time', 'drought_id'])
    # keep the id available as a regular column as well as an index level
    all_blobs_df['drought_id'] = all_blobs_df.index.get_level_values(1)

    return all_blobs_df

In [4]:
# directory holding the drought measure data (absolute, machine-specific path)
dm_path = '/pool0/home/steinadi/data/drought/drought_impact/data/drought_measures'

# open paired_USDM_SPI.nc from the ndrought_products subdirectory with xarray
paired_ds = xr.open_dataset(f'{dm_path}/ndrought_products/paired_USDM_SPI.nc')

In [6]:
# detect and encode drought events on the raw array of the 'USDM' variable
USDM_events = encode_drought_events(paired_ds['USDM'].values)

Identifying Blobs:   0%|          | 0/1148 [00:00<?, ?it/s]

Encoding Blobs:   0%|          | 0/1147 [00:00<?, ?it/s]

In [7]:
# display the encoded events dataframe
USDM_events

Unnamed: 0_level_0,Unnamed: 1_level_0,area,convex_area,coords,drought_id
time,drought_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56,1,653,665,"[[0, 154], [0, 155], [0, 156], [0, 157], [0, 1...",1
57,1,653,665,"[[0, 154], [0, 155], [0, 156], [0, 157], [0, 1...",1
58,1,12006,13514,"[[0, 49], [0, 50], [0, 51], [0, 52], [0, 53], ...",1
58,2,37,65,"[[6, 41], [7, 41], [7, 43], [7, 44], [7, 45], ...",2
58,3,1,1,"[[10, 53]]",3
...,...,...,...,...,...
1144,(((75.(76).(77.(78)))-1.(80))-1)-2,2,2,"[[82, 62], [82, 63]]",(((75.(76).(77.(78)))-1.(80))-1)-2
1145,(((75.(76).(77.(78)))-1.(80))-1)-1,6861,7561,"[[0, 113], [0, 114], [0, 115], [0, 116], [0, 1...",(((75.(76).(77.(78)))-1.(80))-1)-1
1145,(((75.(76).(77.(78)))-1.(80))-1)-2,2,2,"[[82, 62], [82, 63]]",(((75.(76).(77.(78)))-1.(80))-1)-2
1146,(((75.(76).(77.(78)))-1.(80))-1)-1,6771,7276,"[[0, 114], [0, 115], [0, 116], [0, 117], [0, 1...",(((75.(76).(77.(78)))-1.(80))-1)-1
