# Nexus Scatter
9.14.2022, a. stein

In [1]:
%pylab inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import xarray as xr
import rioxarray
import rasterio as rio
import pandas as pd
import geopandas as gpd

import matplotlib.dates as mdates

from tqdm.autonotebook import tqdm

import sys
sys.path.append('../../')
import ndrought.wrangle as wrangle
import ndrought.compare as compare
import ndrought.plotting as ndplot

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


  from tqdm.autonotebook import tqdm


## Load in Data

In [2]:
dm_path = '/pool0/home/steinadi/data/drought/drought_impact/data/drought_measures'

In [3]:
usdm = xr.open_dataset(f'{dm_path}/usdm/USDM_WA_20000104_20220412.nc').load()

In [4]:
intervals = ['14d', '30d', '90d', '180d', '270d', '1y', '2y', '5y', ]

In [5]:
spi = xr.open_dataset(f'{dm_path}/spi/WA/spi_usdmcat_WA.nc').load()

In [6]:
spei = xr.open_dataset(f'{dm_path}/spei/WA/spei_usdmcat_WA.nc').load()

In [7]:
eddi = xr.open_dataset(f'{dm_path}/eddi/WA/eddi_usdmcat_WA.nc').load()

In [8]:
pdsi = xr.open_dataset(f'{dm_path}/pdsi/WA/pdsi_usdmcat_WA.nc').load()

In [9]:
grace = xr.open_dataset(f'{dm_path}/grace/WA/grace_usdmcat_WA.nc').load()
grace_vars = ['gws', 'rtzsm', 'sfsm']

## Pairing

In [25]:
def pair_dates(dates_a:pd.DatetimeIndex, dates_b:pd.DatetimeIndex, dates_a_name:str, dates_b_name:str, method='last-b', realign=False):
    """Pairs dates between two metrics for comparison.
    
    Note that this was developed for SPI and USDM and should be double checked.

    WARNING: The current catch for too-many dates provided is experimental.

    Parameters
    ----------
    dates_a: DateTimeIndex
    dates_b: DateTimeIndex
    dates_a_name: str
        Name of dates_a.
    dates_b_name: str
        Name of dates_b.
    method: str
        How to pair dates between dates_a and dates_b. The following are
        currently supported:
        - last-a: match dates_b to the last dates_a available. Use if there
            is reason to believe that measure A informs measure B
        - last-b: match dates_a to the last dates_b available. Use if there
            is reason to believe that measure B informs measure A
        - nearest: dates are paired by their nearest neighbors, dropping any
            dates that are not chosen in the process. Use if there is no
            reason to believe A nor B inform each other and do not want
            an aggregate process

        To come:
        - cluster-a: match all dates_b to their nearest dates_a, which
            can result in multiple dates being assigned to a date in dates_a
            and an aggregation scheme should be used in the data. Use if
            believe B informs A and wanting to allow for multi-pairing.
        - cluster-b: match all dates_a to their nearest dates_b, which
            can result in multiple dates being assigned to a date in dates_b
            and an aggregation scheme should be used in the data. Use if
            believe A informs B and wanting to allow for multi-pairing.
        - cluster-nearest: each date is assigned to the nearest other date,
            being the only method to guarantee no dates are dropped in pairing.
            Use if there is no strong belief that one measure dominantly
            influences the other and wanting to allow for multi-pairing.
    realign: boolean, (optional)
        Whether to automatically clip dates to ensure proper pairing,
        defaults as False.

    Returns
    -------
    pd.DataFrame
        DataFrame where each row pairs an dates_b to a dates_a.
    """

    # check if times are too far out of alignment
    if dates_a[-1] - pd.Timedelta(days=7) > dates_b[-1]:
        if realign:
            dates_a = dates_a[dates_a <= dates_b[-1] + pd.Timedelta(days=7)]
        else:
            raise Exception('dates_a extends more than a week beyond dates_b, resulting in an inability to pair. Please adjust dates_a accordingly or set realign=True to (experimentally) automatically correct.')
    if dates_b[-1] - pd.Timedelta(days=7) > dates_a[-1]:
        if realign:
            dates_b = dates_b[dates_b <= dates_a[-1] + pd.Timedelta(days=7)]
        else:
            raise Exception('dates_b extends more than a week beyond dates_a, resulting in an inability to pair. Please adjust dates_b accordingly or set realign=True to (experimentally) automatically correct.')
    if dates_b[0] + pd.Timedelta(days=7) < dates_a[-1]:
        if realign:
            dates_b = dates_b[dates_b >= dates_a[0] - pd.Timedelta(days=7)]
        else:
            raise Exception('dates_b extends more than a week prior to dates_a, resulting in an inability to pair. Please adjust dates_b accordingly or set realign=True to (experimentally) automatically correct')
    if dates_a[0] + pd.Timedelta(days=7) < dates_b[-1]:
        if realign:
            dates_a = dates_a[dates_a >= dates_b[0] - pd.Timedelta(days=7)]
        else:
            raise Exception('dates_a extends more than a week prior to dates_b, resulting in an inability to pair. Please adjust dates_a accordingly or set realign=True to (experimentally) automatically correct')

    # now we need to iterate through and find which other dates are the closest
    # and pair them
    if method == 'last-b':
        pair_dates = pd.DataFrame(pd.Series(dates_b, name=dates_b_name))
        # add the column for dates_a dates
        pair_dates[dates_a_name] = np.nan * np.zeros(len(pair_dates[dates_b_name]))

        i = 0
        for date in dates_a:
            if date >= dates_b[i]:
                while i < len(dates_b) and dates_b[i] <= date:
                    i += 1
                if not isinstance(pair_dates[dates_a_name].iloc[i-1], pd.Timestamp):
                    pair_dates[dates_a_name].iloc[i-1] = date
    elif method == 'last-a':
        pair_dates = pd.DataFrame(pd.Series(dates_a, name=dates_a_name))
        # add the column for dates_b dates
        pair_dates[dates_b_name] = np.nan * np.zeros(len(pair_dates[dates_a_name]))

        i = 0
        for date in dates_b:
            if date >= dates_a[i]:
                while i < len(dates_a)-1 and dates_a[i+1] <= date:
                    i += 1
                if not isinstance(pair_dates[dates_b_name].iloc[i], pd.Timestamp):
                    pair_dates[dates_b_name].iloc[i] = date
    elif method == 'nearest':
        i = 0
        j = 0
        pairs = []

        # go through and match each date in A with the nearest date in B
        while i < len(dates_a) and j < len(dates_b):
            current_difference = np.abs(dates_a[i] - dates_b[j])

            # if we've gotten to the end of the list, match with
            # it and reset our search
            if j+1 == len(dates_b):
                pairs.append((dates_a[i], dates_b[j]))
                i += 1
                j = 0
            # if there is an exact match or the found date is closer
            # than the following, consider it a match
            elif current_difference == pd.Timedelta('0 day') or current_difference < np.abs(dates_a[i] - dates_b[j+1]):
                pairs.append((dates_a[i], dates_b[j]))
                i += 1
            # keep looking
            else:
                j += 1

        # now need to trim duplicate pairings

        paired_b = np.array(pairs)[:, 1]
        remove_pairs = []

        for date in dates_b:
            found_pairs = np.where(date == paired_b)[0]
            # check if found more than one pairing
            if len(found_pairs > 1):
                # gather differences
                deltas = []
                for duplicate in found_pairs:
                    pairing = pairs[duplicate]
                    deltas.append(np.abs(pairing[0] - pairing[1]))
                # find closest
                minimum_index = np.argmin(np.array(deltas))
                minimum_pairing = pairs[found_pairs[minimum_index]]
            # collect which pairs to remove (removing now would mess
            # with the ordering of things)
            for duplicate in found_pairs:
                pairing = pairs[duplicate]
                if pairing != minimum_pairing:
                    remove_pairs.append(pairing)
        # finally remove unfavorable duplicates
        for pair in remove_pairs:
            pairs.remove(pair)
            
        pair_dates = pd.DataFrame(data=np.array(pairs), columns=[dates_a_name, dates_b_name])
            
    #elif method == 'cluster-a':
    #elif method == 'cluster-b':
    #elif method == 'cluster-nearest':
    else:
        raise Exception(f'{method} is not a supported method. Please visit the documentation for supported methods.')

    # now drop the dates that did not get chosen
    #pair_dates = pair_dates.dropna('index')
    # reset the index
    pair_dates = pair_dates.reset_index()
    # and make sure to drop pandas trying to preserve the old index
    pair_dates = pair_dates.drop(columns='index')

    return pair_dates

In [11]:
dates_a = pd.to_datetime([f'2000-01-{i}' for i in [2, 9, 16, 23]])
dates_b = pd.to_datetime([f'2000-01-{i}' for i in [1, 6, 11, 16, 21, 26]])

In [12]:
pair_dates(dates_a, dates_b, "A", "B", method='last-b', realign=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pair_dates[dates_a_name].iloc[i-1] = date


Unnamed: 0,B,A
0,2000-01-01,2000-01-02 00:00:00
1,2000-01-06,2000-01-09 00:00:00
2,2000-01-11,
3,2000-01-16,2000-01-16 00:00:00
4,2000-01-21,2000-01-23 00:00:00
5,2000-01-26,


In [13]:
pair_dates(dates_a, dates_b, "A", "B", method='last-a', realign=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pair_dates[dates_b_name].iloc[i] = date


Unnamed: 0,A,B
0,2000-01-02,2000-01-06 00:00:00
1,2000-01-09,2000-01-11 00:00:00
2,2000-01-16,2000-01-16 00:00:00
3,2000-01-23,2000-01-26 00:00:00


In [14]:
i = 0
j = 0

pairs = []

nearest_threshold = pd.Timedelta('10 day')
difference = nearest_threshold
while i < len(dates_a) and j < len(dates_b):
    print(i, j, np.abs(dates_a[i] - dates_b[j]))

    current_difference = np.abs(dates_a[i] - dates_b[j])

    if current_difference < difference:
        difference = current_difference
        j += 1
    elif current_difference >= nearest_threshold:
        j += 1
    else:
        pairs.append((dates_a[i], dates_b[j-1]))
        difference = nearest_threshold
        i += 1
        j = 0

0 0 1 days 00:00:00
0 1 4 days 00:00:00
1 0 8 days 00:00:00
1 1 3 days 00:00:00
1 2 2 days 00:00:00
1 3 7 days 00:00:00
2 0 15 days 00:00:00
2 1 10 days 00:00:00
2 2 5 days 00:00:00
2 3 0 days 00:00:00
2 4 5 days 00:00:00
3 0 22 days 00:00:00
3 1 17 days 00:00:00
3 2 12 days 00:00:00
3 3 7 days 00:00:00
3 4 2 days 00:00:00
3 5 3 days 00:00:00


In [15]:
pairs

[(Timestamp('2000-01-02 00:00:00'), Timestamp('2000-01-01 00:00:00')),
 (Timestamp('2000-01-09 00:00:00'), Timestamp('2000-01-11 00:00:00')),
 (Timestamp('2000-01-16 00:00:00'), Timestamp('2000-01-16 00:00:00')),
 (Timestamp('2000-01-23 00:00:00'), Timestamp('2000-01-21 00:00:00'))]

In [16]:
len(dates_a)

4

In [17]:
len(dates_b)

6

In [18]:
i = 0
j = 0

pairs = []

#nearest_threshold = pd.Timedelta('10 day')
#difference = nearest_threshold
while i < len(dates_b) and j < len(dates_a):
    print(i, j, np.abs(dates_b[i] - dates_a[j]))

    current_difference = np.abs(dates_b[i] - dates_a[j])

    if j + 1 == len(dates_a):
        pairs.append((dates_b[i], dates_a[j]))
        i += 1
        j = 0
    elif current_difference == pd.Timedelta('0 day') or current_difference < np.abs(dates_b[i] - dates_a[j+1]):
        pairs.append((dates_b[i], dates_a[j]))
        i += 1
    else:
        j += 1
    

0 0 1 days 00:00:00
1 0 4 days 00:00:00
1 1 3 days 00:00:00
2 1 2 days 00:00:00
3 1 7 days 00:00:00
3 2 0 days 00:00:00
4 2 5 days 00:00:00
4 3 2 days 00:00:00
5 0 24 days 00:00:00
5 1 17 days 00:00:00
5 2 10 days 00:00:00
5 3 3 days 00:00:00


In [19]:
pairs

[(Timestamp('2000-01-01 00:00:00'), Timestamp('2000-01-02 00:00:00')),
 (Timestamp('2000-01-06 00:00:00'), Timestamp('2000-01-09 00:00:00')),
 (Timestamp('2000-01-11 00:00:00'), Timestamp('2000-01-09 00:00:00')),
 (Timestamp('2000-01-16 00:00:00'), Timestamp('2000-01-16 00:00:00')),
 (Timestamp('2000-01-21 00:00:00'), Timestamp('2000-01-23 00:00:00')),
 (Timestamp('2000-01-26 00:00:00'), Timestamp('2000-01-23 00:00:00'))]

In [20]:
# trimming

paired_a = np.array(pairs)[:, 1]

remove_pairs = []
for date in dates_a:
    found_pairs = np.where(date == paired_a)[0]
    if len(found_pairs) > 1:

        print(found_pairs)
        deltas = []
        for duplicate in found_pairs:
            pairing = pairs[duplicate]
            print(pairing)
            deltas.append(np.abs(pairing[0] - pairing[1]))
            
        deltas = np.array(deltas)
        minimum_index = np.argmin(deltas)
        minimum_pairing = pairs[found_pairs[minimum_index]]
        
        for duplicate in found_pairs:
            pairing = pairs[duplicate]
            if pairing != minimum_pairing:
                remove_pairs.append(pairing)

for pair in remove_pairs:
    pairs.remove(pair)

[1 2]
(Timestamp('2000-01-06 00:00:00'), Timestamp('2000-01-09 00:00:00'))
(Timestamp('2000-01-11 00:00:00'), Timestamp('2000-01-09 00:00:00'))
[4 5]
(Timestamp('2000-01-21 00:00:00'), Timestamp('2000-01-23 00:00:00'))
(Timestamp('2000-01-26 00:00:00'), Timestamp('2000-01-23 00:00:00'))


In [21]:
pairs

[(Timestamp('2000-01-01 00:00:00'), Timestamp('2000-01-02 00:00:00')),
 (Timestamp('2000-01-11 00:00:00'), Timestamp('2000-01-09 00:00:00')),
 (Timestamp('2000-01-16 00:00:00'), Timestamp('2000-01-16 00:00:00')),
 (Timestamp('2000-01-21 00:00:00'), Timestamp('2000-01-23 00:00:00'))]

In [22]:
pd.DataFrame(data=np.array(pairs), columns=['B', 'A'])

Unnamed: 0,B,A
0,2000-01-01,2000-01-02
1,2000-01-11,2000-01-09
2,2000-01-16,2000-01-16
3,2000-01-21,2000-01-23


In [26]:
pair_dates(dates_a, dates_b, 'A', 'B', realign=True, method='nearest')

Unnamed: 0,A,B
0,2000-01-02,2000-01-01
1,2000-01-09,2000-01-11
2,2000-01-16,2000-01-16
3,2000-01-23,2000-01-21


In [27]:
pair_dates(dates_b, dates_a, 'B', 'A', realign=True, method='nearest')

Unnamed: 0,B,A
0,2000-01-01,2000-01-02
1,2000-01-11,2000-01-09
2,2000-01-16,2000-01-16
3,2000-01-21,2000-01-23


Wonderful. I'll worry about the cluster methods another time because I'd have to contemplate aggregation methods as well.

In [None]:
usdm_dates = pd.to_datetime(usdm.date.values)
spi_dates = pd.to_datetime(spi['spi_1y'].day.values)

## Pixel Stats