# Compute KNN similarities

Computes similarities between each pair of dates based on how skillfully the history of one date predicts the history of the other.

In [1]:
## Package loading

# Autoreload packages that are modified
%load_ext autoreload
%autoreload 2

# Plotting magic
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

# Load relevant packages
import numpy as np
import pandas as pd
from sklearn import *
import sys
import subprocess
from datetime import datetime, timedelta
import netCDF4
import time
from functools import partial
import os

# When the kernel's working directory is named "experiments", move up two
# levels (presumably to the repository root -- confirm) so that the relative
# paths used in this notebook ('src/experiments', 'data/...') resolve
if os.path.basename(os.getcwd()) == "experiments":
    os.chdir(os.path.join("..",".."))

# Adds 'experiments' folder to path to load experiments_util
sys.path.insert(0, 'src/experiments')
# Load general utility functions
from experiments_util import *
# Load functionality for fitting and predicting
from fit_and_predict import *
# Load functionality for evaluation
from skill import *

## Prepare experimental results directory structure

# hindcast_year selects the mode: None yields forecasts, a specific year yields hindcasts
hindcast_year = None

# Name of this experiment: plain "knn" for forecasts, tagged with the year for hindcasts
if hindcast_year is None:
    experiment = "knn"
else:
    experiment = "knn-hindcast_{}".format(hindcast_year)

# Cache directory for intermediate files that are not specific to a
# submission date
cache_dir = 'knn_mip'
# Create the cache directory the first time the notebook runs
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)

## Select target variable

In [3]:
# Identifier of the ground-truth target; it is embedded in the cache file names below
gt_id = "contest_tmp2m"
# NOTE(review): gt_col and anom_col are not referenced by the cells visible in this
# notebook -- presumably column names expected by downstream code; confirm
gt_col= "tmp2m"
anom_col= "tmp2m"

## Compute ground truth cosine similarities between pairs of dates

In [9]:
# Load the anomaly measurements for the forecasting experiment.
# NOTE(review): when hindcast_year is set, experiment != "knn" and anoms is never
# assigned, so the cells below would fail with a NameError -- confirm which file
# hindcasts are supposed to read
if experiment == "knn":
    anoms=pd.read_hdf('data/tmp2m_western_us_anom_rmm.h5')

In [10]:
# Turn the lat / lon / start_date index levels into ordinary columns so rows
# can be filtered by date
anoms = anoms.reset_index(['lat','lon','start_date'])
# Keep measurements from 1990 onward
anoms = anoms[anoms.start_date >= '1990-01-01']
# Drop every leap day (Feb 29) with a single mask instead of one hard-coded
# filter per leap year, so later leap years (2024, ...) are covered as well
# (assumes start_date is a datetime64 column, as the Timestamp comparisons in
# the previous version of this cell did)
is_leap_day = (anoms.start_date.dt.month == 2) & (anoms.start_date.dt.day == 29)
anoms = anoms[~is_leap_day]



In [11]:
# Reshape to wide form: one row per start date, one column per (lat,lon) grid point
tic()
anoms = anoms.set_index(['lat','lon','start_date']).unstack(['lat','lon'])
toc()
# Discard start dates without any measurement (e.g., leap days, which have no climatology)
anoms = anoms.dropna(axis='index', how='all')
# Rescale every start date's row of measurements to unit Euclidean norm
tic()
norms = np.sqrt(np.square(anoms).sum(axis=1))
anoms = anoms.div(norms, axis=0)
toc()
# Inner products between unit-norm rows are the cosine similarities between
# each pair of dates
tic()
gt_cosines = anoms.dot(anoms.T)
toc()

Elapsed time: 3.546340 seconds.

Elapsed time: 0.059932 seconds.

Elapsed time: 2.568509 seconds.



In [12]:
# Display the date-by-date cosine similarity matrix (rendered as a truncated preview)
gt_cosines

start_date,1990-01-01 00:00:00,1990-01-02 00:00:00,1990-01-03 00:00:00,1990-01-04 00:00:00,1990-01-05 00:00:00,1990-01-06 00:00:00,1990-01-07 00:00:00,1990-01-08 00:00:00,1990-01-09 00:00:00,1990-01-10 00:00:00,...,2020-10-28 00:00:00,2020-10-29 00:00:00,2020-10-30 00:00:00,2020-10-31 00:00:00,2020-11-01 00:00:00,2020-11-02 00:00:00,2020-11-03 00:00:00,2020-11-04 00:00:00,2020-11-05 00:00:00,2020-11-06 00:00:00
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-01,1.000000,0.992420,0.963689,0.914977,0.821526,0.757122,0.781641,0.776229,0.723318,0.687155,...,0.119491,0.178343,0.270634,0.347166,0.379979,0.404886,0.450944,0.501528,0.534553,0.529005
1990-01-02,0.992420,1.000000,0.986677,0.948266,0.861829,0.799444,0.822828,0.819273,0.772256,0.741699,...,0.041951,0.092717,0.182929,0.259321,0.297580,0.323955,0.377489,0.439809,0.497519,0.528903
1990-01-03,0.963689,0.986677,1.000000,0.982950,0.920156,0.868509,0.884219,0.879108,0.840070,0.815496,...,-0.013618,0.017157,0.092162,0.162157,0.197046,0.223191,0.276824,0.347415,0.432424,0.493169
1990-01-04,0.914977,0.948266,0.982950,1.000000,0.971926,0.931000,0.933850,0.930527,0.907576,0.886651,...,0.001678,0.007223,0.061929,0.122826,0.148260,0.173717,0.218901,0.296053,0.393411,0.447613
1990-01-05,0.821526,0.861829,0.920156,0.971926,1.000000,0.986745,0.976383,0.970023,0.961369,0.943361,...,0.019712,-0.004439,0.019602,0.067104,0.070534,0.091953,0.119086,0.192052,0.296350,0.335355
1990-01-06,0.757122,0.799444,0.868509,0.931000,0.986745,1.000000,0.990617,0.980014,0.972623,0.956448,...,0.006634,-0.027753,-0.017937,0.022109,0.016102,0.035231,0.054945,0.122045,0.229734,0.269878
1990-01-07,0.781641,0.822828,0.884219,0.933850,0.976383,0.990617,1.000000,0.994637,0.978212,0.959531,...,-0.013385,-0.033244,-0.008412,0.037099,0.040642,0.057924,0.081639,0.143404,0.239873,0.286459
1990-01-08,0.776229,0.819273,0.879108,0.930527,0.970023,0.980014,0.994637,1.000000,0.987744,0.970622,...,-0.006304,-0.024815,0.004535,0.050213,0.059630,0.076087,0.101741,0.163860,0.250324,0.291592
1990-01-09,0.723318,0.772256,0.840070,0.907576,0.961369,0.972623,0.978212,0.987744,1.000000,0.993559,...,-0.010800,-0.047330,-0.033730,0.007946,0.011762,0.030896,0.059112,0.135352,0.242714,0.288583
1990-01-10,0.687155,0.741699,0.815496,0.886651,0.943361,0.956448,0.959531,0.970622,0.993559,1.000000,...,-0.054210,-0.098830,-0.093253,-0.053202,-0.047745,-0.025305,0.011091,0.096853,0.224403,0.295329


## Define similarity measure

In [13]:
# Each date is represented by its past_days most recent observed measurements (i.e., 
# the past_days most recent measurements at least start_delta days before the date).
# The similarity of two dates is the average cosine similarity of their past_days
# associated measurements.

# The number of past days that should contribute to the measure of similarity
past_days = 60

## Compute similarity measure between pairs of target dates assuming start_delta = 0
That is, assuming that we have access to the ground truth measurement with start date equal to the target date.
Later we will shift by start_delta.

In [14]:
# Check if base similarities have been computed previously
# NOTE: regen_similarities0 = True forces recomputation even when a cached file
# exists; set it to False to reuse the cached file
regen_similarities0 = True
# Store the cache file under cache_dir, like the other cached intermediates
similarities0_file = os.path.join(
    cache_dir, 'similarities0-{}-days{}.h5'.format(gt_id,past_days))
if regen_similarities0 or not os.path.isfile(similarities0_file):
    # Initially incorporate unshifted cosine similarities 
    # (representing the cosine similarity of the first past day)
    tic()
    similarities0 = gt_cosines.copy()
    toc()

    # Now, for each remaining past day, sum over additionally shifted measurements
    # NOTE: this has the effect of ignoring (i.e., skipping over) dates that don't 
    # exist in gt_cosines like leap days
    # NOTE: shifting fills the first m rows and columns with NaN, so the first
    # past_days - 1 rows and columns of the result are NaN
    tic()
    for m in range(1,past_days):
        similarities0 += gt_cosines.shift(m, axis='rows').shift(m, axis='columns')
        # Progress indicator: one number per processed shift
        sys.stdout.write(str(m)+' ')
    toc()

    # Normalize similarities by number of past days
    similarities0 /= past_days
    # Write similarities0 to file
    # (single-argument print(...) behaves the same under Python 2 and Python 3)
    print("Saving similarities0 to "+similarities0_file); tic()
    similarities0.to_hdf(similarities0_file, key="data", mode="w"); toc()
else:
    # Read base similarities from disk
    print("Reading similarities0 from "+similarities0_file); tic()
    similarities0 = pd.read_hdf(similarities0_file); toc()

Elapsed time: 4.272745 seconds.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 Elapsed time: 114.933967 seconds.

Saving similarities0 to knn_mip/similarities0-contest_tmp2m-days60.h5
Elapsed time: 9.644831 seconds.



In [15]:
# Display the base similarities; the leading rows and columns are NaN because the
# shifts in the cell above leave them without a complete set of past days
similarities0

start_date,1990-01-01 00:00:00,1990-01-02 00:00:00,1990-01-03 00:00:00,1990-01-04 00:00:00,1990-01-05 00:00:00,1990-01-06 00:00:00,1990-01-07 00:00:00,1990-01-08 00:00:00,1990-01-09 00:00:00,1990-01-10 00:00:00,...,2020-10-28 00:00:00,2020-10-29 00:00:00,2020-10-30 00:00:00,2020-10-31 00:00:00,2020-11-01 00:00:00,2020-11-02 00:00:00,2020-11-03 00:00:00,2020-11-04 00:00:00,2020-11-05 00:00:00,2020-11-06 00:00:00
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-01,,,,,,,,,,,...,,,,,,,,,,
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,,,,,,,,,,,...,,,,,,,,,,
1990-01-04,,,,,,,,,,,...,,,,,,,,,,
1990-01-05,,,,,,,,,,,...,,,,,,,,,,
1990-01-06,,,,,,,,,,,...,,,,,,,,,,
1990-01-07,,,,,,,,,,,...,,,,,,,,,,
1990-01-08,,,,,,,,,,,...,,,,,,,,,,
1990-01-09,,,,,,,,,,,...,,,,,,,,,,
1990-01-10,,,,,,,,,,,...,,,,,,,,,,


## Define prediction horizon

In [16]:
# Prediction horizon
target_horizon = "34w" # "34w" or "56w"

## Process inputs

# Number of days needed to observe a complete measurement (2 weeks)
aggregation_days = 14

# Only use measurements available this many days prior to 
# official contest submission date
days_early = 365 - (aggregation_days + get_forecast_delta(target_horizon, days_early = 0))

# Number of days between start date of most recently observed measurement
# and start date of target period (2 or 4 weeks plus days_early days ahead)
start_delta = aggregation_days + get_forecast_delta(target_horizon, days_early = days_early)

In [17]:
# Display the resulting gap (in days) between the latest usable measurement and the target date
start_delta

365

## Shift similarities by start_delta
The rows and columns of similarities represent target dates, and the similarities are now based on ground truth measurements from start_delta days prior to each target date.

In [18]:
# The measurement used for a target day starts start_delta days before it, so move
# both axes of similarities0 forward by start_delta days, extending the index as needed
# NOTE: For some reason, shifting columns doesn't extend column index, so the frame
# is transposed and its rows are shifted a second time instead
tic()
similarities = (
    similarities0
    .shift(start_delta, axis='rows', freq='D')
    .transpose()
    .shift(start_delta, axis='rows', freq='D')
)
toc()
# Index extension has the side effect of creating leap days (e.g., 2012-02-29) and removing 
# the date start_delta days later (e.g., datetime.date(2012,2,29) + timedelta(start_delta))
# Add one day to each date in the range [datetime.date(2012,2,29), 
# datetime.date(2012,2,29) + timedelta(start_delta)) to remove leap days
def fix_date(date, delta=None):
    """Move a date one day forward if it falls in a leap-day-affected window.

    The affected window starts on Feb 29 of the date's own year (if that is a
    leap year) or of the previous year (if that is a leap year) and lasts
    `delta` days.

    Args:
        date: pandas Timestamp to check; expected to be at midnight, as in a
            daily index.
        delta: length of the affected window in days; defaults to the
            notebook-level start_delta.

    Returns:
        date + 1 day if date lies in the affected window, otherwise date
        unchanged.
    """
    if delta is None:
        delta = start_delta
    if date.is_leap_year:
        # The window starts in this date's own year
        leap_year = date.year
    elif date.replace(year=date.year-1).is_leap_year:
        # The window starts in the prior year
        leap_year = date.year - 1
    else:
        # Only modify leap year dates and dates following leap year
        return date
    # Compare against the window bounds directly instead of materializing a
    # delta-long date range for every single date
    window_start = pd.Timestamp(leap_year, 2, 29)
    if window_start <= date < window_start + timedelta(delta):
        # Shift date by 1 day if affected
        return date + timedelta(1)
    return date
# Compute the corrected label for every date of the shifted index with fix_date
tic()
new_index = [fix_date(date) for date in similarities.index]
toc()
# NOTE(review): reindex() looks rows up by label (labels missing from the current
# index produce all-NaN rows), whereas the columns are relabeled positionally by
# assignment -- confirm that treating rows and columns differently is intended
tic()
similarities = similarities.reindex(new_index)
similarities.columns = new_index
toc()

Elapsed time: 5.401384 seconds.

Elapsed time: 2.095276 seconds.

Elapsed time: 1.209867 seconds.



In [21]:
# Sanity check: print every column label of viable_similarities that occurs more
# than once (an empty list means all column labels are unique).
# NOTE: viable_similarities is assigned in the "Restrict similarities to viable
# neighbors" cell below, so this cell only works after that cell has been run.
import collections
# (single-argument print(...) behaves the same under Python 2 and Python 3)
print([item for item, count in collections.Counter(viable_similarities.columns).items() if count > 1])

[]


## Restrict similarities to viable neighbors
Viable neighbors are those with available ground truth data (as evidenced by anoms or gt_cosines)

In [20]:
# Check if viable similarities have been computed previously
# NOTE: regen_viable_similarities = True forces recomputation even when a cached
# file exists; set it to False to reuse the cached file
regen_viable_similarities = True
viable_similarities_file = os.path.join(
    cache_dir,'viable_similarities-{}-{}-days{}-early{}.h5'.format(gt_id,target_horizon,past_days,days_early))
if regen_viable_similarities or not os.path.isfile(viable_similarities_file):
    # Keep only the rows whose date also appears in the index of gt_cosines
    viable_similarities = similarities[similarities.index.isin(gt_cosines.index)]
    # (single-argument print(...) behaves the same under Python 2 and Python 3)
    print("Saving viable_similarities to "+viable_similarities_file); tic()
    viable_similarities.to_hdf(viable_similarities_file, key="data", mode="w"); toc()
else:
    # Read viable similarities from disk
    print("Reading viable similarities from "+viable_similarities_file); tic()
    viable_similarities = pd.read_hdf(viable_similarities_file); toc()

Saving viable_similarities to knn_mip/viable_similarities-contest_tmp2m-34w-days60-early337.h5
Elapsed time: 12.941364 seconds.



In [22]:
# Display the similarities restricted to rows with ground truth (rendered as a truncated preview)
viable_similarities

Unnamed: 0_level_0,1991-01-01 00:00:00,1991-01-02 00:00:00,1991-01-03 00:00:00,1991-01-04 00:00:00,1991-01-05 00:00:00,1991-01-06 00:00:00,1991-01-07 00:00:00,1991-01-08 00:00:00,1991-01-09 00:00:00,1991-01-10 00:00:00,...,2021-10-28 00:00:00,2021-10-29 00:00:00,2021-10-30 00:00:00,2021-10-31 00:00:00,2021-11-01 00:00:00,2021-11-02 00:00:00,2021-11-03 00:00:00,2021-11-04 00:00:00,2021-11-05 00:00:00,2021-11-06 00:00:00
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1991-01-01,,,,,,,,,,,...,,,,,,,,,,
1991-01-02,,,,,,,,,,,...,,,,,,,,,,
1991-01-03,,,,,,,,,,,...,,,,,,,,,,
1991-01-04,,,,,,,,,,,...,,,,,,,,,,
1991-01-05,,,,,,,,,,,...,,,,,,,,,,
1991-01-06,,,,,,,,,,,...,,,,,,,,,,
1991-01-07,,,,,,,,,,,...,,,,,,,,,,
1991-01-08,,,,,,,,,,,...,,,,,,,,,,
1991-01-09,,,,,,,,,,,...,,,,,,,,,,
1991-01-10,,,,,,,,,,,...,,,,,,,,,,


In [1]:
# NOTE(review): the execution counter restarts at In [1] here, so this looks like a
# separate session that reloads a previously saved file; it reads from 'knn/' whereas
# the cells above write to 'knn_mip/' -- confirm which file is intended
import pandas as pd
sim=pd.read_hdf('knn/viable_similarities-contest_tmp2m-34w-days60-early337.h5')

In [2]:
# Display the frame loaded from disk (rendered as a truncated preview)
sim

Unnamed: 0_level_0,1987-01-01 00:00:00,1987-01-02 00:00:00,1987-01-03 00:00:00,1987-01-05 00:00:00,1987-01-06 00:00:00,1987-01-07 00:00:00,1987-01-08 00:00:00,1987-01-09 00:00:00,1987-01-10 00:00:00,1987-01-11 00:00:00,...,2019-12-22 00:00:00,2019-12-23 00:00:00,2019-12-24 00:00:00,2019-12-25 00:00:00,2019-12-26 00:00:00,2019-12-27 00:00:00,2019-12-28 00:00:00,2019-12-29 00:00:00,2019-12-30 00:00:00,2019-12-31 00:00:00
start_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1987-01-01,,,,,,,,,,,...,,,,,,,,,,
1987-01-02,,,,,,,,,,,...,,,,,,,,,,
1987-01-03,,,,,,,,,,,...,,,,,,,,,,
1987-01-05,,,,,,,,,,,...,,,,,,,,,,
1987-01-06,,,,,,,,,,,...,,,,,,,,,,
1987-01-07,,,,,,,,,,,...,,,,,,,,,,
1987-01-08,,,,,,,,,,,...,,,,,,,,,,
1987-01-09,,,,,,,,,,,...,,,,,,,,,,
1987-01-10,,,,,,,,,,,...,,,,,,,,,,
1987-01-11,,,,,,,,,,,...,,,,,,,,,,
