# Optimize distance: forecasting of tropical cyclone data as a function of horizon (grid-search)

This notebook accompanies the following publication:
Paul Platzer, Arthur Avenas, Bertrand Chapron, Lucas Drumetz, Alexis Mouche, Léo Vinour. Distance Learning for Analog Methods. 2024. [⟨hal-04841334⟩](https://hal.science/hal-04841334)

It is used to run optimization algorithms for numerical experiments with IBTrACS tropical cyclone data, varying the forecast horizon.

Running the whole optimization loop might be prohibitively long depending on your computational resources. In this case, you might want to run independent loops for each permutation. This solution was used to produce the files "optim_TC_horizon_grid_search_permutXX.npz".

In [1]:
import os
# Cap the thread pools of the numerical back-ends. These variables are typically
# read when the native libraries are first loaded, so they are set BEFORE numpy,
# pandas, matplotlib and scikit-learn are imported; setting them afterwards (as was
# done previously) can leave the caps without effect.
# NOTE(review): this only helps on a fresh kernel in which numpy has not already
# been imported (e.g. by a start-up script).
os.environ["OMP_NUM_THREADS"] = "16"
os.environ["OPENBLAS_NUM_THREADS"] = "16"
os.environ["MKL_NUM_THREADS"] = "16"
os.environ["NUMEXPR_NUM_THREADS"] = "16"
import sys

import numpy as np
import pandas
import matplotlib.pyplot as plt
import matplotlib
from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

# Make the project's helper modules importable from the notebook's location
sys.path.append('../../functions/.')
from analogs import apply_transform, find_analogues, compute_weights, compute_diffs, compute_mae_mad, compute_error
from grid_search import generate_weight_combinations, process_variable, grid_search_CRPS_TC
from TC_utils import M, Rmax_from_M, correct_vmx_ibt, Rmxa23

  from tqdm.autonotebook import tqdm


In [2]:
# Locations (relative to this notebook) of the results written by the notebook
# and of the tropical-cyclone input files it reads.
output_folder = '../../output/tropical_cyclone/'
data_folder = '../../data/tropical_cyclone/'

In [3]:
# Default font size applied to every matplotlib figure of the notebook
matplotlib.rcParams['font.size'] = 14

# List of nine hex colour codes
cols = [
    '#377eb8', '#ff7f00', '#4daf4a',
    '#f781bf', '#a65628', '#984ea3',
    '#999999', '#e41a1c', '#dede00',
]

# Parameters for loading IBTrACS dataset

In [4]:
# One entry per data file. os.listdir() returns entries in arbitrary order, so the
# listing is sorted: the position of a file in this list is later used as the storm
# ID on which the seeded train/test permutation is drawn, and must therefore be the
# same on every machine for the split to be reproducible.
# NOTE(review): every entry of data_folder is assumed to be a track CSV file — confirm.
files = sorted(os.listdir(data_folder))

# Input variables
var_names = ['Vmax', 'Rmax_IBT', 'R34', 'fcor', 'u_trans', 'v_trans']

# Output variable to forecast: derivative of Vmax
var_y = ['Vmax']
dydt = True
# Position of each output variable within var_names
ind_var_y = [var_names.index(name_tmp) for name_tmp in var_y]

# Utils to compute Rmax estimate from Avenas et al. (2023)
var_A23 = ['fcor', 'Vmax', 'R34', ]
# Position of each Rmxa23 input within var_names (same order as var_A23)
ind_A23 = [var_names.index(name_tmp) for name_tmp in var_A23]

# Add names of auxiliary variables (Rmax_A23 and time-derivatives)
var_names_all = var_names + ['Rmax_A23']
# The right-hand side is fully built before being appended, so exactly one
# derivative name is added per base variable (Rmax_A23 included).
var_names_all += ['d'+name+'/dt' for name in var_names_all]

# Add name of time since the threshold of 18m/s is crossed for Vmax
var_names_all.append('t_18')

# Optimization loop on forecast horizon

In [8]:
## Set parameters for optimization
transform_diagonal = np.ones(len(var_names_all))
k = 200
nn_algo = 'kd_tree'
corr_length_train = 24
loo = True
Nperm = 1
Nvars = 3 # maximal number of variables kept in iterated grid-search
thresh_CRPS_gain = 0
n_jobs_variables = -1
n_jobs_nnsearch = -1


def _load_track(path):
    """Read one track file and build its augmented feature matrix.

    Parameters
    ----------
    path : str
        Path of the CSV file; it must contain every column listed in `var_names`.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per record of the file except the first one (dropped
        because the differences need a previous record). Columns, in order: the
        variables of `var_names`, the `Rmxa23` estimate, the difference between
        consecutive records of all the previous columns, and 3 times the row index
        (the notebook counts 3 hours per record).
    """
    # read_csv ignores the order of `usecols` and returns the columns in file order.
    # Re-indexing with [var_names] guarantees that the positional indices ind_A23 and
    # ind_var_y (computed from var_names) point at the intended columns.
    track = np.array(pandas.read_csv(path, usecols = var_names)[var_names])
    rmax_a23 = Rmxa23(track[:,ind_A23[0]] , track[:,ind_A23[1]] , track[:,ind_A23[2]]).reshape(-1,1)
    track = np.concatenate( [ track , rmax_a23 ],  axis=1)
    # Append the increments between consecutive records
    track = np.concatenate( ( track[1:] , track[1:] - track[:-1] ) , axis=1 )
    # Append the elapsed-time column (named 't_18' in var_names_all)
    elapsed = 3*np.arange(len(track)).reshape(-1,1)
    return np.concatenate( [ track , elapsed ],  axis=1)


# The augmented tracks do not depend on the forecast horizon: read and process
# every file once instead of once per horizon.
tracks = [_load_track(data_folder + fname) for fname in files]

# Initialize lists to store results
A_grid = []
E_grid_train = []
E_grid_test = []
hh = [] # horizon index

for h in tqdm(np.arange(4,40+4,4)):
    print('')
    print('Horizon = '+str(h*3)+'hours')
    print('')

    # Predictors: state at each record that still has a record h steps later.
    # Target: change of the output variable(s) over those h steps.
    # ID: index (in `files`) of the track each sample comes from.
    # np.concatenate returns new arrays, so `tracks` is never modified.
    train_x = np.concatenate([track[0:-h,:] for track in tracks])
    train_y = np.concatenate([track[h:,ind_var_y] - track[0:-h,ind_var_y] for track in tracks])
    ID = np.concatenate([np.full(len(track[0:-h,:]), i) for i, track in enumerate(tracks)])

    # center and reduce every predictor column and the target
    # NOTE(review): the statistics are computed on all samples, i.e. before the
    # train/test split below — confirm this is intended.
    mean_IBTrACS = np.mean(train_x, axis=0)
    std_IBTrACS = np.std(train_x, axis=0)
    mean_y = np.mean(train_y, axis=0)
    std_y = np.std(train_y, axis=0)
    train_x = (train_x - mean_IBTrACS) / std_IBTrACS
    train_y = (train_y - mean_y) / std_y

    A_grid_perm = []
    E_grid_perm_train = []
    E_grid_perm_test = []
    random_state_number = []

    for i_perm in range(Nperm):
        # Generate random permutation (reproducible)
        random_state_number.append(1312 + i_perm)
        rs = np.random.RandomState(random_state_number[-1])
        perm = rs.permutation(len(files))
        # The split is done per track: the first third of the permuted track IDs
        # goes to the test set, the remaining tracks to the train set.
        Itest = np.argwhere(np.isin(ID, perm[:len(files)//3]))[:,0]
        Itrain = np.argwhere(np.isin(ID, perm[len(files)//3:]))[:,0]

        result_grid = grid_search_CRPS_TC(train_x, train_y, Itrain, Itest, k = k, nn_algo = nn_algo,
                     thresh_CRPS_gain = thresh_CRPS_gain, Nvars = Nvars, corr_length_train = corr_length_train,
                        n_jobs_variables = n_jobs_variables, n_jobs_nnsearch = n_jobs_nnsearch)

        A_grid_perm.append( result_grid[0].copy() )
        E_grid_perm_train.append( result_grid[1].copy() )
        E_grid_perm_test.append( result_grid[2].copy() )

    A_grid.append( A_grid_perm )
    E_grid_train.append( E_grid_perm_train )
    E_grid_test.append( E_grid_perm_test )
    hh.append(h)


# Stack the per-horizon results (first axis: horizon, second axis: permutation)
A_grid = np.array(A_grid)
E_grid_train = np.array(E_grid_train)
E_grid_test = np.array(E_grid_test)
hh = np.array(hh)

  0%|          | 0/10 [00:00<?, ?it/s]


Horizon = 12hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 24hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 36hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

Exception in thread Thread-5:
Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/home/pplatzer/.local/lib/python3.10/site-packages/tqdm/_monitor.py", line 69, in run
    instances = self.get_instances()
  File "/home/pplatzer/.local/lib/python3.10/site-packages/tqdm/_monitor.py", line 49, in get_instances
    return [i for i in self.tqdm_cls._instances.copy()
  File "/usr/lib/python3.10/_weakrefset.py", line 97, in copy
    return self.__class__(self)
  File "/usr/lib/python3.10/_weakrefset.py", line 51, in __init__
    self.update(data)
  File "/usr/lib/python3.10/_weakrefset.py", line 124, in update
    for element in other:
  File "/usr/lib/python3.10/_weakrefset.py", line 65, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



Horizon = 48hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 60hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 72hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 84hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 96hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 108hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]


Horizon = 120hours

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [9]:
# Save
# Create the output directory if needed: np.savez does not create missing folders,
# and failing here would lose the results of the whole optimization loop.
os.makedirs(output_folder, exist_ok=True)
# random_state_number holds the seeds of the last horizon processed; the seeds are
# built as 1312 + i_perm, so they are the same for every horizon.
np.savez(output_folder + 'optim_TC_horizon_grid_search.npz',
        var_y = np.array(var_y),
        var_names_all = np.array(var_names_all),
        A_grid = A_grid,
        E_grid_train = E_grid_train,
        E_grid_test = E_grid_test,
        hh = hh,
        k = k,
        corr_length_train = corr_length_train,
        random_state_number = np.array(random_state_number)
        )