# Select variables: forecasting of tropical cyclone data at 1-day horizon

This notebook accompanies the following publication:
Paul Platzer, Arthur Avenas, Bertrand Chapron, Lucas Drumetz, Alexis Mouche, Léo Vinour. Distance Learning for Analog Methods. 2024. [⟨hal-04841334⟩](https://hal.science/hal-04841334)

It is used to run grid-search optimization algorithms for numerical experiments with IBTrACS tropical cyclone data, at 1-day forecast horizon. The aim is to select the most relevant variables for this forecast and for this particular dataset.

In [1]:
import os
import sys

# Cap the native thread pools. These variables are set BEFORE numpy /
# scikit-learn are imported: the BLAS and OpenMP runtimes typically read them
# once, when the shared libraries are loaded, so setting them afterwards would
# not affect the current process.
os.environ["OMP_NUM_THREADS"] = "16"
os.environ["OPENBLAS_NUM_THREADS"] = "16"
os.environ["MKL_NUM_THREADS"] = "16"
os.environ["NUMEXPR_NUM_THREADS"] = "16"

import numpy as np
import pandas
import matplotlib.pyplot as plt
import matplotlib
from tqdm.notebook import tqdm
from tqdm_joblib import tqdm_joblib
from sklearn.neighbors import NearestNeighbors

# Project-local helpers live in the repository's `functions` directory
sys.path.append('../../functions/.')
from analogs import apply_transform, find_analogues, compute_weights, compute_diffs, compute_mae_mad, compute_error
from grid_search import generate_weight_combinations, process_variable, grid_search_CRPS_TC
from TC_utils import M, Rmax_from_M, correct_vmx_ibt, Rmxa23

  from tqdm.autonotebook import tqdm


In [2]:
# Use a larger default font for every figure in this notebook
matplotlib.rcParams['font.size'] = 14

In [3]:
# Colour palette (hex codes) shared by the plots
cols = [
    '#377eb8',
    '#ff7f00',
    '#4daf4a',
    '#f781bf',
    '#a65628',
    '#984ea3',
    '#999999',
    '#e41a1c',
    '#dede00',
]

In [4]:
# Track files are read from the first folder; results are written to the second
data_folder, output_folder = (
    '../../data/tropical_cyclone/',
    '../../output/tropical_cyclone/',
)

# Parameters for loading IBTrACS dataset

In [5]:
# os.listdir returns entries in arbitrary, platform-dependent order. The
# position of a file in this list is used as its track ID and is what the
# seeded permutations shuffle, so sort to make the train/test splits
# reproducible from one machine (or run) to the next.
files = sorted(os.listdir(data_folder))

# Predictor columns read from each track file
var_names = ['Vmax', 'Rmax_IBT', 'R34', 'fcor', 'u_trans', 'v_trans']
_names_arr = np.array(var_names)

# Forecast target: Vmax, predicted through its time-derivative
var_y = ['Vmax']
dydt = True
ind_var_y = [np.argwhere(_names_arr == target)[0][0] for target in var_y]

# Columns needed for the Rmax estimate of Avenas et al. (2023),
# in the order they are passed to the estimator
var_A23 = ['fcor', 'Vmax', 'R34', ]
ind_A23 = [np.argwhere(_names_arr == column)[0][0] for column in var_A23]

# Complete list of names: the inputs plus the auxiliary Rmax_A23, followed by
# the time-derivative of each of those, followed by 't_18' (time since Vmax
# crossed the 18 m/s threshold)
_base_names = var_names + ['Rmax_A23']
var_names_all = (
    _base_names
    + ['d' + base + '/dt' for base in _base_names]
    + ['t_18']
)

# Loading and preprocessing dataset

In [6]:
## Set forecast time-horizon (multiple of 3 hours)

h = 8


## Load dataset

def _load_track(path):
    """Read one track file and return its feature matrix.

    Columns of the returned array follow ``var_names_all``: the raw inputs,
    the Rmax estimate of Avenas et al. (2023), the one-step differences of all
    of those, and finally a time stamp equal to 3 x the row index.
    The first row of the file is dropped because it has no previous sample to
    difference against.
    """
    # `usecols` only filters columns: pandas keeps the file's own column
    # order. Re-index with `var_names` so that positions match the indices
    # (ind_var_y, ind_A23) computed from that list.
    track = np.array(pandas.read_csv(path, usecols=var_names)[var_names])
    rmax_a23 = Rmxa23(track[:, ind_A23[0]], track[:, ind_A23[1]], track[:, ind_A23[2]])
    track = np.concatenate([track, rmax_a23.reshape(-1, 1)], axis=1)
    # Append one-step increments (the 'd<name>/dt' columns)
    track = np.concatenate((track[1:], track[1:] - track[:-1]), axis=1)
    # Append the 't_18' column: 3 x row index
    elapsed = 3 * np.arange(len(track)).reshape(-1, 1)
    return np.concatenate([track, elapsed], axis=1)


# Collect per-track pieces and concatenate once at the end (avoids re-copying
# the growing arrays for every file).
x_parts, y_parts, id_parts = [], [], []
for i, fname in enumerate(files):
    IBT = _load_track(os.path.join(data_folder, fname))
    # Number of (state, state h steps later) pairs this track can provide
    n_pairs = max(len(IBT) - h, 0)
    x_parts.append(IBT[:n_pairs, :])
    # Target: increment of the forecast variable(s) over the horizon
    y_parts.append(IBT[h:h + n_pairs, ind_var_y] - IBT[:n_pairs, ind_var_y])
    # Track identifier = position of the file in `files`; explicit integer
    # dtype so that a track contributing zero rows cannot upcast ID to float
    id_parts.append(np.full(n_pairs, i, dtype=int))

train_x = np.concatenate(x_parts)
train_y = np.concatenate(y_parts)
ID = np.concatenate(id_parts)

# center and reduce (column-wise standardisation)
mean_IBTrACS = np.mean(train_x, axis=0)
std_IBTrACS = np.std(train_x, axis=0)
mean_y = np.mean(train_y, axis=0)
std_y = np.std(train_y, axis=0)
train_x = (train_x - mean_IBTrACS) / std_IBTrACS
train_y = (train_y - mean_y) / std_y

# Optimization: loop over permutations and regularization parameter

In [7]:
# Optimization settings. Most are forwarded to grid_search_CRPS_TC below;
# transform_diagonal and loo are defined here but not used in this cell.
transform_diagonal = np.ones(len(var_names_all))
k = 200
nn_algo = 'kd_tree'
corr_length_train = 24
loo = True
Nperm = 20
Nvars = 3 # maximal number of variables kept in iterated grid-search
thresh_CRPS_gain = 0
n_jobs_variables = 4
n_jobs_nnsearch = -1

# One entry per processed permutation
A_grid, E_grid_train, E_grid_test, random_state_number = [], [], [], []

# A third of the track files is held out for testing in every permutation
n_test_files = len(files) // 3

# Only permutations 13 .. Nperm-1 are processed in this run
for i_perm in tqdm(range(13, Nperm)):
    # Reproducible random split of the track files
    seed = 1312 + i_perm
    random_state_number.append(seed)
    perm = np.random.RandomState(seed).permutation(len(files))
    in_test = np.isin(ID, perm[:n_test_files])
    in_train = np.isin(ID, perm[n_test_files:])
    Itest = np.argwhere(in_test)[:, 0]
    Itrain = np.argwhere(in_train)[:, 0]

    # Grid-search optimization on this split
    result_grid = grid_search_CRPS_TC(
        train_x, train_y, Itrain, Itest,
        k = k,
        nn_algo = nn_algo,
        thresh_CRPS_gain = thresh_CRPS_gain,
        Nvars = Nvars,
        corr_length_train = corr_length_train,
        n_jobs_variables = n_jobs_variables,
        n_jobs_nnsearch = n_jobs_nnsearch,
    )

    # Keep copies of the first three returned items
    A_grid.append(result_grid[0].copy())
    E_grid_train.append(result_grid[1].copy())
    E_grid_test.append(result_grid[2].copy())

# Stack the per-permutation results into arrays
A_grid = np.array(A_grid)
E_grid_train = np.array(E_grid_train)
E_grid_test = np.array(E_grid_test)

  0%|          | 0/7 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

No. of vars selected = 1/15


Processing Variables:   0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

No. of vars selected = 2/15


Processing Variables:   0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/14 [00:00<?, ?it/s]

No. of vars selected = 3/15


Processing Variables:   0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/13 [00:00<?, ?it/s]

In [8]:
# Make sure the three result containers are numpy arrays
A_grid, E_grid_train, E_grid_test = (
    np.array(A_grid),
    np.array(E_grid_train),
    np.array(E_grid_test),
)

In [9]:
# Write this run's results, together with the settings that produced them,
# to a single .npz archive
partial_results = {
    'var_y': np.array(var_y),
    'var_names_all': np.array(var_names_all),
    'A_grid': A_grid,
    'E_grid_train': E_grid_train,
    'E_grid_test': E_grid_test,
    'h': h,
    'k': k,
    'corr_length_train': corr_length_train,
    'random_state_number': random_state_number,
}
np.savez(output_folder + 'select_vars_TC_grid_search_ter.npz', **partial_results)

# Re-process multiple outputs (subcomputations over different permutations)

In [19]:
## Load first file
# `with` closes the archive once the arrays have been read; indexing an
# NpzFile loads the array into memory, so the values stay valid afterwards.
with np.load(output_folder + 'select_vars_TC_grid_search_primo.npz') as npzfile:
    # fixed parameters that are unchanged between different files
    var_y = npzfile['var_y']
    var_names_all = npzfile['var_names_all']
    h = npzfile['h']
    k = npzfile['k']
    corr_length_train = npzfile['corr_length_train']
    # results that differ from one file to the next (one row per permutation)
    A_grid = npzfile['A_grid']
    E_grid_train = npzfile['E_grid_train']
    E_grid_test = npzfile['E_grid_test']
    # keep only as many seeds as there are stored results
    random_state_number = npzfile['random_state_number'][:len(A_grid)]

# Gather the pieces and concatenate once after the loop
A_parts = [A_grid]
E_train_parts = [E_grid_train]
E_test_parts = [E_grid_test]
seed_parts = [random_state_number]

## Load other files (only parameters that change)
for file_suffix in ['bis','ter']:
    with np.load(output_folder + 'select_vars_TC_grid_search_' + file_suffix + '.npz') as npzfile:
        # results that differ from one file to the next
        A_grid_new = npzfile['A_grid']
        E_grid_train_new = npzfile['E_grid_train']
        E_grid_test_new = npzfile['E_grid_test']
        random_state_number_new = npzfile['random_state_number'][:len(A_grid_new)]

    A_parts.append(A_grid_new)
    E_train_parts.append(E_grid_train_new)
    E_test_parts.append(E_grid_test_new)
    seed_parts.append(random_state_number_new)

# Stack the results of all runs along the permutation axis
A_grid = np.concatenate(A_parts, axis = 0)
E_grid_train = np.concatenate(E_train_parts, axis = 0)
E_grid_test = np.concatenate(E_test_parts, axis = 0)
random_state_number = np.concatenate(seed_parts, axis = 0)

In [21]:
## Write the merged results of all runs to a single archive
merged_results = {
    'var_y': var_y,
    'var_names_all': var_names_all,
    'A_grid': A_grid,
    'E_grid_train': E_grid_train,
    'E_grid_test': E_grid_test,
    'h': h,
    'k': k,
    'corr_length_train': corr_length_train,
    'random_state_number': random_state_number,
}
np.savez(output_folder + 'select_vars_TC_grid_search.npz', **merged_results)