## am4 random forest prelim

In [32]:
import json
BASEDIR = "../data/am4aqua/am4_all_cyclones.json"

with open(BASEDIR) as f:
    am4runs = json.load(f)
len(am4runs)

9

In [33]:
am4runsflat = {}
for sstmax in am4runs.keys():
    for date in am4runs[sstmax].keys():
        am4runsflat[f"{sstmax}_{date}"] = am4runs[sstmax][date]
del am4runs
len(am4runsflat)

2349

## preprocessing 


In [34]:
from typing import Tuple
import numpy as np
R = 1769.72

def haversine(lat_0:float,lat_1:float,lon_0:float,lon_1:float)->float:
    return 2*np.arcsin(np.sqrt(np.sin((lat_1 - lat_0)/2)**2 + np.cos(lat_1)*np.cos(lat_0)*np.sin((lon_1 - lon_0)/2)**2 ))

def bearing(lat_0:float,lat_1:float,lon_0:float,lon_1)->float:
    x = np.cos(lat_1)*np.sin(lon_1 - lon_0)
    y = np.arccos(lat_0)*np.sin(lat_1) - np.sin(lat_0)*np.cos(lat_1)*np.cos(lon_1 - lon_0)
    return np.arctan2(x,y)


def get_mean_speed(lat_0:float,lat_1:float,lon_0:float,lon_1:float,dt:float=6.0)->Tuple[float,float]:
    """
    u,v
    """
    d = haversine(np.deg2rad(lat_0),np.deg2rad(lat_1),np.deg2rad(lon_0-180),np.deg2rad(lon_1-180))
    theta = bearing(np.deg2rad(lat_0),np.deg2rad(lat_1),np.deg2rad(lon_0-180),np.deg2rad(lon_1-180))
    
    return R*d*np.sin(theta)/dt,R*d*np.cos(theta)/dt



In [35]:
for key,track in am4runsflat.items():
    am4runsflat[key] = [ [*track[i],*get_mean_speed(track[i-1][1],track[i][1],track[i-1][0],track[i][0])] for i in range(1,len(track))]

  y = np.arccos(lat_0)*np.sin(lat_1) - np.sin(lat_0)*np.cos(lat_1)*np.cos(lon_1 - lon_0)


In [36]:
am4runsflat

{'5.0_12_1983_1_1_6': [[96.56,
   11.25,
   19.72,
   989.44,
   0.00063339,
   26.148821287496784,
   5.0,
   0.429827614608441,
   3.1007240199756345],
  [96.56,
   12.25,
   19.17,
   990.2,
   0.0008901,
   25.858860699411984,
   5.0,
   0.0,
   5.147906806399007],
  [96.56, 12.25, 23.69, 987.32, 0.00104617, 25.858860699411984, 5.0, 0.0, 0.0],
  [96.56, 12.25, 23.29, 988.7, 0.00087291, 25.858860699411984, 5.0, 0.0, 0.0],
  [97.19,
   13.25,
   21.57,
   988.13,
   0.00089575,
   25.528588076542967,
   5.0,
   0.6177860739346732,
   6.0103990593605685],
  [96.56,
   13.75,
   21.19,
   988.31,
   0.00083967,
   25.348666359157953,
   5.0,
   -0.45350326809818675,
   4.045306829389697],
  [96.56, 13.75, 23.65, 988.66, 0.00074439, 25.348666359157953, 5.0, 0.0, 0.0],
  [96.56,
   14.75,
   24.1,
   987.37,
   0.0007798,
   24.95998572646531,
   5.0,
   0.0,
   5.147906806399015],
  [97.19,
   15.25,
   21.45,
   986.62,
   0.00083814,
   24.75154382465679,
   5.0,
   0.4317118312147621

In [37]:
import numpy as np

def get_train_test_idx(indict:dict,seed=42,split=0.3):
    train_to = int(np.ceil((1-split)*len(indict)))
    rng = np.random.default_rng(seed=seed)
    keys = list(indict.keys())
    rng.shuffle(keys)
    return set(keys[:train_to]),set(keys[train_to:])

In [38]:
train,test = get_train_test_idx(am4runsflat,split=0.1)
x_trainval = {k:v for k,v in am4runsflat.items() if k in train}
x_test = {k:v for k,v in am4runsflat.items() if k in test}
train,val = get_train_test_idx(x_trainval,split=0.25)
x_train = {k:v for k,v in x_trainval.items() if k in train}
x_val = {k:v for k,v in x_trainval.items() if k in val}

In [39]:
def process_into_chunks(data:dict,timestep:int=5,train_idx=None,y_idx=None)->Tuple[np.ndarray,np.ndarray]:
    X = []
    y = []
    for item in data.values():
        for t in range(len(item)-timestep-1):
            X.append([item[t+i] for i in range(timestep)])
            y.append(item[t+timestep][:-2])
    return np.stack(X),np.stack(y)

In [40]:
X_train,y_train = process_into_chunks(x_train,timestep=5)
X_validation,y_validation = process_into_chunks(x_val,timestep=5)

## training

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs=-1)
rf.fit(X_train.reshape((-1,X_train.shape[-1]*X_train.shape[-2])),y_train)

RandomForestRegressor(n_jobs=-1)

In [9]:
y_valpredict = rf.predict(X_validation.reshape((-1,X_validation.shape[-1]*X_validation.shape[-2])))

In [10]:
from sklearn.metrics.pairwise import haversine_distances

def get_total_error(y_true,y_pred):
    """
    Calculates error, uses great circle distance for lat lon, absolute square error for other variables
    """
    true_pos = y_true[:,(1,0)]*np.pi/180
    pred_pos = y_pred[:,(1,0)]*np.pi/180
    return np.mean(haversine_distances(true_pos,pred_pos)),np.dot(y_true[:,3] - y_pred[:,3],y_true[:,3] - y_pred[:,3]),np.dot(y_true[:,4] - y_pred[:,4],y_true[:,4] - y_pred[:,4])

In [11]:
get_total_error(y_validation,y_valpredict)

(1.195309246230868, 142392.91215743034, 0.0006186709752049992)

In [15]:
y_validation[:,(1,0)]

array([[   9.75, -157.81],
       [  10.25, -158.44],
       [  10.25, -159.69],
       ...,
       [  47.75,  130.94],
       [  47.25,  129.69],
       [  46.75,  128.44]])

In [14]:
y_valpredict[:,(1,0)]

array([[  12.985 , -157.7756],
       [  16.895 , -157.9746],
       [  14.835 , -158.094 ],
       ...,
       [  47.575 ,  131.013 ],
       [  47.925 ,  131.1   ],
       [  46.5   ,  130.1939]])

In [19]:
X_validation[0,:,(1,0)]

array([[   9.25,    9.25,    9.25,    9.75,    9.75],
       [-154.06, -154.69, -155.31, -155.94, -156.56]])

In [20]:
y_valpredict[0]

array([-1.577756e+02,  1.298500e+01,  2.161670e+01,  9.926874e+02,
        7.882141e-04])