In [1]:
import util
import xarray 
import numpy as np
ds = util.get_dataset()
sst_ds = xarray.open_dataset("../data/sst.wkmean.1990-present.nc")
sst_ds = sst_ds.assign_coords(lon=(((sst_ds.lon + 180) % 360) - 180)) ## Use consistent longitude
train_storms,valid_storms,test_storms = util.train_validation_test(ds,seed=42)

In [20]:
import numpy as np 

def coriolis_parameter(lat):
    return 2*np.sin(np.deg2rad(lat))

def make_X_y(ds,selected_storms,timesteps=5):
    Xout = []
    yout =[]
    storms = []
    for stormidx,storm in enumerate(selected_storms):
        usa_pres = ds.usa_pres.loc[storm]
        usa_wind = ds.usa_wind.loc[storm]
        ## All enteries have 360 points.
        valid_coords = ~(np.isnan(usa_wind) | np.isnan(usa_pres))
        lat = ds.lat.loc[storm][valid_coords]
        lon = ds.lon.loc[storm][valid_coords]
        storm_speed = ds.storm_speed.loc[storm][valid_coords]
        storm_dir = ds.storm_dir.loc[storm][valid_coords]
        usa_pres = usa_pres[valid_coords]
        usa_wind = usa_wind[valid_coords]
        time = ds.time.loc[storm][valid_coords]
        cor_param = coriolis_parameter(lat)
        try:
            sst = sst_ds.sst.interp(time=time,lat=lat,lon=lon)
        except ValueError:
            continue
        if np.isnan(sst).any():
            continue

        X = np.transpose(np.array([usa_wind,usa_pres,storm_speed,storm_dir,cor_param,sst,lat,lon]))
        for i in range(0,len(usa_wind)):
            if i+timesteps+1>=len(usa_wind):
                break
            Xout.append(X[i:i+timesteps])
            yout.append(X[i+timesteps+1][:-4])
            storms.append(stormidx)
    return np.stack(Xout),np.stack(yout),np.array(storms)

In [21]:
TIMEPOINTS = 10
x_train,y_train,storm_train = make_X_y(ds,train_storms,TIMEPOINTS)
x_valid,y_valid,storm_valid = make_X_y(ds,valid_storms,TIMEPOINTS)

### Training

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=150,n_jobs=-1)
rf.fit(x_train.reshape((x_train.shape[0],-1)),y_train)

RandomForestRegressor(n_estimators=150, n_jobs=-1)

In [23]:
pred_valid = rf.predict(x_valid.reshape((x_valid.shape[0],-1)))



In [24]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_valid,pred_valid,multioutput='raw_values',squared=False)

array([ 5.55696218,  4.34667666,  3.32375799, 55.78284987])

#### cross validation train

In [25]:
from sklearn.model_selection import GroupKFold,cross_val_score
cv = GroupKFold(n_splits=5)
cross_val_score(rf,x_train.reshape((x_train.shape[0],-1)),y_train,groups=storm_train,cv=cv,scoring='neg_root_mean_squared_error',n_jobs=-1)

array([-18.00820017, -18.80518828, -18.07674251, -17.92560441,
       -18.38403479])

### Prediction on full test set