# Example notebook for collecting data, training and evaluating the model

## Importing

In [25]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPRegressor
import xgboost as xgb

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse

import salishsea_tools.viz_tools as sa_vi

## Files Reading

In [26]:
# The location of the file
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

ds = ds.isel(time_counter = (np.arange(0, len(ds.time_counter),2)), 
    y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(time_counter = (np.arange(0, len(ds2.time_counter),2)), 
    y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# Selecting the first 2 years
dataset = ds.sel(time_counter = slice('2007-2-15', '2007-4-30'))
dataset2 = ds2.sel(time_counter = slice('2007-2-15', '2007-4-30'))


## Datasets Preparation

In [27]:
# Here I am packing all of them in one variable, named drivers

drivers = np.stack([np.ravel(dataset['Temperature_(0m-15m)']),
        np.ravel(dataset['Temperature_(15m-100m)']), 
        np.ravel(dataset['Salinity_(0m-15m)']),
        np.ravel(dataset['Salinity_(15m-100m)']),
        np.ravel(dataset2['Summation_of_solar_radiation']),
        np.ravel(dataset2['Mean_wind_speed']),
        np.ravel(dataset2['Mean_air_temperature']),
        ])

# Removing of nans
indx = np.where(~np.isnan(drivers).any(axis=0))
drivers = drivers[:,indx[0]]

diat = np.ravel(dataset['Diatom_Production_Rate'])
diat = diat[indx]

# Transpose to bring it to the format (samples, features)
drivers = drivers.transpose()


## Training 

In [28]:

regr = TransformedTargetRegressor(regressor=make_pipeline(MinMaxScaler(),
    xgb.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)), 
    transformer=MinMaxScaler())

regr = BaggingRegressor(regr,n_estimators=10, n_jobs=10).fit(drivers,diat)


In [29]:
dataset0 = ds.sel(time_counter = slice('2008-2-15', '2008-4-30'))
dataset02 = ds2.sel(time_counter = slice('2008-2-15', '2008-4-30'))


In [53]:
# Here I am packing all of them in one variable, named drivers

drivers2 = np.stack([np.ravel(dataset0['Temperature_(0m-15m)']),
        np.ravel(dataset0['Temperature_(15m-100m)']), 
        np.ravel(dataset0['Salinity_(0m-15m)']),
        np.ravel(dataset0['Salinity_(15m-100m)']),
        np.ravel(dataset02['Summation_of_solar_radiation']),
        np.ravel(dataset02['Mean_wind_speed']),
        np.ravel(dataset02['Mean_air_temperature']),
        ])

# Removing of nans
indx = np.where(~np.isnan(drivers2).any(axis=0))
drivers2 = drivers2[:,indx[0]]

diat2 = np.ravel(dataset['Diatom_Production_Rate'])
diat2 = diat2[indx]

# Transpose to bring it to the format (samples, features)
drivers2 = drivers2.transpose()


In [31]:

predictions = regr.predict(drivers2)
print(np.corrcoef(diat2,predictions)[0][1])
print(rmse(diat2,predictions))

0.5000662546559903
1.4363070780496323e-06


In [32]:
predictions.min()

3.6796973e-07