# Predicting Diatom concentration with functional regression (spatial means)

## Importing

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import r_regression

from skfda.representation.grid import FDataGrid
from skfda.ml.regression import HistoricalLinearRegression, KernelRegression

from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix, LocalLinearRegressionHatMatrix, KNeighborsHatMatrix
from skfda.preprocessing.smoothing import KernelSmoother

from sklearn.metrics import root_mean_squared_error as rmse

import os
import lzma
import dill

import random

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi

import warnings

# `np.warnings` was never public NumPy API and is gone since NumPy 1.24;
# the standard-library module is used directly instead.
warnings.filterwarnings('ignore') # For the nan mean warning


## Datasets Preparation

In [2]:
# Creation of the training - testing datasets
def datasets_preparation(dataset, dataset2, name, inputs_names):
    """Build yearly, spatially averaged input and target arrays.

    ``dataset`` supplies the target variable ``name`` and the time axis
    (``time_counter``); ``dataset2`` supplies the variables listed in
    ``inputs_names`` together with the ``x`` / ``y`` coordinates.

    Every 29 February is removed, the time axis is cut into one equally
    sized chunk per calendar year, a subset of grid cells is selected and
    the selected cells are averaged with ``np.nanmean``.

    Returns the tuple ``(inputs, targets, indx)`` where ``inputs`` is indexed
    as (input variable, day within year, year), ``targets`` as (day within
    year, year) and ``indx`` is the ``np.where`` tuple holding the flattened
    positions of the selected grid cells.
    """
    time = dataset.time_counter
    leap_days = np.where((time.dt.month==2) & (time.dt.day==29))
    n_years = len(np.unique(time.dt.year))

    def flatten_space(field):
        # Keep the leading (time) axis and collapse every other axis into one
        values = field.to_numpy()
        return values.reshape(*values.shape[:1], -1)

    targets = flatten_space(dataset[name])
    inputs = np.array([flatten_space(dataset2[i]) for i in inputs_names])

    # Removing every 29 of February (np.split below needs equally sized years)
    inputs = np.delete(inputs, leap_days, axis=1)
    targets = np.delete(targets, leap_days, axis=0)

    # One chunk per year, then moving the year axis behind the day axis
    inputs = np.transpose(np.array(np.split(inputs, n_years, axis=1)), (1,2,0,3))
    targets = np.transpose(np.array(np.split(targets, n_years, axis=0)), (1,0,2))

    # Coordinates of every flattened grid cell
    x = np.tile(dataset2.x, len(dataset2.y))
    y = np.tile(np.repeat(dataset2.y, len(dataset2.x)),1)

    # Cells without any NaN target in the first day of every year,
    # restricted to the x / y window below
    has_data = ~np.isnan(targets[0]).any(axis=0)
    in_window = (x>10) & ((x>100) | (y<880))
    indx = np.where(has_data & in_window)

    # Spatial mean over the selected cells
    inputs = np.nanmean(inputs[:,:,:,indx[0]], axis=3)
    targets = np.nanmean(targets[:,:,indx[0]], axis=2)

    return(inputs, targets, indx)


## File Creation

In [3]:
def file_creation(path, variable, name):
    """Append ``variable`` to the ``targets_predictions.nc`` file.

    ``variable`` is converted to a dataset whose single variable is called
    ``name`` and written in append mode with zlib compression (level 9).
    ``path`` is concatenated with the file name as-is, so it has to end with
    a path separator.
    """
    compression = {name:{"zlib": True, "complevel": 9}}
    destination = path + 'targets_predictions.nc'

    variable.to_dataset(name=name).to_netcdf(path = destination, mode='a', encoding=compression)
    

## Regressor

In [4]:
def regressor (inputs, targets, lag):
    """Scale, smooth and fit a historical functional linear regression.

    ``inputs`` is a 3-D array whose first axis enumerates the input
    variables and ``targets`` a 2-D array matching the last two axes of
    ``inputs`` (the layout returned by ``datasets_preparation``). ``lag`` is
    forwarded to ``HistoricalLinearRegression``.

    Returns ``(regr, scaler_inputs, scaler_targets, smoother, r_inputs)``:
    the fitted model, the fitted input and target scalers, the fitted kernel
    smoother and the rounded correlation coefficient of every input variable
    with the target.
    """
    n_features = len(inputs)
    full_shape = (n_features, inputs.shape[1], inputs.shape[2])

    # Flattening to (samples, features) / (samples,) in column-major order
    flat_inputs = np.reshape(inputs, (n_features, full_shape[1]*full_shape[2]), order='F').transpose()
    flat_targets = np.reshape(targets, (targets.shape[0]*targets.shape[1]), order='F')

    # Correlation coefficients between each input variable and the target
    r_inputs = np.round(r_regression(flat_inputs, flat_targets), 2)

    # Standardising every input variable, then restoring the original layout
    scaler_inputs = make_column_transformer((StandardScaler(), np.arange(0, n_features)))
    scaled_inputs = scaler_inputs.fit_transform(flat_inputs).transpose()
    inputs = np.reshape(scaled_inputs, full_shape, order='F')

    # Standardising the targets, then restoring the original layout
    scaler_targets = StandardScaler()
    scaled_targets = scaler_targets.fit_transform(np.expand_dims(flat_targets, -1))
    targets = scaled_targets.reshape(targets.shape, order='F')

    # Functional representation: the first axis becomes the sample axis
    inputs = np.transpose(inputs, axes=(2,1,0))
    targets = targets.transpose()
    n_points = len(targets[0])
    inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0, n_points))
    targets = FDataGrid(data_matrix=targets, grid_points=np.arange(0, n_points))

    # Smoothing of the inputs only
    # (alternative kept from the original: targets.to_basis(FourierBasis(n_basis=10)))
    smoother = KernelSmoother(kernel_estimator=LocalLinearRegressionHatMatrix(bandwidth=1))
    inputs = smoother.fit_transform(inputs)

    regr = HistoricalLinearRegression(n_intervals=3, lag=lag).fit(inputs, targets)

    return(regr,scaler_inputs,scaler_targets,smoother,r_inputs)


## Scaling

In [5]:
def scaling(regr,inputs,scaler_inputs,targets,scaler_targets,smoother):
    """Predict with a fitted model and return predictions in target units.

    ``inputs`` is scaled with the already fitted ``scaler_inputs``, wrapped
    in an ``FDataGrid``, smoothed with the fitted ``smoother`` and passed to
    ``regr.predict``. The predictions are evaluated on the same grid,
    transformed back with ``scaler_targets.inverse_transform`` and
    transposed before being returned. ``targets`` is only used for its
    length, which sets the number of grid points.
    """
    n_features, n_rows, n_cols = len(inputs), inputs.shape[1], inputs.shape[2]
    n_points = len(targets)

    # Scaling the inputs: (features, rows*cols) -> (samples, features) and back
    flat = np.reshape(inputs, (n_features, n_rows*n_cols)).transpose()
    flat = scaler_inputs.transform(flat).transpose()
    inputs = np.reshape(flat, (n_features, n_rows, n_cols))

    # Functional representation followed by the fitted smoother
    inputs = FDataGrid(data_matrix=np.transpose(inputs, axes=(2,1,0)), grid_points=np.arange(0, n_points))
    inputs = smoother.transform(inputs)

    predictions = regr.predict(inputs)

    # Post-processing of predictions: back to a plain 2-D array
    predictions = np.array(predictions.to_grid(np.arange(0, n_points)).data_matrix)
    predictions = np.squeeze(predictions, 2)

    # Undoing the target scaling on a single column, then restoring the shape
    column = np.expand_dims(np.ravel(predictions), axis=-1)
    column = scaler_targets.inverse_transform(column)
    predictions = column.reshape(predictions.shape).transpose()

    return(predictions)


## Scatter Plot

In [6]:
def scatter_plot(dates, targets, predictions, name):
    """Scatter predictions against targets, coloured by calendar month.

    Parameters
    ----------
    dates : sequence of datetimes; its first ``len(targets)`` entries supply
        the month used to colour every point.
    targets, predictions : 1-D arrays of equal length.
    name : figure title.

    Returns
    -------
    The slope of the least-squares line fitted to predictions vs targets.
    """
    # One date per plotted point, taken from the start of `dates` (the first
    # year). The original hard-coded 75 points; using the actual length gives
    # the same result for a 75-day season and keeps working for other lengths.
    indx = pd.DatetimeIndex(dates[0:len(targets)])

    # compute slope m and intercept b
    m, b = np.polyfit(targets, predictions, deg=1)

    fig, ax = plt.subplots()

    scatter = ax.scatter(targets,predictions, s = 10, c= indx.month)

    # Common limits for both axes so that the 1:1 line is a true diagonal
    lims = [np.min([ax.get_xlim(), ax.get_ylim()]),
        np.max([ax.get_xlim(), ax.get_ylim()])]

    # plot fitted y = m*x + b
    ax.axline(xy1=(0, b), slope=m, color='r')

    ax.set_xlabel('targets')
    ax.set_ylabel('predictions')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_aspect('equal')

    # Legend labels derived from the months actually plotted (sorted, like the
    # handles returned by legend_elements) instead of a fixed Feb-Apr list.
    month_labels = [pd.Timestamp(year=2001, month=int(month), day=1).month_name()
        for month in np.unique(indx.month)]
    ax.legend(handles=scatter.legend_elements()[0], labels=month_labels)

    # 1:1 reference line
    ax.plot(lims, lims,linestyle = '--',color = 'k')

    fig.suptitle(name)

    plt.show()

    return(m)


## Plotting (Mean Values)

In [7]:
def plotting_mean_values(dates,mean_targets,mean_predictions,category,units,region):
    """Plot the target and prediction series with one x tick per year.

    ``dates`` must expose ``.year`` (e.g. a ``pandas.DatetimeIndex``) and be
    aligned with the two 1-D series. The sample that follows the last sample
    of every year except the final one is masked in both series (presumably
    so that the plotted line is interrupted between consecutive years).
    ``category``, ``units`` and ``region`` are only used in the title.
    """
    years = np.unique(dates.year)

    fig, _ = plt.subplots(figsize=(19,5))

    mean_targets = np.ma.array(mean_targets)
    mean_predictions = np.ma.array(mean_predictions)

    # Position right after the last sample of every year but the final one
    boundaries = [np.where(dates.year==year)[0][-1]+1 for year in years[:-1]]
    for boundary in boundaries:
        mean_targets[boundary] = np.ma.masked
        mean_predictions[boundary] = np.ma.masked

    # A tick at the start of the series and at every year boundary
    ticks = [0] + boundaries

    plt.plot(mean_targets, label = 'targets')
    plt.plot(mean_predictions, label = 'predictions')
    plt.xlabel('Years')
    plt.xticks(ticks,years)
    plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region)
    plt.legend()

    plt.show()


## Evaluation

In [8]:
def evaluation(id,dates,years,targets,predictions,name):
    """Compute per-year skill metrics between targets and predictions.

    Column ``i`` of ``targets`` / ``predictions`` is compared for every entry
    ``i`` of ``years``. When ``id`` equals 0 a scatter plot is also drawn for
    each year.

    Returns three float arrays, one value per year: the correlation
    coefficient (3 decimals), the RMSE (5 decimals) and the slope of the
    least-squares line of predictions against targets (3 decimals).
    """
    r_years, rms_years, slope_years = [], [], []

    # For every year
    for i, year in enumerate(years):

        year_targets = np.ravel(targets[:,i])
        year_predictions = np.ravel(predictions[:,i])

        r_years.append(np.round(np.corrcoef(year_targets, year_predictions)[0][1],3))
        rms_years.append(np.round(rmse(year_targets, year_predictions),5))
        m,_ = np.polyfit(year_targets, year_predictions, deg=1)
        slope_years.append(np.round(m,3))

        if id == 0:
            _ = scatter_plot(dates, targets[:,i], predictions[:,i], name + ' for '+ str(year))

    return np.array(r_years, dtype=float),np.array(rms_years, dtype=float),np.array(slope_years, dtype=float)


## Training

In [None]:
# Training cell: selects the target variable, loads the datasets, builds the
# 2007-2020 training arrays, fits the regressor and predicts on the training
# period. The globals defined here are reused by the cells below.

# Target variable, plus the unit / category strings used in plot titles
name = 'Flagellate'
units = '[mmol m-2]'
category = 'Concentrations'

# Input variables and lag depend on the target; any name other than 'Diatom'
# takes the second branch
if name == 'Diatom':
    inputs_names = ['Summation_of_solar_radiation','Mean_wind_speed','Mean_air_temperature']
    lags = 49.3 # n_intervals=3
    # lags = 55.5 # n_intervals=4

else:
    inputs_names = ['Summation_of_solar_radiation','Mean_air_temperature','Mean_pressure', 'Mean_precipitation', 'Mean_specific_humidity']
    lags = 24.6 # n_intervals=3
    # lags = 18.5 # n_intervals=4

# ds holds the target variables, ds2 the external inputs
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

# Low resolution

# ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
#     x=(np.arange(ds.x[0], ds.x[-1], 5)))

# ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
#     x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# Training period
dataset = ds.sel(time_counter = slice('2007', '2020'))
dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

years = np.unique(dataset.time_counter.dt.year)

inputs,targets,indx = datasets_preparation(dataset,dataset2,name,inputs_names)

# Dates without 29 February, matching what datasets_preparation removes
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

regr,scaler_inputs,scaler_targets,smoother,r_inputs = regressor(inputs, targets, lags)

# Correlation coefficient of every input variable with the target.
# NOTE(review): `display` is not imported in this file; presumably it is the
# IPython/Jupyter built-in - confirm before running outside a notebook.
print('Metrics between input features and '+name)
temp = pd.DataFrame(r_inputs, index=inputs_names)
display(temp)

# Predictions on the training period, in the original target units
predictions = scaling(regr,inputs,scaler_inputs,targets,scaler_targets,smoother)


## Heatmaps

In [None]:
# One heatmap of the fitted regression coefficients per input variable
fig, axs = plt.subplots(1,len(inputs_names), figsize = (15,6), layout='constrained')

# The coefficients do not change inside the loop, so they are extracted once.
# Exact zeros are replaced with NaN before plotting.
temp = regr.coef_
coeff = temp.data_matrix
coeff = np.where(coeff==0,np.nan,coeff)

for i, input_name in enumerate(inputs_names):

    vmin = np.nanmin(coeff[0,:,:,i])
    vmax = np.nanmax(coeff[0,:,:,i])

    # Symmetric colour limits so that zero sits at the centre of 'bwr'
    limit = np.maximum(np.abs(vmin),vmax)

    h = axs[i].imshow(coeff[0,:,:,i], cmap='bwr',aspect='auto', vmin=-limit, vmax=limit)

    # Attach the colorbar explicitly to the panel it describes
    cbar = fig.colorbar(h, ax=axs[i])
    axs[i].set_ylim(axs[i].get_ylim()[::-1])
    axs[i].set_xlabel('Day')
    axs[i].set_ylabel('Day')
    # Per-panel title: fig.suptitle here overwrote the single figure title on
    # every iteration, so only the last input name was ever shown.
    axs[i].set_title(input_name)
    

## Time-series (Training)

In [None]:
# Overall training metrics on the values in original target units
r_train = np.round(np.corrcoef(np.ravel(targets),np.ravel(predictions))[0][1],3)
rms_train = rmse(np.ravel(targets),np.ravel(predictions))
m,_ = np.polyfit(np.ravel(targets), np.ravel(predictions), deg=1)
slope_train = np.round(m,3)

print ('The correlation coefficient during training is: ' + str(r_train))
print ('The rmse during training is: ' + str(rms_train))
print('The slope of the best fitting line during training is: '+str(slope_train))
print('\n')

# Training dates without 29 February (recomputed so the cell can run on its own)
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

# plotting_mean_values expects (..., category, units, region); the original
# call passed units and category in the opposite order, garbling the title.
plotting_mean_values(dates, np.reshape(targets,targets.shape[0]*targets.shape[1],order='F'), np.reshape(predictions,predictions.shape[0]*predictions.shape[1],order='F'), category, units, 'Salish Sea')

# Mean over the years (axis 1) for every position along axis 0
season = np.mean(targets,axis=1)
plt.plot(season)
plt.suptitle('Long-term seasonality (2007-2020)')
plt.show()

season_train = np.tile(season,targets.shape[1]) # Broadcasting season to all training years
season_train = np.reshape(season_train,(targets.shape[0],targets.shape[1]),order='F')

# Number of testing years, computed once instead of twice
n_test_years = len(np.unique(ds.sel(time_counter = slice('2021', '2024')).time_counter.dt.year))
season_test = np.tile(season,n_test_years) # Broadcasting season to all testing years
season_test = np.reshape(season_test,(targets.shape[0],n_test_years),order='F')

# Same metrics after subtracting the seasonality from targets and predictions
r_train_season = np.round(np.corrcoef(np.ravel(targets-season_train),np.ravel(predictions-season_train))[0][1],3)
rms_train_season = rmse(np.ravel(targets-season_train),np.ravel(predictions-season_train))
m,_ = np.polyfit(np.ravel(targets-season_train), np.ravel(predictions-season_train), deg=1)
slope_train_season = np.round(m,3)

print ('The correlation coefficient during training is: ' + str(r_train_season))
print ('The rmse during training is: ' + str(rms_train_season))
print('The slope of the best fitting line during training is: '+str(slope_train_season))

# Region label made consistent with the testing cell so the two training
# figures can be told apart
plotting_mean_values(dates, np.reshape(targets-season_train,targets.shape[0]*targets.shape[1],order='F'), np.reshape(predictions-season_train,predictions.shape[0]*predictions.shape[1],order='F'), category, units, 'Salish Sea (removed seasonality)')


In [None]:
# Inspection cell: shows the first column of `targets`
targets[:,0]

## Other Years

In [None]:
# Testing period
dataset = ds.sel(time_counter = slice('2021', '2024'))
dataset2 = ds2.sel(time_counter = slice('2021', '2024'))

dates = pd.DatetimeIndex(dataset['time_counter'].values)
years = np.unique(dataset.time_counter.dt.year)

# Dates without 29 February, matching what datasets_preparation removes
indx = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx]

inputs, targets, indx = datasets_preparation(dataset, dataset2, name, inputs_names)

# Predictions with the model and scalers fitted on the training period
predictions = scaling(regr,inputs,scaler_inputs,targets,scaler_targets,smoother)

# id=0 also draws one scatter plot per year.
# NOTE(review): the per-year metrics returned here are overwritten by the
# overall metrics just below - confirm that only the plots are wanted.
r_test,rms_test,slope_test = evaluation(0,dates,years,targets,predictions,name)

r_test = np.round(np.corrcoef(np.ravel(targets),np.ravel(predictions))[0][1],3)
rms_test = rmse(np.ravel(targets),np.ravel(predictions))
m,_ = np.polyfit(np.ravel(targets), np.ravel(predictions), deg=1)
slope_test = np.round(m,3)

print ('The correlation coefficient during testing is: ' + str(r_test))
print ('The rmse during testing is: ' + str(rms_test))
print ('The slope of the best fitting line during testing is: ' + str(slope_test))
print ('\n')

# plotting_mean_values expects (..., category, units, region); the original
# calls passed units and category in the opposite order, garbling the title.
plotting_mean_values(dates, np.reshape(targets,targets.shape[0]*targets.shape[1],order='F'), np.reshape(predictions,predictions.shape[0]*predictions.shape[1],order='F'), category, units, 'Salish Sea')

# Same metrics after subtracting the training seasonality
r_test_season = np.round(np.corrcoef(np.ravel(targets-season_test),np.ravel(predictions-season_test))[0][1],3)
rms_test_season = rmse(np.ravel(targets-season_test),np.ravel(predictions-season_test))
m,_ = np.polyfit(np.ravel(targets-season_test),np.ravel(predictions-season_test), deg=1)
slope_test_season = np.round(m,3)

print ('The correlation coefficient during testing is: ' + str(r_test_season))
print ('The rmse during testing is: ' + str(rms_test_season))
print ('The slope of the best fitting line during testing is: ' + str(slope_test_season))
print ('\n')

# NOTE(review): this replaces the overall seasonality-removed metrics printed
# above with per-year arrays, while r_test / rms_test / slope_test stay
# overall scalars - confirm which form is meant to be saved.
r_test_season,rms_test_season,slope_test_season = evaluation(1,dates,years,targets-season_test,predictions-season_test,name)
plotting_mean_values(dates, np.reshape(targets-season_test,targets.shape[0]*targets.shape[1],order='F'), np.reshape(predictions-season_test,predictions.shape[0]*predictions.shape[1],order='F'), category, units, 'Salish Sea (removed seasonality)')


## Saving

In [14]:
# path = '/data/ibougoudis/MOAD/files/results/' + name + '/func_reg_s/'

# os.makedirs(path, exist_ok=True)
# with lzma.open(path + 'regr_all.xz', 'wb') as f:
    
#     dill.dump(regr, f)

# with open(path + 'metrics.pkl', 'wb') as f:
#     dill.dump([r_train,rms_train,slope_train,r_test,rms_test,slope_test,r_test_season,rms_test_season,slope_test_season], f)
