# Predicting Diatom concentration with functional clustering and a Histogram-based Gradient Boosting Regression Tree

## Importing

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

from sklearn.compose import make_column_transformer
from skfda.representation.grid import FDataGrid
from skfda.ml.clustering import KMeans

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.feature_selection import r_regression

from sklearn.metrics import root_mean_squared_error as rmse

import os
import lzma
import dill

import random

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi

import warnings  # stdlib module; the `np.warnings` alias is not exposed by recent NumPy releases (1.24+)

# Silence every warning for the rest of the session (originally added for the
# "nan mean" warning raised when np.nanmean meets an all-NaN slice).
warnings.filterwarnings('ignore')


## Datasets Preparation

In [2]:
# Creation of the training - testing datasets
def datasets_preparation(dataset, dataset2, clusters, name):
    """Flatten the gridded fields into per-sample model inputs and targets.

    Parameters
    ----------
    dataset : object holding the target field ``dataset[name]`` and the
        ``time_counter``, ``y`` and ``x`` coordinates.
    dataset2 : mapping holding the four driver fields.
    clusters : 2-D (y, x) map of cluster labels.
    name : key of the target variable inside ``dataset``.

    Returns
    -------
    (inputs, targets, indx, clusters) where ``inputs`` has one row per kept
    sample and five columns (four drivers + day of the year), ``targets`` and
    ``clusters`` are the matching 1-D arrays, and ``indx`` is the ``np.where``
    tuple of kept positions in the flattened (time, y, x) array.
    """
    n_time = len(dataset.time_counter)
    n_y = len(dataset.y)
    n_x = len(dataset.x)

    driver_names = (
        'Summation_of_solar_radiation',
        'Mean_air_temperature',
        'Mean_precipitation',
        'Mean_pressure',
    )

    # One flattened row per feature; the day of the year is repeated for every
    # grid cell of the corresponding time step
    rows = [np.ravel(dataset2[key]) for key in driver_names]
    rows.append(np.repeat(dataset.time_counter.dt.dayofyear, n_x * n_y))
    inputs = np.stack(rows)

    # x and y coordinate of every flattened sample
    x = np.tile(dataset.x, n_time * n_y)
    y = np.tile(np.repeat(dataset.y, n_x), n_time)

    targets = np.ravel(dataset[name])

    # Keep finite targets inside the region of interest
    keep = np.isfinite(targets) & (x > 10) & ((x > 100) | (y < 880))
    indx = np.where(keep)
    kept = indx[0]

    inputs = inputs[:, kept].transpose()
    targets = targets[kept]

    # The cluster map is identical for every time step
    clusters = np.tile(np.ravel(clusters), n_time)[kept]

    return (inputs, targets, indx, clusters)


## Clustering Finalization

In [3]:
def clustering(dataset,quant,indx,name):
    """Cluster the functional data with K-means and draw the cluster map.

    Parameters
    ----------
    dataset : object providing the ``y`` and ``x`` coordinates of the grid.
    quant : functional data passed to ``KMeans.fit_predict``.
    indx : ``np.where`` tuple with the flattened (y, x) positions of the
        curves contained in ``quant``.
    name : label inserted in the figure title.

    Returns
    -------
    2-D (y, x) array of cluster labels, NaN where no curve was clustered.
    """
    n_y, n_x = len(dataset.y), len(dataset.x)

    # Training
    n_clusters = 6
    labels = KMeans(n_clusters=n_clusters).fit_predict(quant)
    unique = np.unique(labels)

    # Creating the map: scatter the labels back onto the flattened grid
    flat_map = np.full(n_y * n_x, np.nan)
    flat_map[indx[0]] = labels
    clusters = flat_map.reshape((n_y, n_x))
    cluster_map = xr.DataArray(clusters, dims=['y', 'x'])

    # Plotting: one discrete colour per label, grey for unclustered cells
    fig, ax = plt.subplots(figsize=(5, 9))
    first, last = unique.min(), unique.max()

    cmap = plt.get_cmap('tab20', last + 1)
    cmap.set_bad('gray')
    mesh = cluster_map.plot(ax=ax, cmap=cmap, vmin=first, vmax=last + 1, add_colorbar=False)

    # Ticks sit in the middle of each colour band and are labelled from 1
    cbar = fig.colorbar(mesh, ticks=unique + 0.5)
    cbar.set_ticklabels(unique + 1)
    cbar.set_label('Clusters [count]')
    ax.set_title('Functional Clustering for '+ name + ' (2007-2020)')

    sa_vi.set_aspect(ax)
    plt.show()

    return clusters
    

## Functional Clustering (target)

In [4]:
def func_clust_target(dataset, name):
    """Cluster the grid cells on the multi-year mean annual cycle of the target.

    Parameters
    ----------
    dataset : object holding ``dataset[name]`` and the ``time_counter``,
        ``y`` and ``x`` coordinates.
    name : key of the target variable.

    Returns
    -------
    (clusters, 0): the 2-D cluster map returned by ``clustering`` and the
    flag 0 identifying target-based clustering.
    """
    time = dataset.time_counter
    leap_days = np.where((time.dt.month == 2) & (time.dt.day == 29))

    # (time, y * x) view of the target field
    field = dataset[name].to_numpy()
    targets = field.reshape(field.shape[0], -1)

    # Deleting 29 of February so that the series splits into equal years
    targets = np.delete(targets, leap_days, axis=0)

    # Splitting in years and averaging them (NaN-aware)
    n_years = len(np.unique(time.dt.year))
    yearly = np.split(targets, n_years, axis=0)
    targets = np.nanmean(yearly, axis=0)

    # Coordinates of every flattened grid cell
    x = np.tile(dataset.x, len(dataset.y))
    y = np.repeat(dataset.y, len(dataset.x))

    # Cells without any NaN in the mean cycle, inside the region of interest
    complete = ~np.isnan(targets).any(axis=0)
    indx = np.where(complete & (x > 10) & ((x > 100) | (y < 880)))

    # Converting it to an appropriate format for functional clustering
    # (one curve per row)
    curves = FDataGrid(targets[:, indx[0]].transpose())

    clusters = clustering(dataset, curves, indx, name)

    return (clusters, 0)


## Functional Clustering (Drivers)

In [5]:
def func_clust_drivers(dataset,dataset2,name):
    """Cluster the grid cells on the multi-year mean annual cycles of the drivers.

    Parameters
    ----------
    dataset : object holding the target field ``dataset[name]``; only used to
        find the cells that never contain NaN.
    dataset2 : object holding the four driver fields and the ``time_counter``,
        ``y`` and ``x`` coordinates.
    name : key of the target variable inside ``dataset``.

    Returns
    -------
    (clusters, 1): the 2-D cluster map returned by ``clustering`` and the
    flag 1 identifying driver-based clustering.
    """
    def flatten(field):
        # (time, y * x) view of a gridded variable
        values = field.to_numpy()
        return values.reshape(values.shape[0], -1)

    time = dataset2.time_counter
    leap_days = np.where((time.dt.month == 2) & (time.dt.day == 29))

    targets = flatten(dataset[name])

    driver_names = (
        'Summation_of_solar_radiation',
        'Mean_air_temperature',
        'Mean_precipitation',
        'Mean_pressure',
    )
    # (driver, time, cell)
    inputs = np.stack([flatten(dataset2[key]) for key in driver_names])

    # Deleting 29 of February so that the series splits into equal years
    inputs = np.delete(inputs, leap_days, axis=1)

    # Splitting in years along the time axis and averaging them (NaN-aware)
    n_years = len(np.unique(time.dt.year))
    inputs = np.nanmean(np.split(inputs, n_years, axis=1), axis=0)

    # Coordinates of every flattened grid cell
    x = np.tile(dataset2.x, len(dataset2.y))
    y = np.repeat(dataset2.y, len(dataset2.x))

    # Cells whose target never is NaN, inside the region of interest
    complete = ~np.isnan(targets).any(axis=0)
    indx = np.where(complete & (x > 10) & ((x > 100) | (y < 880)))
    inputs = inputs[:, :, indx[0]]

    # Scaling the inputs: each driver is standardised over all days and cells
    n_drivers, n_days, n_cells = inputs.shape
    flat = inputs.reshape(n_drivers, n_days * n_cells).transpose()
    scaler_inputs = make_column_transformer((StandardScaler(), [0,1,2,3]))
    flat = scaler_inputs.fit_transform(flat).transpose()
    inputs = flat.reshape(n_drivers, n_days, n_cells)

    # Converting it to an appropriate format for functional clustering:
    # (cell, day, driver) with the day index as grid points
    inputs = np.transpose(inputs, axes=(2, 1, 0))
    curves = FDataGrid(inputs, np.arange(0, len(inputs[0])))

    clusters = clustering(dataset2, curves, indx, 'inputs')

    return (clusters, 1)


## Datasets Preparation 2

In [6]:
# Creation of the data arrays
def datasets_preparation2(variable, name, units, dataset):
    """Put per-cell values back on the (time, y, x) grid as a DataArray.

    Parameters
    ----------
    variable : values to place on the grid, assignable to the
        (time, kept cells) slice of the full array.
    name : text stored in the ``description`` attribute.
    units : text stored in the ``units`` attribute.
    dataset : object holding 'Temperature_(15m-100m)' and the
        ``time_counter``, ``y`` and ``x`` coordinates.

    Returns
    -------
    xarray.DataArray with dims (time_counter, y, x), NaN outside the kept cells.
    """
    n_time = len(dataset.time_counter)
    n_y = len(dataset.y)
    n_x = len(dataset.x)
    n_cells = n_y * n_x

    # Obtaining the daily indexes: the temperature field decides which cells
    # are kept (no NaN at any time step, inside the region of interest)
    temp = np.ravel(dataset['Temperature_(15m-100m)']).reshape((n_time, n_cells))
    x = np.tile(dataset.x, n_y)
    y = np.repeat(dataset.y, n_x)

    complete = ~np.isnan(temp).any(axis=0)
    indx = np.where(complete & (x > 10) & ((x > 100) | (y < 880)))

    # Full grid filled with NaN, then the kept cells are written in
    variable_all = np.full((n_time, n_cells), np.nan)
    variable_all[:, indx[0]] = variable
    variable_all = variable_all.reshape((n_time, n_y, n_x))

    # Preparation of the dataarray
    coords = {'time_counter': dataset.time_counter, 'y': dataset.y, 'x': dataset.x}
    attrs = dict(description=name, units=units)
    array = xr.DataArray(variable_all, coords=coords, dims=['time_counter', 'y', 'x'], attrs=attrs)

    return array


## File Creation

In [7]:
def file_creation(path, variable, name):
    """Append ``variable`` to 'targets_predictions.nc' under the name ``name``.

    ``path`` is concatenated with the file name as-is, so it has to end with
    a path separator. The variable is written with zlib compression at
    level 9.
    """
    target = path + 'targets_predictions.nc'
    compression = {name: {"zlib": True, "complevel": 9}}

    as_dataset = variable.to_dataset(name=name)
    as_dataset.to_netcdf(path=target, mode='a', encoding=compression)
    

## Regressor

In [8]:
def regressor (inputs, targets, clusters, name):
    """Fit one bagged gradient-boosting model per cluster.

    Parameters
    ----------
    inputs : 2-D array with the four drivers in columns 0-3 and the day of
        the year in column 4.
    targets : 1-D array of target values, one per row of ``inputs``.
    clusters : 1-D array with the cluster label of every sample; labels are
        expected to be 0 .. n-1.
    name : target name used in the printed summary.

    Returns
    -------
    (regr_all, predictions): the list of fitted regressors (index = cluster
    label) and the in-sample predictions aligned with ``targets``.
    """
    # Drivers are standardised, the day of the year is passed through and
    # treated as a categorical feature; the target is standardised as well
    scale_drivers = ColumnTransformer(
        transformers=[('drivers', StandardScaler(), [0,1,2,3])],remainder='passthrough')
    boosted_trees = HistGradientBoostingRegressor(categorical_features=[4])
    model = TransformedTargetRegressor(
        regressor=make_pipeline(scale_drivers, boosted_trees),
        transformer=StandardScaler())

    feature_labels = (
        'Summation_of_solar_radiation',
        'Mean_air_temperature',
        'Mean_precipitation',
        'Mean_pressure',
        'Day_of_the_year',
    )

    regr_all = []
    predictions = np.full(len(targets), np.nan)  # size of targets without nans

    for i in range(len(np.unique(clusters))):
        members = np.where(clusters == i)[0]  # samples of the i cluster
        inputs2 = inputs[members]
        targets2 = targets[members]

        regr = BaggingRegressor(model, n_estimators=12, n_jobs=4)
        regr.fit(inputs2, targets2)
        predictions[members] = regr.predict(inputs2)  # putting them in the right place
        regr_all.append(regr)

        # Pearson correlation of every input column with the target
        r = np.round(r_regression(inputs2, targets2), 2)
        coefficients = dict(zip(feature_labels, r))

        print('The correlation coefficients between each input and ' + name +  ' for cluster ' + str(i+1) + ' are: ' +str(coefficients))

    return (regr_all, predictions)


## Scatter Plot

In [9]:
def scatter_plot(targets, predictions, name):
    """Draw a scatter plot and a 2-D histogram of predictions against targets.

    Both panels show the identity line (dashed black) and the least-squares
    line of predictions on targets (red).

    Parameters
    ----------
    targets, predictions : 1-D arrays of equal length.
    name : figure title.

    Returns
    -------
    Slope of the least-squares line.
    """
    # Least-squares line: predictions = slope * targets + intercept
    slope, intercept = np.polyfit(targets, predictions, deg=1)

    fig, (points_ax, density_ax) = plt.subplots(2, figsize=(5,10), layout='constrained')

    points_ax.scatter(targets, predictions, alpha=0.2, s=10)

    # Common square range, read from the autoscaled scatter limits before any
    # other artist is added
    extents = [points_ax.get_xlim(), points_ax.get_ylim()]
    lims = [np.min(extents), np.max(extents)]

    # Fitted line
    points_ax.axline(xy1=(0, intercept), slope=slope, color='r')

    points_ax.set_xlabel('targets')
    points_ax.set_ylabel('predictions')
    points_ax.set_xlim(lims)
    points_ax.set_ylim(lims)
    points_ax.set_aspect('equal')

    # Identity line
    points_ax.plot(lims, lims, linestyle='--', color='k')

    # Log-scaled density of the same points; empty bins are left blank
    h = density_ax.hist2d(targets, predictions, bins=100, cmap='jet',
        range=[lims, lims], cmin=0.1, norm='log')

    density_ax.plot(lims, lims, linestyle='--', color='k')

    # Fitted line
    density_ax.axline(xy1=(0, intercept), slope=slope, color='r')

    density_ax.set_xlabel('targets')
    density_ax.set_ylabel('predictions')
    density_ax.set_aspect('equal')

    fig.colorbar(h[3], ax=density_ax, location='bottom')

    fig.suptitle(name)

    plt.show()

    return slope


## Seasonality

In [10]:
def seasonality (dates,targets):
    """Build the long-term mean seasonal cycle, repeated for every year of ``dates``.

    The series is averaged per day of the year, the resulting cycle is tiled
    once per year, and an extra value is inserted at every 29 February found
    in ``dates`` so that the result lines up with ``dates``.

    Parameters
    ----------
    dates : pandas.DatetimeIndex of the time steps.
    targets : 1-D array with one value per entry of ``dates``.

    Returns
    -------
    1-D numpy array with the seasonal value of every time step.

    Notes
    -----
    Any number of leap years is supported (including none).
    NOTE(review): position 14 of the day-of-year means is used as the
    "29 Feb" entry, which presumably relies on every year starting on
    15 February - confirm. Day of the year 60 is 29 Feb in leap years but
    1 Mar otherwise, so that entry mixes both dates.
    """
    # Preparation of the dataarray
    targets2 = xr.DataArray(targets,
        coords = {'time_counter':dates},
        dims = 'time_counter')

    daily_mean = targets2.groupby('time_counter.dayofyear').mean('time_counter')

    n_years = len(np.unique(dates.year))
    leap_positions = np.where((dates.month==2) & (dates.day==29))[0] # Finding where 29 Feb exists

    if leap_positions.size == 0:
        # No leap day in the record: the cycle has no extra entry to remove
        return np.tile(daily_mean, n_years)

    leap_value = daily_mean[14]
    common_cycle = daily_mean.drop_isel(dayofyear=14) # Removing 29 Feb
    season = np.tile(common_cycle, n_years)

    # Positions are ascending, so every insertion shifts the remaining ones
    # exactly as needed for them to match their index in ``dates``
    for position in leap_positions:
        season = np.insert(season, position, leap_value)

    return season


## Plotting (Criteria)

In [11]:
def plotting_criteria(dates, variable, year_variable, title):
    """Plot a daily metric coloured by month, with the yearly metric overlaid.

    Parameters
    ----------
    dates : time stamps of the daily values.
    variable : daily metric, one value per entry of ``dates``.
    year_variable : yearly metric, drawn at every 15 March found in ``dates``.
    title : text placed in front of ' (15 Feb - 30 Apr)' in the figure title.
    """
    indx = pd.DatetimeIndex(dates)
    month_labels = ['February','March','April']
    mid_march = (indx.month == 3) & (indx.day == 15)

    fig, ax = plt.subplots()

    # Daily values, one colour per month
    points = ax.scatter(dates, variable, marker='.', c=indx.month)
    plt.xticks(rotation=70)
    month_handles = points.legend_elements()[0]
    ax.legend(handles=month_handles, labels=month_labels)

    # Yearly values
    ax.plot(dates[mid_march], year_variable, color='red', marker='*')
    fig.suptitle(title + ' (15 Feb - 30 Apr)')

    fig.show()


## Plotting (Mean Values)

In [12]:
def plotting_mean_values(dates, mean_targets, mean_predictions, units, category, region):
    """Plot the mean target and prediction time series, year after year.

    The first value of every year but the first is masked so that the lines
    are interrupted between consecutive years; the x ticks mark the start of
    each year.

    Parameters
    ----------
    dates : pandas.DatetimeIndex of the time steps.
    mean_targets, mean_predictions : 1-D series aligned with ``dates``.
    units, category, region : text fragments of the figure title.
    """
    years = np.unique(dates.year)

    plt.subplots(figsize=(19,5))

    # Masked copies: masked entries are skipped when plotting
    mean_targets = np.ma.array(mean_targets)
    mean_predictions = np.ma.array(mean_predictions)

    ticks = [0]
    for year in years[:-1]:
        # Position of the first time step of the following year
        next_start = np.where(dates.year == year)[0][-1] + 1
        ticks.append(next_start)
        mean_targets[next_start] = np.ma.masked
        mean_predictions[next_start] = np.ma.masked

    plt.plot(mean_targets, label='targets')
    plt.plot(mean_predictions, label='predictions')
    plt.xlabel('Years')
    plt.xticks(ticks, years)
    plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region)
    plt.legend()
    plt.show()
    

## Plotting (Maps)

In [13]:
def plotting_maps(targets, predictions, name, units):
    """Draw the target map, the prediction map and their difference.

    Targets and predictions share the colour range of the targets; the
    difference map uses its own range. The fourth panel is left empty.

    Parameters
    ----------
    targets, predictions : 2-D DataArrays of one time step.
    name, units : text used for the colour-bar labels and the figure title.
    """
    fig, ax = plt.subplots(2,2, figsize = (10,15), layout='tight')

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    colorbar_label = {'label': name + ' ' + units}
    low, high = targets.min(), targets.max()

    targets.plot(ax=ax[0,0], cmap=cmap, vmin=low, vmax=high, cbar_kwargs=colorbar_label)
    predictions.plot(ax=ax[0,1], cmap=cmap, vmin=low, vmax=high, cbar_kwargs=colorbar_label)
    (targets-predictions).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs=colorbar_label)

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.95, top=0.95,
        wspace=0.35, hspace=0.35)

    panels = (ax[0,0], ax[0,1], ax[1,0])
    for panel in panels:
        sa_vi.set_aspect(panel)

    panel_titles = ('Targets', 'Predictions', 'Targets-Predictions')
    for panel, panel_title in zip(panels, panel_titles):
        panel.title.set_text(panel_title)
    ax[1,1].axis('off')

    fig.suptitle(name + ' '+ str(targets.time_counter.dt.date.values))

    plt.show()
    

## Post Processing

In [16]:
def post_processing(dates,dataset,clusters,indx,targets,predictions,units,category):
    """Compute and show the training metrics and daily means of every cluster.

    For each cluster the correlation coefficient, the root mean square error
    and the slope of the least-squares line (predictions on targets) are
    displayed, and the daily mean series are plotted.

    Parameters
    ----------
    dates : pandas.DatetimeIndex of the time steps (used for plotting).
    dataset : object whose ``time_counter`` gives the number of time steps.
    clusters : 1-D array with the cluster label of every sample.
    indx : not used by this function.
    targets, predictions : 1-D arrays aligned with ``clusters``.
    units, category : text fragments of the plot titles.

    Returns
    -------
    (r_train, rms_train, slope_train, targets_mean, predictions_mean); the
    first three have one entry per cluster, the last two have shape
    (clusters, time steps).
    """
    n_clusters = len(np.unique(clusters))
    n_time = len(dataset.time_counter)

    r_train = np.zeros(n_clusters)
    rms_train = np.zeros(n_clusters)
    slope_train = np.zeros(n_clusters)

    targets_mean = np.zeros((n_clusters, n_time))
    predictions_mean = np.zeros((n_clusters, n_time))

    for i in range(n_clusters):
        members = np.where(clusters == i)[0]  # samples of the i cluster
        targets2 = targets[members]
        predictions2 = predictions[members]

        r_train[i] = np.round(np.corrcoef(predictions2, targets2)[0][1], 3)
        rms_train[i] = rmse(predictions2, targets2)
        m, _ = np.polyfit(targets2, predictions2, deg=1)
        slope_train[i] = np.round(m, 3)

        # One-row summary table of the cluster
        summary = np.vstack((r_train[i], rms_train[i], slope_train[i])).transpose()
        temp = pd.DataFrame(summary, index=['Cluster '+str(i+1)], columns=['r_train','rms_train','slope_train'])
        display(temp)

        # for the daily mean plot: (time step, cell of the cluster)
        n_cells = int(len(members) / n_time)
        targets_mean[i] = targets2.reshape((n_time, n_cells)).mean(axis=1)
        predictions_mean[i] = predictions2.reshape((n_time, n_cells)).mean(axis=1)

        plotting_mean_values(dates, targets_mean[i], predictions_mean[i], units, category, 'Salish Sea Cluster '+str(i+1))

    return (r_train, rms_train, slope_train, targets_mean, predictions_mean)


## Evaluation

In [17]:
def evaluation (regr_all, clusters0, ds, ds2, name, units):
    """Predict every year of ``ds`` with the per-cluster regressors and score it.

    Parameters
    ----------
    regr_all : list of fitted regressors, indexed by cluster label.
    clusters0 : 2-D (y, x) map of cluster labels.
    ds : dataset holding the target field.
    ds2 : dataset holding the driver fields.
    name : key of the target variable.
    units : units stored in the returned DataArrays.

    Returns
    -------
    (r_years, rms_years, slope_years, targets_all, predictions_all): three
    arrays with one metric value per year, and the targets and predictions
    of all years as (time_counter, y, x) DataArrays.
    """
    # One entry per year
    r_list, rms_list, slope_list = [], [], []

    # The data arrays, one per year
    targets_all = []
    predictions_all = []

    for year in np.unique(ds.time_counter.dt.year):

        year_label = str(year)
        dataset = ds.sel(time_counter=year_label)
        dataset2 = ds2.sel(time_counter=year_label)

        inputs, targets, indx, clusters = datasets_preparation(dataset, dataset2, clusters0, name)

        # Predictions for each regressor
        predictions = np.full(len(targets), np.nan)  # size of a year without nans
        for i in range(len(np.unique(clusters))):
            members = np.where(clusters == i)[0]  # samples of the i cluster
            predictions[members] = regr_all[i].predict(inputs[members])  # putting them in the right place

        # Calculating the annual metrics
        slope_list.append(scatter_plot(targets, predictions, name + ' for '+ year_label))
        r_list.append(np.round(np.corrcoef(targets, predictions)[0][1], 3))
        rms_list.append(rmse(targets, predictions))

        # Daily arrays: (time step, kept cell)
        n_time = len(dataset.time_counter)
        daily_shape = (n_time, int(len(indx[0]) / n_time))
        targets = np.reshape(targets, daily_shape)
        predictions = np.reshape(predictions, daily_shape)
        targets_all.append(datasets_preparation2(targets, name + ' _targets', units, dataset))
        predictions_all.append(datasets_preparation2(predictions, name + ' _predictions', units, dataset))

    r_years = np.array(r_list, dtype=float)
    rms_years = np.array(rms_list, dtype=float)
    slope_years = np.array(slope_list, dtype=float)

    # Daily arrays of all years
    targets_all = xr.concat(targets_all, dim='time_counter')
    predictions_all = xr.concat(predictions_all, dim='time_counter')

    return (r_years, rms_years, slope_years, targets_all, predictions_all)


## Training

In [None]:
# Target variable of this run, its units, and the category label used in plot titles
name = 'Flagellate_Production_Rate'
units = '[mmol N m-2 s-1]'
category = 'Production rates'

# ds holds the target fields, ds2 the driver fields used as model inputs
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

# Subsample the grid: keep every 5th position along y and x.
# NOTE(review): the positions given to isel are built from the first/last
# coordinate *values*; this matches positional indexing only if the y/x
# coordinates are 0-based integer indices - confirm.
ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# Training period
dataset = ds.sel(time_counter = slice('2007', '2020'))
dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

dates = pd.DatetimeIndex(dataset['time_counter'].values)

# Selecting the clustering input (drivers or target)
# The second returned value is 0 for target-based and 1 for driver-based clustering.
# NOTE: the name `id` shadows the Python builtin of the same name.
clusters0, id = func_clust_target(dataset, name)
# clusters0, id = func_clust_drivers(dataset,dataset2,name)

# Flattened samples of the training period with their cluster labels
inputs, targets, indx, clusters = datasets_preparation(dataset, dataset2, clusters0, name)

# One fitted regressor per cluster, plus the in-sample predictions
regr_all,predictions = regressor(inputs, targets, clusters, name)

# Training metrics and daily means of every cluster
r_train,rms_train,slope_train,targets_mean,predictions_mean = post_processing(dates,dataset,clusters,indx,targets,predictions,units,category)

# Long-term seasonal cycle of the daily mean targets, one row per cluster
season = np.zeros((len(np.unique(clusters)),len(dataset.time_counter)))

for i in range (0,len(np.unique(clusters))):
    season[i] = seasonality(dates,targets_mean[i])

# First 75 time steps of every cluster's seasonal cycle
# (presumably the first year, 15 Feb - 30 Apr without a leap day - confirm).
# The legend entries are hard-coded for the six clusters requested in clustering().
plt.plot(season[:,0:75].transpose())
plt.legend(('Cluster 1','Cluster 2','Cluster 3','Cluster 4','Cluster 5','Cluster 6'))
plt.suptitle('Long-term seasonalities (2007-2020)')

# Daily means with the seasonal cycle removed
for i in range (0,len(np.unique(clusters))):

    plotting_mean_values(dates, targets_mean[i]-season[i], predictions_mean[i]-season[i], units, category, 'Salish Sea (removed seasonality) (Cluster) '+str(i+1))

quant_train = dataset[name] # Keeping it for the regional seasonalities
# Time stamps of the training period; later cells use them to slice `season`
dates_season = pd.DatetimeIndex(quant_train['time_counter'].values)


## Other Years

In [None]:
# Restrict both datasets to the years that were not used for training
# (this rebinds the notebook-level ds and ds2)
ds = ds.sel(time_counter = slice('2021', '2024'))
ds2 = ds2.sel(time_counter = slice('2021', '2024'))

# From here on `dates` refers to the 2021-2024 time steps
dates = pd.DatetimeIndex(ds['time_counter'].values)

# Per-year metrics, plus the targets and predictions on the (time_counter, y, x) grid
r_years, rms_years, slope_years, targets_all, predictions_all = evaluation(regr_all, clusters0, ds ,ds2, name, units)

# Per-time-step spatial metrics: each one is reduced over the x and y dimensions
r_days = xr.corr(targets_all,predictions_all, dim=['x','y'])
rms_days = xs.rmse(targets_all,predictions_all, dim=['x','y'], skipna=True)
slope_days = xs.linslope(targets_all,predictions_all, dim=['x','y'], skipna=True)


## Plotting (Results)

In [None]:
# Daily metrics as points, with the per-year values overlaid
plotting_criteria(dates, r_days, r_years, 'Correlation Coefficients')
plotting_criteria(dates, rms_days, rms_years, 'Root Mean Square Errors')
plotting_criteria(dates, slope_days, slope_years, 'Slopes of the best fitting line')

# Daily maps
# Ten distinct time steps drawn at random; no seed is set in this file, so the
# selection changes from run to run
maps = random.sample(sorted(np.arange(0,len(targets_all.time_counter))),10)
for i in maps:

    # Only the grid cells with a finite target enter the scatter plot
    idx = np.isfinite(np.ravel(targets_all[i]))
    scatter_plot(np.ravel(targets_all[i])[idx], np.ravel(predictions_all[i])[idx], name + ' '+ str(targets_all[i].time_counter.dt.date.values))

    plotting_maps(targets_all[i], predictions_all[i], name, units)
    

## Time-series

In [None]:
# Per-cluster daily means and yearly metrics for the held-out years.
# Relies on notebook-level names defined by earlier cells: ds, ds2, dates
# (time stamps of the held-out years), clusters0, regr_all, season and
# dates_season.
dataset = ds.sel(time_counter= slice('2021', '2024'))
dataset2 = ds2.sel(time_counter= slice('2021', '2024'))

years = np.unique(dataset.time_counter.dt.year)

inputs, targets, indx, clusters  = datasets_preparation(dataset, dataset2, clusters0, name)

n_clusters = len(np.unique(clusters))
n_time = len(dataset.time_counter)

# Daily means, shape (cluster, time step)
targets_mean = np.zeros((n_clusters, n_time))
predictions_mean = np.zeros((n_clusters, n_time))

# Yearly metrics, shape (cluster, year)
r_test = np.zeros((n_clusters, len(years)))
rms_test = np.zeros((n_clusters, len(years)))
slope_test = np.zeros((n_clusters, len(years)))

# The seasonal cycle was built for 2007-2020; its 2017-2020 part is the one
# subtracted from the four held-out years, so both windows must contain the
# same number of time steps.
# NOTE(review): presumably 2017 was chosen because 2017-2020 and 2021-2024
# both end with a leap year - confirm before changing the test period.
season_start = np.where(dates_season.year==2017)[0][0]

predictions = np.full(len(targets),np.nan) # size of the test period without nans
for i in range(n_clusters):
    indx2 = np.where(clusters==i) # indexes of the i cluster
    inputs2 = inputs[indx2[0]] # inputs of the i cluster
    targets2 = targets[indx2[0]] # targets of the i cluster
    predictions[indx2[0]] = regr_all[i].predict(inputs2) # putting them in the right place

    # Daily means over the cells of the cluster: (time step, cell) -> time step
    n_cells = int(len(indx2[0]) / n_time)
    targets_mean[i] = np.mean(np.reshape(targets2,(n_time, n_cells)),axis=1)
    predictions_mean[i] = np.mean(np.reshape(predictions[indx2[0]],(n_time, n_cells)),axis=1)

    for k, year in enumerate(years):
        # 1-D selections of the year. A (1, n) array must not be passed to
        # rmse: scikit-learn would read it as one sample with n outputs and
        # the averaged per-output RMSE would be the mean absolute error.
        in_year = np.asarray(dates.year==year)
        targets_year = targets_mean[i,in_year]
        predictions_year = predictions_mean[i,in_year]

        r_test[i,k] = np.round(np.corrcoef(predictions_year,targets_year)[0][1],3)
        rms_test[i,k] = rmse(predictions_year,targets_year)
        # Slope of predictions on targets, the same direction as in
        # scatter_plot and post_processing, so slope_test compares with slope_train
        m,_ = np.polyfit(targets_year, predictions_year, deg=1)
        slope_test[i,k] = np.round(m,3)

    temp = pd.DataFrame(np.vstack((r_test[i],rms_test[i],slope_test[i])).transpose(),index=[years],columns=['r_test','rms_test','slope_test'])
    display(temp)

    plotting_mean_values(dates, targets_mean[i], predictions_mean[i], units, category, 'Salish Sea Cluster '+str(i+1))
    plotting_mean_values(dates, targets_mean[i]-season[i,season_start:], predictions_mean[i]-season[i,season_start:], 
        units, category, 'Salish Sea (removed seasonality) Cluster '+str(i+1))
    

## Saving

In [22]:
# if id == 0:
#     path = '/data/ibougoudis/MOAD/files/results/' + name + '/hist_cl_target/'
# else:
#     path = '/data/ibougoudis/MOAD/files/results/' + name + '/hist_cl_drivers/'

# os.makedirs(path, exist_ok=True)
# with lzma.open(path + 'regr_all.xz', 'wb') as f:
    
#     dill.dump(regr_all, f)

# with open(path + 'metrics.pkl', 'wb') as f:
#     dill.dump([r_train,rms_train,slope_train,dates_season,season], f)

# file_creation(path, targets_all, 'Targets')
# file_creation(path, predictions_all, 'Predictions')
# file_creation(path, (targets_all-predictions_all), 'Targets - Predictions')
