# Predicting Diatom production rate with Histogram-based Gradient Boosting Regression Tree

## Importing

In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.feature_selection import r_regression

from sklearn.metrics import root_mean_squared_error as rmse

import os
import lzma
import dill

import random

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi


## Datasets Preparation

In [None]:
# Creation of the training - testing datasets
def datasets_preparation(dataset, name, inputs_names):
    """Flatten a gridded dataset into a feature matrix and a target vector.

    Parameters
    ----------
    dataset : object exposing ``x``, ``y`` and ``time_counter`` and
        supporting ``dataset[key]`` lookups (an xarray Dataset in this
        notebook).
    name : key of the target variable inside ``dataset``.
    inputs_names : feature names. Every entry except the last is read from
        ``dataset``; the last column is always the day of year taken from
        ``time_counter``.

    Returns
    -------
    inputs : array with one row per retained sample and one column per
        entry of ``inputs_names``.
    targets : 1-D array of the retained target values.
    indx : tuple returned by ``np.where`` holding the flat indexes of the
        retained samples.
    """
    n_times = len(dataset.time_counter)
    n_rows = len(dataset.y)
    n_cols = len(dataset.x)

    # x / y coordinate of every flattened sample
    # (assumes variables are laid out as (time_counter, y, x) -- TODO confirm)
    x = np.tile(dataset.x, n_times*n_rows)
    y = np.tile(np.repeat(dataset.y, n_cols), n_times)

    features = [dataset[feature].to_numpy().flatten() for feature in inputs_names[0:-1]]
    # Day of year, repeated for every grid cell of the corresponding time step
    features.append(np.repeat(dataset.time_counter.dt.dayofyear, n_cols*n_rows))
    features = np.array(features)

    targets = np.ravel(dataset[name])

    # Keep finite targets located at x>10 and at either x>100 or y<880
    keep = np.isfinite(targets) & (x>10) & ((x>100) | (y<880))
    indx = np.where(keep)

    inputs = features[:,indx[0]].transpose()
    targets = targets[indx[0]]

    return(inputs, targets, indx)


## Datasets Preparation 2

In [None]:
# Creation of the data arrays
def datasets_preparation2(variable, name, units, dataset):
    """Place flattened daily values back on the (time_counter, y, x) grid.

    Parameters
    ----------
    variable : values to scatter into the grid; assigned to the retained
        grid cells of every time step, so it must be compatible with the
        shape (number of time steps, number of retained cells).
    name : key of the variable in ``dataset`` that defines which cells are
        retained; also stored as the ``description`` attribute.
    units : stored as the ``units`` attribute.
    dataset : gridded dataset providing ``x``, ``y`` and ``time_counter``.

    Returns
    -------
    xarray.DataArray with dims ('time_counter', 'y', 'x'); cells that are
    not retained are NaN.
    """
    n_times = len(dataset.time_counter)
    n_rows = len(dataset.y)
    n_cols = len(dataset.x)
    n_cells = n_rows * n_cols

    # Obtaining the daily indexes
    daily = np.reshape(np.ravel(dataset[name]), (n_times, n_cells))
    x = np.tile(dataset.x, n_rows)
    y = np.tile(np.repeat(dataset.y, n_cols), 1)

    # A cell is retained when it is never NaN in time and lies at x>10 and
    # at either x>100 or y<880
    never_nan = ~np.isnan(daily).any(axis=0)
    indx = np.where(never_nan & (x>10) & ((x>100) | (y<880)))

    gridded = np.full((n_times, n_cells), np.nan)
    gridded[:,indx[0]] = variable
    gridded = np.reshape(gridded, (n_times, n_rows, n_cols))

    # Preparation of the dataarray
    array = xr.DataArray(
        gridded,
        coords={'time_counter': dataset.time_counter, 'y': dataset.y, 'x': dataset.x},
        dims=['time_counter', 'y', 'x'],
        attrs=dict(description=name, units=units))

    return (array)


## File Creation

In [None]:
def file_creation(path, variable, name):
    """Write ``variable`` into ``targets_predictions.nc`` located at ``path``.

    The array is converted to a dataset holding a single variable called
    ``name`` and written in append mode with zlib compression (level 9).
    ``path`` is concatenated directly with the file name, so it is expected
    to end with a path separator.
    """
    as_dataset = variable.to_dataset(name=name)

    target_file = path + 'targets_predictions.nc'
    compression = {name: {"zlib": True, "complevel": 9}}

    as_dataset.to_netcdf(path=target_file, mode='a', encoding=compression)
    as_dataset.close()
    

## Regressor

In [None]:
def regressor (inputs, targets, n_bins, drivers, spatial, inputs_names):
    """Build and fit the histogram-based gradient boosting model.

    The driver columns (the first ``len(drivers)`` columns) are standardised.
    When ``spatial`` is not empty, the columns from ``spatial[0]`` to
    ``spatial[-1]`` (positions looked up in ``inputs_names``) are binned into
    ``n_bins`` ordinal quantile bins, and every column from ``spatial[0]``
    onwards is declared categorical. When ``spatial`` is an empty list, only
    column ``len(drivers)`` is declared categorical. All remaining columns
    are passed through unchanged and the targets are standardised.

    Returns
    -------
    regr : the fitted ``TransformedTargetRegressor``.
    r_inputs : ``r_regression`` score of every input column against the
        targets, rounded to two decimals.
    """
    transformers = [('drivers', StandardScaler(), np.arange(0,len(drivers)))]

    if spatial == []:
        categorical = [len(drivers)]
    else:
        first_spatial = inputs_names.index(spatial[0])
        last_spatial = inputs_names.index(spatial[-1])
        binning = KBinsDiscretizer(n_bins=n_bins,encode='ordinal',strategy='quantile')
        transformers.append(('spatial', binning, np.arange(first_spatial,last_spatial+1)))
        categorical = np.arange(first_spatial,len(inputs_names))

    preprocessing = ColumnTransformer(transformers=transformers, remainder='passthrough')
    booster = HistGradientBoostingRegressor(categorical_features=categorical)

    model = TransformedTargetRegressor(
        regressor=make_pipeline(preprocessing, booster),
        transformer=StandardScaler())

    regr = model.fit(inputs,targets)

    r_inputs = np.round(r_regression(inputs,targets),2)

    return(regr, r_inputs)


## Scatter Plot

In [None]:
def scatter_plot(targets, predictions, name):
    """Show predictions against targets as a scatter plot and a 2-D histogram.

    Both panels carry the 1:1 line (dashed black) and the least-squares
    line fitted to the data (red). The figure title is ``name``.

    Returns
    -------
    The slope of the fitted line.
    """
    # Least-squares line: predictions = m * targets + b
    m, b = np.polyfit(targets, predictions, deg=1)

    fig, (scatter_ax, density_ax) = plt.subplots(2, figsize=(5,10), layout='constrained')

    # Upper panel: raw scatter
    scatter_ax.scatter(targets, predictions, alpha=0.2, s=10)

    # Common range covering both automatic axis ranges, so the panel is square
    auto_limits = [scatter_ax.get_xlim(), scatter_ax.get_ylim()]
    lims = [np.min(auto_limits), np.max(auto_limits)]

    scatter_ax.axline(xy1=(0, b), slope=m, color='r')

    scatter_ax.set_xlabel('targets')
    scatter_ax.set_ylabel('predictions')
    scatter_ax.set_xlim(lims)
    scatter_ax.set_ylim(lims)
    scatter_ax.set_aspect('equal')

    scatter_ax.plot(lims, lims, linestyle='--', color='k')

    # Lower panel: log-scaled point density over the same range
    _, _, _, density = density_ax.hist2d(targets, predictions, bins=100, cmap='jet',
        range=[lims,lims], cmin=0.1, norm='log')

    density_ax.plot(lims, lims, linestyle='--', color='k')
    density_ax.axline(xy1=(0, b), slope=m, color='r')

    density_ax.set_xlabel('targets')
    density_ax.set_ylabel('predictions')
    density_ax.set_aspect('equal')

    fig.colorbar(density, ax=density_ax, location='bottom')

    fig.suptitle(name)

    plt.show()

    return(m)


## Plotting (Criteria)

In [None]:
def plotting_criteria(dates, variable, year_variable, months, period, title):
    """Plot a daily metric coloured by month together with its yearly values.

    ``variable`` is scattered against ``dates`` and coloured by calendar
    month, with ``months`` used as legend labels. ``year_variable`` is drawn
    in red at the dates that fall in the second distinct month and whose day
    number equals half the number of distinct day numbers in ``dates``.
    The figure title is ``title`` followed by ``period``.
    """
    indx = pd.DatetimeIndex(dates)
    fig, ax = plt.subplots()

    daily_points = ax.scatter(dates, variable, marker='.', c=indx.month)
    plt.xticks(rotation=70)

    month_handles = daily_points.legend_elements()[0]
    ax.legend(handles=month_handles, labels=months)

    # One marker position per year, roughly in the middle of the period
    second_month = np.unique(indx.month)[1]
    middle_day = len(np.unique(dates.day)) // 2
    yearly_dates = dates[(indx.month == second_month) & (indx.day == middle_day)]
    ax.plot(yearly_dates, year_variable, color='red', marker='*')

    fig.suptitle(title + ' ' + period)

    plt.show()


## Plotting (Mean Values)

In [None]:
def plotting_mean_values(dates, targets, predictions, mean, units, category, region, period, labels):
    """Display summary metrics and plot the target / prediction time series.

    Parameters
    ----------
    dates : datetime index of the samples (its ``year`` attribute is used).
    targets, predictions : series to compare, one value per entry of ``dates``.
    mean : value used to express the root mean square error as a percentage.
    units, category, region, period : strings used to build the figure title
        ('Mean <category> <units> <period> <region>').
    labels : array of day labels of a single year, used for the x ticks.

    Returns
    -------
    r : correlation coefficient rounded to 3 decimals.
    rms : root mean square error divided by ``mean``, in percent.
    slope : slope of the least-squares line, rounded to 3 decimals.
    """
    r = np.round(np.corrcoef(predictions,targets)[0][1],3)
    rms = rmse(predictions,targets)  / mean * 100
    fitted_slope,_ = np.polyfit(targets, predictions, deg=1)
    slope = np.round(fitted_slope,3)

    metrics_table = pd.DataFrame(np.vstack((r,rms,slope)).transpose(),columns=['r','rms [%]','slope'])
    display(metrics_table)

    years = np.unique(dates.year)
    days_per_year = len(labels)
    n_samples = len(years)*days_per_year

    fig, ax = plt.subplots(figsize=(19,5))

    mean_targets = np.ma.array(targets)
    mean_predictions = np.ma.array(predictions)

    # Mask the last sample of every year so the lines break between years
    for year in years:
        last_of_year = np.where(dates.year==year)[0][-1]
        mean_targets[last_of_year] = np.ma.masked
        mean_predictions[last_of_year] = np.ma.masked

    ax.plot(mean_targets, label = 'targets')
    ax.plot(mean_predictions, label = 'predictions')

    # Two day ticks per year, labelled with the matching day label
    ticks = np.int16(np.arange(0,n_samples,days_per_year/2))
    labels2 = np.tile(labels,len(years))

    ax.set_xticks(ticks, labels2[ticks])

    # Second axis carrying the year labels below the day labels
    ax2 = ax.secondary_xaxis('bottom')
    ax2.set_xticks(ticks=np.arange(0,n_samples,days_per_year), labels=years)

    ax2.tick_params(length=0, pad=30)

    plt.suptitle('Mean '+category + ' ' +units + ' ' + period + ' ' + region)
    ax.legend()
    plt.show()

    return(r,rms,slope)
    

## Plotting (Maps)

In [None]:
def plotting_maps(targets, predictions, name, units):
    """Map the targets, the predictions and their difference for one time step.

    The targets and predictions share the colour range of the targets; the
    difference map (targets minus predictions) uses its own range. The
    fourth panel of the 2x2 grid is left empty. The figure title is ``name``
    followed by the date(s) of ``targets.time_counter``.
    """
    fig, ax = plt.subplots(2,2, figsize = (10,15), layout='tight')

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    bar_label = name + ' ' + units
    lowest = targets.min()
    highest = targets.max()

    targets.plot(ax=ax[0,0], cmap=cmap, vmin=lowest, vmax=highest, cbar_kwargs={'label': bar_label})
    predictions.plot(ax=ax[0,1], cmap=cmap, vmin=lowest, vmax=highest, cbar_kwargs={'label': bar_label})
    (targets-predictions).plot(ax=ax[1,0], cmap=cmap, cbar_kwargs={'label': bar_label})

    plt.subplots_adjust(left=0.1, bottom=0.1, right=0.95, top=0.95,
        wspace=0.35, hspace=0.35)

    panels = [(ax[0,0], 'Targets'), (ax[0,1], 'Predictions'), (ax[1,0], 'Targets-Predictions')]

    for panel, _ in panels:
        sa_vi.set_aspect(panel)

    for panel, heading in panels:
        panel.title.set_text(heading)

    ax[1,1].axis('off')

    fig.suptitle(name + ' '+ str(targets.time_counter.dt.date.values))

    plt.show()
    

## Plotting (Regions)

In [None]:
def plot_box(ax, corn, colour):
    """Draw the outline of a rectangle on ``ax``.

    ``corn`` is indexable; entries 0 and 1 are the two vertical coordinates
    and entries 2 and 3 the two horizontal coordinates of the rectangle.
    The outline is drawn as a closed solid line in ``colour``.
    """
    bottom, top, left, right = corn[0], corn[1], corn[2], corn[3]

    horizontal = [left, right, right, left, left]
    vertical = [bottom, bottom, top, top, bottom]

    ax.plot(horizontal, vertical, '-', color=colour)
    

## Plotting (Mean Peaks)

In [None]:
def plotting_mean_peaks(dates,targets_mean,predictions_mean,category,units,region,boxname,labels,period='(15 Feb - 30 Apr)'):
    """Plot the target and prediction series of one box, year after year.

    Parameters
    ----------
    dates : datetime index of the samples (its ``year`` attribute is used).
    targets_mean, predictions_mean : series to plot, one value per entry of
        ``dates``.
    category, units, region, boxname : strings used in the figure title.
    labels : array of day labels of a single year, used for the x ticks.
    period : period string shown in the figure title. Defaults to the value
        that used to be hard-coded in the title.
    """
    years = np.unique(dates.year)
    days_per_year = len(labels)

    fig, ax = plt.subplots(figsize=(19,5))

    targets_mean = np.ma.array(targets_mean)
    predictions_mean = np.ma.array(predictions_mean)

    # Mask the last sample of every year so the lines break between years
    for year in years:
        last_of_year = np.where(dates.year==year)[0][-1]
        targets_mean[last_of_year] = np.ma.masked
        predictions_mean[last_of_year] = np.ma.masked

    ax.plot(targets_mean, label = 'targets')
    ax.plot(predictions_mean, label = 'predictions')

    # Day ticks: the same within-year offsets are repeated from the first
    # sample of every year, so each tick sits on the sample whose label it
    # shows (a single continuous stride would drift away from the labels
    # after the first year).
    step = days_per_year//len(years)+1
    day_offsets = np.arange(0,days_per_year,step)
    year_starts = np.arange(len(years))*days_per_year
    day_ticks = (year_starts[:,None] + day_offsets[None,:]).ravel()

    ax.set_xticks(ticks=day_ticks, labels=np.tile(labels[day_offsets],len(years)))

    # Year labels on a second axis, placed at the first sample of each year
    ax2 = ax.secondary_xaxis('bottom')
    ax2.set_xticks(ticks=year_starts, labels=years)

    ax2.tick_params(length=0, pad=30)

    plt.suptitle('Mean '+category + ' ' +units + ' ' + period + ' ' + region + ' ' + boxname)
    ax.legend()
    plt.show()


## Plotting (Regional analysis)

In [None]:
def plotting_regional(metric,box,years,category):
    """Plot one line per box showing a metric against the years.

    ``metric`` is indexed as ``metric[:, i]`` for the i-th entry of ``box``;
    the entries of ``box`` are used as legend labels. The figure title is
    ``category`` followed by ' (Regional analysis)'.
    """
    fig,ax = plt.subplots()

    for column, box_label in enumerate(box):
        ax.plot(years, metric[:,column], marker='*', label=box_label)

    plt.suptitle(category+ ' (Regional analysis)')
    plt.legend()
    fig.show()


## Post Processing

In [None]:
def post_processing(dates,indx2,dataset,targets,predictions,indx,units,category,period,labels):
    """Compute the daily spatial means and plot them with their metrics.

    Parameters
    ----------
    dates : datetime index of the retained days.
    indx2 : selector of the retained days along the time axis.
    dataset : dataset providing ``time_counter``.
    targets, predictions : flattened samples, reshaped here to
        (number of time steps, ``len(indx[0])`` // number of time steps).
    indx : tuple whose first entry holds the indexes of the retained samples.
    units, category, period, labels : forwarded to ``plotting_mean_values``.

    Returns
    -------
    r, rms, slope : metrics returned by ``plotting_mean_values``.
    targets_mean, predictions_mean : daily means over the retained cells.
    """
    n_times = len(dataset.time_counter)
    daily_shape = (n_times, len(indx[0]) // n_times)

    targets_mean = np.reshape(targets,daily_shape)[indx2]
    predictions_mean = np.reshape(predictions,daily_shape)[indx2]

    targets_mean = np.mean(targets_mean,axis=1)
    predictions_mean = np.mean(predictions_mean,axis=1)

    # region and period are passed by keyword so they cannot be swapped;
    # plotting_mean_values expects them in the order (region, period).
    r,rms,slope = plotting_mean_values(dates, targets_mean, predictions_mean, np.mean(targets_mean),
        units, category, region='Salish Sea', period=period, labels=labels)

    return(r,rms,slope,targets_mean,predictions_mean)


## Evaluation

In [None]:
def evaluation (regr, ds, name, units, inputs_names):
    """Evaluate a fitted model year by year.

    For every year found in ``ds.time_counter`` the inputs are prepared,
    predictions are made with ``regr``, a scatter plot is shown and the
    yearly metrics are stored. The targets and predictions are also put
    back on the grid and concatenated along ``time_counter``.

    Returns
    -------
    r_years : correlation coefficient of every year.
    rms_years : root mean square error of every year, in percent of the
        mean target of that year.
    slope_years : slope of the fitted line of every year.
    targets_all, predictions_all : gridded targets and predictions of all
        the years.
    """
    years = np.unique(ds.time_counter.dt.year)

    # For every year
    yearly_r, yearly_rms, yearly_slope = [], [], []

    # For all years
    targets_all, predictions_all = [], []

    for year in years:

        dataset = ds.sel(time_counter=str(year))

        inputs, targets, indx = datasets_preparation(dataset, name, inputs_names)

        predictions = regr.predict(inputs)

        # Calculating the annual time-series
        yearly_slope.append(scatter_plot(targets, predictions, name + ' for '+ str(year)))
        yearly_r.append(np.corrcoef(targets, predictions)[0][1])
        yearly_rms.append(rmse(targets, predictions) / np.mean(targets) * 100)

        # Daily arrays
        n_times = len(dataset.time_counter)
        daily_shape = (n_times, len(indx[0]) // n_times)
        targets_all.append(datasets_preparation2(np.reshape(targets,daily_shape), name, units, dataset))
        predictions_all.append(datasets_preparation2(np.reshape(predictions,daily_shape), name, units, dataset))

    # Daily arrays
    targets_all = xr.concat(targets_all, dim='time_counter')
    predictions_all = xr.concat(predictions_all, dim='time_counter')

    r_years = np.asarray(yearly_r, dtype=float)
    rms_years = np.asarray(yearly_rms, dtype=float)
    slope_years = np.asarray(yearly_slope, dtype=float)

    return(r_years, rms_years, slope_years, targets_all, predictions_all)


## Initiation

In [None]:
# Quantity to predict and the strings used in titles and outputs
name = 'Diatom_Production_Rate'
units = '[mmol N m-2 s-1]'
category = 'Production rates'

filename = '/data/ibougoudis/MOAD/files/inputs/jan_mar.nc'

# Input features: drivers first, then the spatial ones, the day of year last
drivers = ['Summation_of_solar_radiation', 'Mean_wind_speed', 'Mean_air_temperature']
spatial = ['Latitude', 'Longitude']
day_input = ['Day_of_year']
inputs_names = drivers + spatial + day_input

n_bins=255

# Settings of every supported period, keyed by the first seven characters of
# the input file name: (period shown in titles, run id, months of the period)
period_settings = {
    'jan_mar': ('(16 Jan - 31 Mar)', '1', ['January', 'February', 'March']), # 75 days, 1st period
    'jan_apr': ('(01 Jan - 30 Apr)', '2', ['January', 'February', 'March', 'April']), # 120 days, 2nd period
    'feb_apr': ('(15 Feb - 30 Apr)', '3', ['February', 'March', 'April']), # 75 days, 3rd period
    'apr_jun': ('(16 Apr - 30 Jun)', '4', ['April', 'May', 'June']), # 76 days, 4th period
    'may_sep': ('(01 May - 30 Sep)', '5', ['May', 'June', 'July', 'August', 'September']), # 153 days, 5th period
}

# The key is read from the file name itself, so it does not depend on the
# length of the directory part of the path.
period_key = os.path.basename(filename)[0:7]

# Fail early on an unsupported file instead of continuing with undefined
# (or, in a notebook session, stale) period settings.
if period_key not in period_settings:
    raise ValueError('Unknown period ' + repr(period_key) + ' in ' + filename
        + '; expected one of ' + str(sorted(period_settings)))

# NOTE: `id` shadows the builtin; the name is kept because the saving cell
# builds the output path from it.
period, id, months = period_settings[period_key]

ds = xr.open_dataset(filename)
ds0 = ds # For the regional plot

# Low resolution (every 5th grid point along y and x)

ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))



## Training

In [None]:
# Training years
dataset = ds.sel(time_counter=slice('2007', '2020'))

# Day labels ('dd Mon') of the period, sorted by date instead of
# alphabetically
labels = np.unique(dataset.time_counter.dt.strftime('%d %b'))
indx_labels = np.argsort(pd.to_datetime(labels, format='%d %b'))
labels = labels[indx_labels]

# Fitting the model on the training samples
inputs, targets, indx = datasets_preparation(dataset, name, inputs_names)
regr, r_inputs = regressor(inputs, targets, n_bins, drivers, spatial, inputs_names)

# Score of every input feature against the targets
print(f'Metrics between input features and {name}')
temp = pd.DataFrame(r_inputs, index=inputs_names, columns=[period])
display(temp)

# Predictions on the training samples
predictions = regr.predict(inputs)


## Time-series (Training)

In [None]:
# Dates of the training period, without the 29th of February
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

r_train, rms_train, slope_train, targets_mean, predictions_mean = post_processing(dates,indx2,dataset,targets,predictions,indx,units,category,period,labels)

# Long-term seasonality: mean of the daily means over the training years
# (np.split requires the same number of days in every year)
season = np.array(np.split(targets_mean,len(np.unique(dates.year)),axis=0))
season = np.mean(season, axis=0)

plt.plot(season)
plt.xticks(ticks=np.arange(0,len(labels),len(labels)//8+1), labels=labels[np.arange(0,len(labels),len(labels)//8+1)])
plt.suptitle('Long-term seasonality (2007-2020)')
plt.show()

season_train = np.tile(season,len(np.unique(dates.year))) # Broadcasting season to all training years

# region and period are passed by keyword so they cannot be swapped;
# plotting_mean_values expects them in the order (region, period).
r_train_season,_,slope_train_season = plotting_mean_values(dates, targets_mean-season_train, predictions_mean-season_train, np.mean(targets_mean),
    units, category, region='Salish Sea (removed seasonality)', period=period, labels=labels)

quant_train = dataset[name] # Keeping it for the regional seasonalities
dates_season = dates
quant_train = quant_train.sel(time_counter=dates)

# Statistics of the training samples, reused in the testing cells
mean = np.mean(targets)
std_targets = np.std(targets)
peak = mean + 0*std_targets # Peak threshold (currently the mean itself)

std_season = np.std(season)
std_predictions = np.std(predictions)


## Testing Years

In [None]:
# Testing years
dataset = ds.sel(time_counter = slice('2021', '2024'))

# Dates of the testing period, without the 29th of February
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

r_years, rms_years, slope_years, targets_all, predictions_all = evaluation(regr, dataset, name, units, inputs_names)

targets_all = targets_all.sel(time_counter=dates)
predictions_all = predictions_all.sel(time_counter=dates)

# Daily (spatial) metrics
r_days = xr.corr(targets_all,predictions_all, dim=['x','y'])
rms_days = xs.rmse(targets_all,predictions_all, dim=['x','y'], skipna=True) / targets_all.mean() * 100
slope_days = xs.linslope(targets_all,predictions_all, dim=['x','y'], skipna=True)

plotting_criteria(dates, r_days, r_years, months, period, 'Correlation Coefficients')
plotting_criteria(dates, rms_days, rms_years, months, period, 'Root Mean Square Errors')
plotting_criteria(dates, slope_days, slope_years, months, period, 'Slopes of the best fitting line')

# Daily maps (10 randomly chosen days)
maps = random.sample(sorted(np.arange(0,len(targets_all.time_counter))),10)
for i in maps:

    idx = np.isfinite(np.ravel(targets_all[i]))
    scatter_plot(np.ravel(targets_all[i])[idx], np.ravel(predictions_all[i])[idx], name + ' '+ str(targets_all[i].time_counter.dt.date.values))

    plotting_maps(targets_all[i], predictions_all[i], name, units)

season_test = np.tile(season,len(np.unique(dates.year))) # Broadcasting season to all testing years

std_targets_test = targets_all.std().values
std_predictions_test = predictions_all.std().values

r_test,rms_test,slope_test = plotting_mean_values(dates,targets_all.mean(['y','x'])-season_test, predictions_all.mean(['y','x'])-season_test, targets_all.mean(),
    units, category, 'Salish Sea (removed seasonality)', period, labels)

# One peak threshold per testing year. The number of repetitions follows the
# years actually present, so it always matches the 'year' coordinate below.
peak = np.tile(peak,len(np.unique(dates.year)))
peak = xr.DataArray(peak, coords = {'year': np.unique(dates.year)}, dims = ['year'])


## Standard deviations

In [None]:
# Standard deviations of the training / testing targets and predictions,
# alone and after subtracting the standard deviation of the seasonality
std_rows = [
    ('targets', std_targets),
    ('season', std_season),
    ('targets_test', std_targets_test),
    ('targets-season', std_targets-std_season),
    ('targets_test-season', std_targets_test-std_season),
    ('predictions', std_predictions),
    ('predictions_test', std_predictions_test),
    ('predictions-season', std_predictions-std_season),
    ('predictions_test-season', std_predictions_test-std_season),
]

temp = pd.DataFrame([value for _, value in std_rows],
    index=[label for label, _ in std_rows])

display(temp)

print('Percentage of difference between testing:', (std_targets_test-std_season)*100/std_targets_test)

## Regional analysis

In [None]:
bathy = xr.open_dataset('/home/sallen/MEOPAR/grid/bathymetry_202108.nc')

# Background map: first time step of the full resolution field
fig, ax = plt.subplots(1, 1, figsize=(5, 9))
mycmap = cm.deep
mycmap.set_bad('grey')
ax.pcolormesh(ds0[name][0], cmap=mycmap)
sa_vi.set_aspect(ax)

# Boxes given as [y start, y end, x start, x end] in full resolution grid indexes
SoG_north = [650, 730, 100, 200]
SoG_center = [450, 550, 200, 300]
Fraser_plume = [380, 460, 260, 330]
SoG_south = [320, 380, 280, 350]
Haro_Boundary = [290, 350, 210, 280]
JdF_west = [250, 425, 25, 125]
JdF_east = [200, 290, 150, 260]
PS_all = [0, 200, 80, 320]
PS_main = [20, 150, 200, 280]

boxnames = ['SoG_north','SoG_center','Fraser_plume','SoG_south', 'Haro_Boundary', 'JdF_west', 'JdF_east', 'PS_all', 'PS_main']
boxes = [SoG_north,SoG_center,Fraser_plume,SoG_south,Haro_Boundary,JdF_west,JdF_east,PS_all,PS_main]
box_colours = ['g', 'b', 'm', 'k', 'm', 'c', 'w', 'm', 'r']

# Outlines, drawn in the same order as the names of the legend
for i in range(len(boxes)):
    plot_box(ax, boxes[i], box_colours[i])

fig.legend(boxnames)

# Map holding the index of the box every grid cell belongs to (NaN outside
# the boxes); where boxes overlap, the box listed later wins
regions0 = np.full((len(ds0.y),len(ds0.x)),np.nan)
for i in range(len(boxes)):
    regions0[boxes[i][0]:boxes[i][1], boxes[i][2]:boxes[i][3]] = i

regions0 = xr.DataArray(regions0,dims = ['y','x'])

# Low resolution: box corners and region map on every 5th grid point

temp = []
for i in boxes:
    temp.append([x//5 for x in i])

boxes = temp

regions0 = regions0.isel(
    y=(np.arange(regions0.y[0], regions0.y[-1], 5)),
    x=(np.arange(regions0.x[0], regions0.x[-1], 5)))


## Time-series (Testing)

In [None]:
# Regional evaluation on the testing years: one entry per box for every metric.

# Metrics of the regional means
r_test, rms_test, slope_test = np.zeros(len(boxes)),  np.zeros(len(boxes)), np.zeros(len(boxes))

# Metrics of the regional means after removing the regional seasonality
r_test_season, slope_test_season = np.zeros(len(boxes)), np.zeros(len(boxes))

# Yearly sums of the anomalies (box, year)
targets_sum, predictions_sum  = np.zeros((len(boxes),len(np.unique(dates.year)))), np.zeros((len(boxes),len(np.unique(dates.year))))

# Yearly means of the anomalies (box, year)
targets_mean, predictions_mean = np.zeros((len(boxes),len(np.unique(dates.year)))), np.zeros((len(boxes),len(np.unique(dates.year))))

# Daily values kept only where they exceed the peak threshold (box, day of all testing years)
targets_diff, predictions_diff = np.zeros((len(boxes),len(season)*len(np.unique(dates.year)))), np.zeros((len(boxes),len(season)*len(np.unique(dates.year))))

# NOTE(review): targets_s and predictions_s are allocated here but not filled in this cell
targets_s = np.zeros((len(season),len(np.unique(dates.year)),len(boxes)))
predictions_s = np.zeros((len(season),len(np.unique(dates.year)),len(boxes)))

# Residual sum of squares of every box
rss = np.zeros(len(boxes))

for i in range (0,len(boxes)):

    # Daily means over the grid cells assigned to box i
    targets = targets_all.where(regions0==i).mean(['y','x'])
    predictions = predictions_all.where(regions0==i).mean(['y','x'])

    r_test[i] = xr.corr(targets,predictions)
    rms_test[i] = xs.rmse(targets,predictions,skipna=True) / np.mean(targets) * 100
    slope_test[i] = xs.linslope(targets,predictions,skipna=True)

    # Regional seasonality from the training data inside the corners of box i.
    # The Fortran-order reshape splits the time axis into (day, year) with the
    # day index varying fastest and merges y and x into one axis; this assumes
    # the time axis of quant_train holds len(season) consecutive days for every
    # training year -- TODO confirm.
    climatology = quant_train[:,boxes[i][0]:boxes[i][1], boxes[i][2]:boxes[i][3]]
    season_test = climatology.to_numpy()
    season_test = np.reshape(season_test,(len(season),len(np.unique(dates_season.year)),climatology.shape[1]*climatology.shape[2]),order='F')
    season_test = np.nanmean(season_test,axis=(1,2))
    season_test = np.tile(season_test,len(np.unique(dates.year))) # Broadcasting season to all testing years

    # Yearly sums of the anomalies with respect to the regional seasonality
    targets_sum[i] = (targets-season_test).groupby(targets.time_counter.dt.year).sum().values
    predictions_sum[i] =  (predictions-season_test).groupby(predictions.time_counter.dt.year).sum().values

    # Yearly means of the same anomalies
    targets_mean[i] = (targets-season_test).groupby(targets.time_counter.dt.year).mean().values
    predictions_mean[i] =  (predictions-season_test).groupby(predictions.time_counter.dt.year).mean().values

    rss[i] = ((targets-predictions)**2).sum().values # Similar to rms, is not affected by the seasonality

    # Only r and the slope are kept; the rms returned here is discarded
    r_test_season[i], _, slope_test_season[i] = plotting_mean_values(dates, targets-season_test, predictions-season_test, targets_all.mean(),
        units, category, boxnames[i] +' (removed seasonality)', period, labels)
    
    # Keep the daily values above the peak threshold of their year (peak is
    # indexed by 'year'); the other days are filled by where()
    targets_diff[i] = (targets).groupby(targets.time_counter.dt.year).where((targets).groupby(targets.time_counter.dt.year)>peak)
    predictions_diff[i] = (predictions).groupby(predictions.time_counter.dt.year).where((predictions).groupby(predictions.time_counter.dt.year)>peak)

    plotting_mean_peaks(dates,targets_diff[i],predictions_diff[i],category,units,'Peaks',boxnames[i],labels)
    

## Saving

In [None]:
# Output directory: named after the quantity, a short tag built from its
# name, and the id of the period
short_name = name[0:4].lower() + '_' + name[7:9].lower()
path = '/data/ibougoudis/MOAD/files/results/' + name + '/single_runs/' + short_name + '_func_reg' + id + '_boxes_s1/'

os.makedirs(path, exist_ok=True)

# Fitted model (xz-compressed)
with lzma.open(path + 'regr_all.xz', 'wb') as f:
    dill.dump(regr, f)

# Input scores, training metrics and testing metrics
pickled_outputs = [
    ('r_inputs.pkl', r_inputs),
    ('train_metrics.pkl', [r_train,rms_train,slope_train,r_train_season,slope_train_season,season.transpose()]),
    ('test_metrics.pkl', [r_test,rms_test,slope_test,r_test_season,slope_test_season,targets_sum,predictions_sum,targets_mean,predictions_mean,targets_diff,predictions_diff,rss]),
]

for pickle_name, payload in pickled_outputs:
    with open(path + pickle_name, 'wb') as f:
        dill.dump(payload, f)

# Gridded targets, predictions and their difference
netcdf_outputs = [
    (targets_all, 'Targets'),
    (predictions_all, 'Predictions'),
    ((targets_all-predictions_all), 'Targets - Predictions'),
]

for gridded, variable_name in netcdf_outputs:
    file_creation(path, gridded, variable_name)

# Short description of the run
readme_lines = [
    'name: ' + name,
    'period: ' + filename[35:42],
    'input_features: ' + str(list(inputs_names)),
    'n_bins: ' + str(n_bins),
]

with open(path + 'readme.txt', 'w') as f:
    f.write('\n'.join(readme_lines) + '\n')
