In [14]:
import xarray as xr # needed for reading data
import pandas as pd # Used for stroing data
import numpy as np
import pickle as pkl  # Needed for saving model objects
import os
from itertools import repeat # Needed for repeating a variable multiple times

import matplotlib.pyplot as plt

import statsmodels.api as sm    # Used for bot he SARIMA and SARIMAX models
import statsmodels.tsa as sm_tsa # Used for type checking SARIMA models
from sklearn import metrics     # Used for importing various performance measures

import multiprocessing
from multiprocessing import Pool

In [15]:
# Depending on the machine on which the code is run, data might be stored in different directories.
# Indicate which machine is used to make sure the path to the data can be found.
# Can either be "local" or "Snellius"

MACHINE = 'local'

if MACHINE == 'Snellius':
    pred_var_path = '/gpfs/work1/0/ctdas/awoude/Ritten/predictor_vars/' # For retrieving the set of aggregated scaling vectors

    SAVE_DIR = pers_file_dir = '/gpfs/work1/0/ctdas/awoude/Ritten/trained_models/' # used for storing the trained model

    SF_DIR = '/gpfs/work1/0/ctdas/awoude/Ritten/fitted_sf/' # used for storing the scaling factor produced bij ML models

    RESULTS_DIR = '/gpfs/work1/0/ctdas/awoude/Ritten/results/' # used for storing the trained model

    CPU_COUNT = 32 # Snellius allows for usage of ut to 32 threads on the same node without additional costs

elif MACHINE == 'local':
    pred_var_path = './' # For retrieving the set of aggregated scaling vectors

    SAVE_DIR = pers_file_dir = './models/' # used for storing the trained model

    SF_DIR = './fitted_sf/' # used for storing the scaling factor produced bij ML models

    RESULTS_DIR = './results/'

    CPU_COUNT = multiprocessing.cpu_count() - 2 # The -2 is placed in order to maintain a relatively fast PC when running the model
else:
    raise NotImplementedError(f'machine "{MACHINE}" has not been implemented')



SARIMA_params = {'order': (2,0,2),             # Defining the regular AR, I and MA dependencies
                 'seasonal_order': (1,0,1,52), # Defining the seasonal dependencies
                 'trend': 'c'                  # Adding an intercept term
                 }

transCom_dict = {1:'North American Boreal',
                   2:'North American Temperate',
                   7:'Eurasia Boreal',
                   8:'Eurasia Temperate',
                   11:'Europe',
                  }

In [32]:
def eval_model(model, dat, model_name, test_or_train, show_fit=False):
    '''
    Evaluate the model using the provided testing data
    :param model: The model which is to be tested
    :param dat: The data that is to be used for testing the model. Also includes any predictor variables.
                Can both be used for testing the fit on both trianing data and testing data
    :param model_name: The used ML-algorithm
    :param test_or_train: Flag to indicate whether the passed data is training data ot testing data
    :param show_fit: Flag to indicate whether a plot of the fit should be provided
    :return:
    '''
    flux_dat = dat.prior_flux_per_s.values
    if model_name == "SARIMA":
        true_dat = dat.sf_per_eco.values
        if test_or_train == 'test':
            start_index = len(model.fittedvalues)
            final_model = model.append(true_dat)
        elif test_or_train == 'train':
            start_index = 0
            final_model = model
        else:
            raise Exception(f'test_or_train not specified:{test_or_train}')
        prediction = final_model.get_prediction(start=start_index)
        predict_ci = prediction.conf_int()
        pred_dat = prediction.predicted_mean
        if show_fit:

            # Graph
            fig, ax = plt.subplots(figsize=(9,4))
            title = test_or_train + ' data: predicted sf of ecoregion ' + str(dat.eco_regions.values)
            ax.set(title=title, xlabel='Date', ylabel='Scaling factor')

            # Plot data points
            dat.plot.scatter(x='time',y='sf_per_eco', ax=ax, label='Observed', c='C00')
            # Plot predictions
            plt.plot(dat.time.values, pred_dat, label='One-step-ahead forecast', c='C01')
            ci = predict_ci
            ax.fill_between(dat.time.values, ci[:,0], ci[:,1], color='C01', alpha=0.1)

            legend = ax.legend(loc='lower right')

            plt.show()
    else:
        raise NotImplementedError(f'Model evaluation of {model_name} not implemented')

    # Make sure all provided datasets heve the same length
    assert (len(true_dat)==len(pred_dat)) and (len(true_dat)==len(flux_dat)), 'Passed datasets are do not have the same length: '

    # Determine the performance in scaling factor space
    sf_ME = (np.sum(true_dat)-np.sum(pred_dat))/len(true_dat)
    sf_MAE = metrics.mean_absolute_error(true_dat, pred_dat)
    sf_MAPE = metrics.mean_absolute_percentage_error(true_dat, pred_dat)
    sf_RMSE = np.sqrt(metrics.mean_squared_error(true_dat, pred_dat))
    sf_r2 = metrics.r2_score(true_dat, pred_dat)

    # Move evaluation to flux space
    true_flux = true_dat * flux_dat
    pred_flux = pred_dat * flux_dat

    # Determine the performance in flux space
    flux_ME = (np.sum(true_dat)-np.sum(pred_flux))/len(true_dat)
    flux_MAE = metrics.mean_absolute_error(true_flux, pred_flux)
    flux_MAPE = metrics.mean_absolute_percentage_error(true_flux, pred_flux)
    flux_RMSE = np.sqrt(metrics.mean_squared_error(true_flux, pred_flux))
    flux_r2 = metrics.r2_score(true_flux, pred_flux)
    return {'sf_ME_'+test_or_train:sf_ME,
           'sf_MAE_'+test_or_train:sf_MAE,
           'sf_MAPE_'+test_or_train:sf_MAPE,
           'sf_RMSE_'+test_or_train:sf_RMSE,
           'sf_r2_'+test_or_train:sf_r2,
           'flux_ME_'+test_or_train:flux_ME,
           'flux_MAE_'+test_or_train:flux_MAE,
           'flux_MAPE_'+test_or_train:flux_MAPE,
           'flux_RMSE_'+test_or_train:flux_RMSE,
           'flux_r2_'+test_or_train:flux_r2}

def get_model_dir(algorithm, start_year):
    '''
    The path to the directory in which the model should be saved
    :param algorithm: The algorithm used for creating the model, sometimes also refered to as 'model_name'
    :param start_year: The year at which the training data started
    :return: Directory of the (to be) saved model
    '''
    file_dir = SAVE_DIR + algorithm + '/' + start_year + '/'
    return file_dir

def get_file_name(algorithm, eco_region, extention):
    '''
    Function used for generating the file name of a model
    :param algorithm: The algorithm used for creating the model, sometimes also refered to as 'model_name'
    :param eco_region: The eco_region to which the model applies
    :return: file name of the (to be) saved model
    '''
    file_name = algorithm + '_' + str(eco_region) + '.' + extention
    return file_name

def get_model_path(algorithm, start_year, eco_region):
    '''
    Function for automatically generating the location of a saved model
    :param algorithm: The algorithm used for creating the model, sometimes also refered to as 'model_name'
    :param start_year: The year at which the training data started
    :param eco_region: The eco_region to which the model applies
    :return: The correct file path of the model.
    '''
    file_dir = get_model_dir(algorithm, start_year)
    file_name = get_file_name(algorithm, eco_region, 'pkl')
    return file_dir + file_name

def get_sf_dir(algorithm):
    '''
    Function for automatically generating the location of the fitted scaling factors
    :param algorithm: The algorithm used for creating the model, sometimes also refered to as 'model_name'
    :return: The directory at which the results file should be stored.
    '''
    file_dir = SF_DIR + algorithm + '/'
    return file_dir

def get_sf_path(algorithm, eco_region):
    '''
    Function for automatically generating the full path to the fitted scaling factors
    :param algorithm: The algorithm used for generating the scaling factors, sometimes also refered to as 'model_name'
    :param eco_region: The eco_region to which the scaling factors apply
    :return: The correct file path of the scaling factors.
    '''
    file_dir = get_sf_dir(algorithm)
    file_name = get_file_name(algorithm, eco_region, 'nc')
    return file_dir + file_name

def get_results_dir(algorithm):
    '''
    Function for automatically generating the location of the analysed results
    :param algorithm: The algorithm used for creating the model, sometimes also refered to as 'model_name'
    :return: The directory at which the results file should be stored.
    '''
    file_dir = RESULTS_DIR + algorithm + '/'
    return file_dir

def get_results_path(algorithm, eco_region):
    '''
    Function for automatically generating the full path to the analysed results
    :param algorithm: The algorithm used for generating the results, sometimes also refered to as 'model_name'
    :param eco_region: The eco_region to which the results apply
    :return: The correct file path of the results.
    '''
    file_dir = get_results_dir(algorithm)
    file_name = get_file_name(algorithm, eco_region, 'pkl')
    return file_dir + file_name

def write_model(model, model_name, start_year, eco_region):
    '''
    Function used to save a model in the correct directory with an identifiable name. Uses Pickle for saving the model object
    :param model: The model which is to be saved
    :param model_name: The used ML-algorithm
    :param start_year: The date at which the training data starts
    :param eco_region: The name of the ecoregion to which the model applies
    :return: None
    '''

    file_dir = get_model_dir(model_name, start_year)
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    file_name = get_file_name(model_name, eco_region)
    file = file_dir + file_name
    with open(file, "wb") as f:
        pkl.dump(model, f, protocol=5)

def write_results(data, algorithm, ecoregion):
    '''
    Used for writing results of the sub model into a pickled file
    :param data: The data that is to be pickled
    :param file_path: The location at which the data should be stored
    :return: None
    '''
    results_dir = get_results_dir(algorithm)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)
    file_path = get_results_path(algorithm, ecoregion)
    print(f'writing intermediate results for region {ecoregion}')
    data.to_pickle(file_path, protocol=5)

def write_sf(data, algorithm, ecoregion):
    '''
    Used for writing results of the sub model into a pickled file
    :param data: The data that is to be pickled
    :param file_path: The location at which the data should be stored
    :return: None
    '''
    sf_dir = get_sf_dir(algorithm)
    if not os.path.isdir(sf_dir):
        os.makedirs(sf_dir)
    file_path = get_sf_path(algorithm, ecoregion)
    print(f'writing intermediate results for region {ecoregion}')
    data.to_netcdf(file_path)

def read_SARIMA_model(file_path, train_dat):
    '''
    Function for unpickling trained models
    :param file_path: location of the pickle file
    :return: The unpickled model
    '''
    with open(file_path, 'rb') as f:
        model = pkl.load(f)

    # In earlier version of the model, the saved objects contained too much redundant data. Some remnants may remain of these large pickled objects.
    if isinstance(model, sm_tsa.statespace.sarimax.SARIMAXResultsWrapper):
        trained_model = model
        with open(file_path, "wb") as f:
            pkl.dump(model.params, f, protocol=5)
    elif isinstance(model, np.ndarray): # The newly pickled objects should only contain a single numpy array with the parameters for each term in the SARIMA model
        target_dat = train_dat.sf_per_eco.values
        trained_model = sm.tsa.SARIMAX(target_dat, **SARIMA_params)
        trained_model = trained_model.filter(model)
    else:
        raise NotImplementedError(f'Unkown file type: {type(model)} encountered when loading model')
    print(f"finished loading model {file_path}")
    return trained_model

def train_model(train_dat, model_name, eco_region, display = 0):
    '''
    Function for training a model on the provided training data
    :param train_dat: The data used for training. Includes both target data and predictor data
    :param model_name: Name of the ML-algorithm to be used for training
    :return: A trianed model
    '''

    start_year = str(train_dat.time.dt.year.min().values)

    print(f"starting process for {model_name}, at ecoregion {eco_region} using data starting at {start_year}")

    if model_name == 'SARIMA':
        target_data = train_dat.sf_per_eco
        model = sm.tsa.statespace.SARIMAX(target_data.values,
                                         **SARIMA_params
                                          )
        fitted_model=model.fit(maxiter=100, disp=display) # method='cg'
    else:
        raise NotImplementedError(f'Training of model {model_name} has not been implemented')
#     eco_region = str(train_dat.eco_regions.values)

    # Save model for future usage
    write_model(fitted_model, model_name, start_year, eco_region)
    return fitted_model

def test_eco_region(eco_dat, model_name):
    region, data = eco_dat
    results_df = pd.DataFrame()
    sf_data = [None] * 17

    #Set aside the testing data. Using the classical 80%-20% split
    test_ds = data.loc[dict(time=slice("2017", "2020"))]
    region_dat = data.loc[dict(time=slice("2000", "2016"))]
    for year in range(2000, 2017):
        # Load the model if it has been trained already
        file_name = get_model_path(model_name, str(year), region)

        # Determine training data and train model
        train_ds = region_dat.loc[dict(time=slice(str(year), "2016"))]


        try:
            trained_model = read_SARIMA_model(file_name, train_ds)
            print(f'file "{file_name}" has already contains a trained model. Skipping training process')

        except pkl.UnpicklingError:
            print(f'Could no unpickle model from eco-region {region} and starting year {year}. Training new model')
            # Pickled model somehow got corrupted. Train a new model
            trained_model = train_model(train_ds, model_name, region)
        except (EOFError, FileNotFoundError):
            print(f'No file exists, or the the existing file is empty. Training new model')
            # If no model exists, train a new one
            trained_model = train_model(train_ds, model_name, region)


        # Evaluate the model, both on training and testing data
        print(f'Generating perforamce on training set - region: {region}, year: {year}')
        train_results = eval_model(trained_model, train_ds, model_name, 'train', show_fit=False)
        print(f'Generating perforamce on test set - region: {region}, year: {year}')
        test_results = eval_model(trained_model, test_ds, model_name, 'test', show_fit=False)
        model_params = {
                'eco_region':region,
                'start_year':year,
                'N_train_years':(2017-year),
                'N_train_obs':len(train_ds.time),
                'N_test_years':4,
                'N_test_obs':len(test_ds.time)
        }

        sf_data[model_params['N_train_years']-1] = create_sf_dataset(trained_model, xr.concat([train_ds, test_ds], 'time'), region)

        # unpack all dicts to form single results dict
        model_results = pd.DataFrame({**model_params, **train_results, **test_results}, index=[region])
        if len(results_df) != 0:
            results_df = pd.concat([results_df, model_results])
        else:
            results_df = model_results
    write_results(results_df, model_name, region)
    sf_ds = xr.concat(sf_data, 'n_train_years', data_vars='minimal', compat='no_conflicts')
    write_sf(sf_ds, model_name, region)
    return results_df, sf_ds

def create_sf_dataset(model, data, eco_region):

    start_year = pd.DatetimeIndex(data.time).year.min()
    n_train_years = 2017-start_year
    test_data = data.loc[dict(time=slice("2017", "2020"))]

    final_model = model.append(test_data.sf_per_eco.values)

    # Determine the predicted scaling factor
    prediction = final_model.get_prediction(start=0)
    pred_sf = prediction.predicted_mean
    pred_sf = xr.DataArray(
        data=[pred_sf],
        dims=["n_train_years", "time"],
        coords=dict(
            time=data.time,
            n_train_years = [n_train_years],
        ),
        attrs=dict(
            Description="Predicted scaling factor",
            Units="-",
            # model=model_name,
        ),
        name='predicted_sf'
    )

    # Store the prior flux
    prior_flux = data.prior_flux_per_s
    prior_flux.attrs['Description'] = 'Total prior flux for each eco-region'
    prior_flux.attrs['Units'] = 'mol s^-1'

    # Determine the predicted flux
    pred_flux = prior_flux * pred_sf
    pred_flux = pred_flux.rename('predicted_flux').transpose()
    pred_flux.attrs['Description'] = 'Total predicted flux eco_region'
    pred_flux.attrs['Units'] = 'mol s^-1'

    # Store the optimized scaling factor
    opt_sf = data.sf_per_eco
    opt_sf.attrs['Description'] = 'The Effective optimized scaling factor of each eco-region'
    opt_sf.attrs['Units'] = '-'

    # Determine the optimised flux
    opt_flux = data.opt_flux
    opt_flux = opt_flux.rename('optimized_flux')
    opt_flux.attrs['Description'] = 'Total optimized flux of each eco-region'
    opt_flux.attrs['Units'] = 'mol s^-1'

    # Store the time points used for training
    training_time = data.time.where((data.time < test_data.time.values[0]))
    training_time = training_time.rename('training_time')
    training_time = training_time.expand_dims({'n_train_years':[n_train_years]})
    training_time.attrs['Description'] = 'List of time points used for training the model for each number of training years'

    # Store the time points used for training
    testing_time = data.time.where((data.time >= test_data.time.values[0]))
    testing_time = testing_time.rename('testing_time')
    testing_time.attrs['Description'] = 'List of time points used for testing the model'

    # Store the surface area of the complete eco_region
    surface_area = data.eco_area.min()

    # Determine TransCom region
    tc_region = int((eco_region-1)/19+1)
    transCom = xr.DataArray(
        data=tc_region,
        coords=dict(
            eco_regions=data.coords['eco_regions'].values,
        ),
        attrs=dict(
            description="TransCom region of the eco region",
            full_names=transCom_dict,
        ),
        name='tc_region'
    )
    return xr.merge([prior_flux, pred_sf, pred_flux, opt_sf, opt_flux, training_time, testing_time, surface_area, transCom])

def run_model(model_name, complete_ds):
    # the models will be evaluated per ecoregion. Hence, the original dataset is split into a separate one for each ecoregion
    eco_region_dat = list(complete_ds.groupby("eco_regions"))

    # Preload all data to prevent loading error during multithreading process
    eco_region_dat = [(region, data.load(scheduler='sync')) for region, data in eco_region_dat]

    if MACHINE == 'local': # reduce number of ecoregions in order to maintain speed within debugging process
        eco_region_dat = eco_region_dat[:2]

    with Pool(CPU_COUNT) as pool:
        list_of_results = pool.starmap(test_eco_region, zip(eco_region_dat, repeat(model_name)))

    res_list, sf_list = map(list, zip(*list_of_results))

    sf_ds = xr.concat(sf_list, 'eco_regions', data_vars='minimal')
    corrected_train_time = sf_ds.training_time.isel(dict(eco_regions=1)).squeeze()
    corrected_test_time = sf_ds.testing_time.isel(dict(eco_regions=1)).squeeze()
    sf_ds.update({'training_time':corrected_train_time, 'testing_time':corrected_test_time})

    results_df = pd.concat(res_list)

    return results_df, sf_ds



In [33]:
import timeit

# Loading all necessary data
with xr.open_dataset(pred_var_path + 'vars_per_eco_update.nc') as ds:
    complete_ds = ds
start_time = timeit.default_timer()
res_df, sf_ds = run_model('SARIMA', complete_ds)
stop_time = timeit.default_timer()

print(f'Elapsed time: {stop_time-start_time}')

results_file = RESULTS_DIR + 'SARIMA_results.pkl'
print(res_df)
res_df.to_pickle(results_file)

sf_file = SF_DIR + 'SARIMA_sf.nc'
print(sf_ds)
sf_ds.to_netcdf(sf_file)



finished loading model ./models/SARIMA/2000/SARIMA_1.0.pkl
file "./models/SARIMA/2000/SARIMA_1.0.pkl" has already contains a trained model. Skipping training process
Generating perforamce on training set - region: 1.0, year: 2000
Generating perforamce on test set - region: 1.0, year: 2000
finished loading model ./models/SARIMA/2000/SARIMA_3.0.pkl
file "./models/SARIMA/2000/SARIMA_3.0.pkl" has already contains a trained model. Skipping training process
Generating perforamce on training set - region: 3.0, year: 2000
Generating perforamce on test set - region: 3.0, year: 2000
finished loading model ./models/SARIMA/2001/SARIMA_1.0.pkl
file "./models/SARIMA/2001/SARIMA_1.0.pkl" has already contains a trained model. Skipping training process
Generating perforamce on training set - region: 1.0, year: 2001
Generating perforamce on test set - region: 1.0, year: 2001
finished loading model ./models/SARIMA/2001/SARIMA_3.0.pkl
file "./models/SARIMA/2001/SARIMA_3.0.pkl" has already contains a train

In [36]:
sf_ds.tc_region.attrs['full_names'] = transCom_dict
sf_ds

In [112]:
concat_ds = xr.concat(results, 'eco_regions', data_vars='minimal')
corrected_train_time = concat_ds.training_time.isel(dict(eco_regions=1)).squeeze()
corrected_test_time = concat_ds.testing_time.isel(dict(eco_regions=1)).squeeze()
concat_ds.update({'training_time':corrected_train_time, 'testing_time':corrected_test_time})
concat_ds
# corrected_train_time
# results[0]#.prior_flux_per_s.loc[dict(n_train_years=1)].values


In [31]:
complete_ds

In [13]:
sf_list

[<xarray.Dataset>
 Dimensions:           (time: 1096, n_train_years: 17)
 Coordinates:
   * time              (time) datetime64[ns] 2000-01-01 2000-01-08 ... 2020-12-26
   * n_train_years     (n_train_years) int64 1 2 3 4 5 6 7 ... 12 13 14 15 16 17
     eco_regions       float64 1.0
 Data variables:
     prior_flux_per_s  (time) float32 1.726e+06 1.723e+06 ... 1.731e+06 1.727e+06
     predicted_sf      (n_train_years, time) float64 nan nan ... 0.6816 0.6938
     predicted_flux    (n_train_years, time) float64 nan nan ... 1.198e+06
     sf_per_eco        (time) float32 0.2209 0.7194 0.6676 ... 0.5112 0.2465
     optimized_flux    (time) float32 3.813e+05 1.24e+06 ... 8.849e+05 4.257e+05
     training_time     (n_train_years, time) datetime64[ns] NaT NaT ... NaT NaT
     testing_time      (time) datetime64[ns] NaT NaT ... 2020-12-19 2020-12-26
     eco_area          float32 2.315e+12
     tc_region         int64 1
 Attributes:
     Description:  Total flux of the eco_region
     Units: 

In [8]:
# Loading all necessary data
with xr.open_dataset(pred_var_path + 'vars_per_eco_update.nc') as ds:
    complete_ds = ds.loc[dict(eco_regions=slice(0,5))]

results = run_model('SARIMA', complete_ds)
print(results)
# file_name = 'SARIMA_results.pkl'

# results.to_pickle(results_dir+file_name)


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.13343D+00    |proj g|=  7.24490D-02

At iterate    5    f=  2.12559D+00    |proj g|=  1.13357D-03

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8      9     11      1     0     0   4.748D-05   2.126D+00
  F =   2.1255879829581397     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Too few observations to estimate starting parameters%s.'


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.45712D+00    |proj g|=  4.71729D-02


 This problem is unconstrained.



At iterate    5    f=  2.45079D+00    |proj g|=  1.69062D-03

At iterate   10    f=  2.45051D+00    |proj g|=  9.41502D-03

At iterate   15    f=  2.43861D+00    |proj g|=  3.04180D-02

At iterate   20    f=  2.43391D+00    |proj g|=  1.58365D-02

At iterate   25    f=  2.42919D+00    |proj g|=  4.68406D-02

At iterate   30    f=  2.42194D+00    |proj g|=  5.63740D-03

At iterate   35    f=  2.41548D+00    |proj g|=  1.95637D-02

At iterate   40    f=  2.40890D+00    |proj g|=  1.06472D-01

At iterate   45    f=  2.40461D+00    |proj g|=  6.92697D-01

At iterate   50    f=  2.39667D+00    |proj g|=  7.41175D-01

At iterate   55    f=  2.38772D+00    |proj g|=  8.40293D-01

At iterate   60    f=  2.38348D+00    |proj g|=  1.84355D+00

At iterate   65    f=  2.37849D+00    |proj g|=  5.19047D+00

At iterate   70    f=  2.37461D+00    |proj g|=  8.03168D-01



   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     73    135      1     0     0   7.642D-03   2.375D+00
  F =   2.3745901986296722     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
   eco_region  start_year  N_train_years  N_train_obs  N_test_years  \
0         1.0      2015.0            2.0        105.0           4.0   
1         1.0      2016.0            1.0         53.0           4.0   

   N_test_obs  sf_ME_train  sf_MAE_train  sf_MAPE_train  sf_RMSE_train  ...  \
0       208.0    -0.015599      0.598733       1.290804       2.015637  ...   
1       208.0    -0.191935      0.760710       0.77

  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.55772D+00    |proj g|=  5.13919D-02

At iterate    5    f=  1.55417D+00    |proj g|=  2.20372D-03

At iterate   10    f=  1.55388D+00    |proj g|=  2.94095D-03

At iterate   15    f=  1.55386D+00    |proj g|=  1.39761D-03

At iterate   20    f=  1.55375D+00    |proj g|=  4.88883D-03

At iterate   25    f=  1.55371D+00    |proj g|=  2.38697D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     28     32      1     0     0   

  warn('Too few observations to estimate starting parameters%s.'


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  1.09831D+00    |proj g|=  1.05250D-01


 This problem is unconstrained.



At iterate    5    f=  1.09242D+00    |proj g|=  4.36725D-02

At iterate   10    f=  1.08761D+00    |proj g|=  1.58510D-02

At iterate   15    f=  1.07763D+00    |proj g|=  1.98921D-01

At iterate   20    f=  1.07374D+00    |proj g|=  6.27866D-02

At iterate   25    f=  1.06813D+00    |proj g|=  7.50888D-02

At iterate   30    f=  1.06370D+00    |proj g|=  3.96731D-01

At iterate   35    f=  1.05539D+00    |proj g|=  2.40806D-01

At iterate   40    f=  1.04809D+00    |proj g|=  2.69391D-01

At iterate   45    f=  1.04281D+00    |proj g|=  7.86165D-01

At iterate   50    f=  1.03565D+00    |proj g|=  7.63920D-01

At iterate   55    f=  1.02882D+00    |proj g|=  2.55144D+00

At iterate   60    f=  1.02370D+00    |proj g|=  3.05628D+00

At iterate   65    f=  1.02066D+00    |proj g|=  3.36070D-01

At iterate   70    f=  1.02053D+00    |proj g|=  6.70328D-01

At iterate   75    f=  1.02014D+00    |proj g|=  3.12000D-02



   evaluations in the last line search.  Termination
   may possibly be caused by a bad search direction.



           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    8     77    126      1     0     0   1.667D-01   1.020D+00
  F =   1.0201378163967489     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
   eco_region  start_year  N_train_years  N_train_obs  N_test_years  \
0         3.0      2015.0            2.0        105.0           4.0   
1         3.0      2016.0            1.0         53.0           4.0   

   N_test_obs  sf_ME_train  sf_MAE_train  sf_MAPE_train  sf_RMSE_train  ...  \
0       208.0    -0.005351      0.720031       5.389230       1.144732  ...   
1       208.0     0.059483      0.559039       4.80

  warn('Too few observations to estimate starting parameters%s.'
 This problem is unconstrained.


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.48549D+00    |proj g|=  7.09984D-02

At iterate    5    f=  2.47236D+00    |proj g|=  1.86176D-02

At iterate   10    f=  2.46735D+00    |proj g|=  3.93073D-03

At iterate   15    f=  2.46628D+00    |proj g|=  3.82534D-03

At iterate   20    f=  2.46624D+00    |proj g|=  2.09829D-04

At iterate   25    f=  2.46619D+00    |proj g|=  1.47595D-03

At iterate   30    f=  2.46595D+00    |proj g|=  1.83729D-02

At iterate   35    f=  2.46583D+00    |proj g|=  8.81072D-04

At iterate   40    f=  2.46575D+00    |proj g|=  8.48432D-03

At iterate   45    f=  2.46572D+00    |proj g|=  6.46075D-04

At iterate   50    f=  2.46572D+00    |proj g|=  7.52787D-04

At iterate   55    f=  2.46571D+00    |proj g|=  9.78807D-04

At iterate   60    f=  2.46570D+00    |proj g|=  7.54984D-04

At iterate   65    f=  2.4

  warn('Too few observations to estimate starting parameters%s.'


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            8     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  2.46818D+00    |proj g|=  9.47116D-02


 This problem is unconstrained.



At iterate    5    f=  2.45276D+00    |proj g|=  1.80932D-02

At iterate   10    f=  2.44484D+00    |proj g|=  1.26906D-02

At iterate   15    f=  2.44169D+00    |proj g|=  8.95133D-03

At iterate   20    f=  2.43835D+00    |proj g|=  3.10235D-02

At iterate   25    f=  2.43670D+00    |proj g|=  2.64718D-03

At iterate   30    f=  2.43656D+00    |proj g|=  1.79785D-03

At iterate   35    f=  2.43652D+00    |proj g|=  3.64520D-03

At iterate   40    f=  2.43650D+00    |proj g|=  7.75041D-04

At iterate   45    f=  2.43643D+00    |proj g|=  6.02301D-03

At iterate   50    f=  2.43634D+00    |proj g|=  4.64671D-04

At iterate   55    f=  2.43634D+00    |proj g|=  6.42392D-05

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F   

In [10]:
pd.read_pickle('/gpfs/work1/0/ctdas/awoude/Ritten/results/SARIMA_results.pkl')

Unnamed: 0,eco_region,start_year,N_train_years,N_train_obs,N_test_years,N_test_obs,sf_ME_train,sf_MAE_train,sf_MAPE_train,sf_RMSE_train,...,sf_ME_test,sf_MAE_test,sf_MAPE_test,sf_RMSE_test,sf_r2_test,flux_ME_test,flux_MAE_test,flux_MAPE_test,flux_RMSE_test,flux_r2_test
0,1.0,2016,1,53,4,208,-0.191935,0.76071,0.771409,2.812995,...,1.177059,1.818227,5.394846,3.351529,-3.292639,-645784.346063,2710695.0,5.394846,4196090.0,-2.690525
0,3.0,2016,1,53,4,208,0.059483,0.559039,4.807472,0.721612,...,0.361718,1.772138,12.194847,2.382867,-2.653813,-97078.394059,722802.2,12.194847,1038486.0,-1.289774
0,4.0,2016,1,53,4,208,0.014209,2.250739,1.126005,2.795952,...,0.032366,4.610867,12.408523,5.938386,-2.630859,520.090212,140960.3,12.408523,196958.3,-1.981104
