In [None]:
## general
import numpy as np
import datetime
from sklearn.externals import joblib
import copy
import cf_units
import xarray as xr
import os
import sys
# `from tqdm import notebook.tqdm as tqdm` is a SyntaxError (a dotted name is
# not allowed after `import` in a from-import); import the notebook progress
# bar from its submodule instead.
from tqdm.notebook import tqdm
import datetime as dt
import matplotlib as mpl

## statistics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import multivariate_normal # to compute likelihood
from sklearn.impute import SimpleImputer
#from scipy.stats import shapiro  #check normalicy of seasonal trend distribution
from scipy.optimize import curve_fit, fmin, fminbound, minimize, rosen_der, least_squares
from sklearn.preprocessing import StandardScaler
import pickle

##import functions for fitting
from symfit import parameters, variables, Fit
from symfit import pi,sqrt,log,exp,sinh
from symfit import sin, cos


# statistics which aren't all that nice in python
import rpy2.robjects as robjects

## my stuff
sys.path.insert(1,'/home/tristan/mesmer/tools')
#from tools.loading import load_data_single_mod
from tools.processing import AR1_predict, compute_llh_cv,gaspari_cohn
from tools.plotting import TaylorDiagram


## plotting
import matplotlib.pyplot as plt
import numpy.ma as ma
import cartopy.crs as ccrs
import mplotutils as mpu

##for parallelisation
from sklearn.externals.joblib import Parallel, delayed


In [None]:
from sklearn.preprocessing import PowerTransformer

def power_fit(residue,y,fmin=True):
    """Fit a Yeo-Johnson PowerTransformer to the residuals.

    The residuals are first flattened to 2-D with ``idx_l.sum()`` columns.
    ``idx_l`` is read from the enclosing (notebook) namespace and is not
    defined in this function -- presumably a boolean grid-point mask; verify
    against the cell that creates it.

    Arguments:
        - residue: array whose size is a multiple of ``idx_l.sum()``
        - y: extra argument forwarded to ``fit_fmin`` (ignored if fmin is False)
        - fmin: if True, delegate the fit to ``fit_fmin`` (defined outside this
          function); otherwise use the transformer's own ``fit``

    Output:
        - whatever ``fit_fmin`` returns, or the fitted PowerTransformer
    """
    flat_residue = residue.reshape(-1, idx_l.sum())
    transformer = PowerTransformer(method='yeo-johnson')

    if not fmin:
        return transformer.fit(flat_residue)

    return fit_fmin(transformer, flat_residue, y)
    
def power_transform(mod, residue,y,fmin=True):
    """Apply a fitted power transformation to the residuals.

    The residuals are flattened to 2-D with ``idx_l.sum()`` columns before the
    transformation, and the result is reshaped to the same 2-D layout.
    ``idx_l`` comes from the enclosing (notebook) namespace.

    Arguments:
        - mod: fitted transformer (passed to ``transform_fmin`` or used via
          its ``transform`` method)
        - residue: array whose size is a multiple of ``idx_l.sum()``
        - y: extra argument forwarded to ``transform_fmin`` (ignored if fmin is False)
        - fmin: if True, use ``transform_fmin`` (defined outside this function)

    Output:
        - transformed residuals with shape (-1, idx_l.sum())
    """
    flat_residue = residue.reshape(-1, idx_l.sum())

    transformed = transform_fmin(mod, flat_residue, y) if fmin else mod.transform(flat_residue)

    return transformed.reshape(-1, idx_l.sum())

def power_inv_transform(mod, residue,y,fmin=True):
    """Undo a fitted power transformation on the residuals.

    The residuals are flattened to 2-D with ``idx_l.sum()`` columns before the
    inverse transformation, and the result is reshaped to the same 2-D layout.
    ``idx_l`` comes from the enclosing (notebook) namespace.

    Arguments:
        - mod: fitted transformer (passed to ``inverse_transform_fmin`` or used
          via its ``inverse_transform`` method)
        - residue: array whose size is a multiple of ``idx_l.sum()``
        - y: extra argument forwarded to ``inverse_transform_fmin`` (ignored if
          fmin is False)
        - fmin: if True, use ``inverse_transform_fmin`` (defined outside this
          function)

    Output:
        - back-transformed residuals with shape (-1, idx_l.sum())
    """
    flat_residue = residue.reshape(-1, idx_l.sum())

    if fmin:
        restored = inverse_transform_fmin(mod, flat_residue, y)
    else:
        restored = mod.inverse_transform(flat_residue)

    return restored.reshape(-1, idx_l.sum())

def compute_llh_cv(res_tr,res_cv,phi):
    """ Sum the log likelihood of held-out residuals under a localized covariance estimated from the training residuals

    Keyword arguments:
        - res_tr: training residuals with the held-out fold removed and the local mean response subtracted
                  (nr ts x nr gp). Must not contain nans
        - res_cv: residuals of the held-out fold
        - phi: localisation matrix that is multiplied element-wise with the empirical covariance
               (phi = output of fct gaspari_cohn(geo_dist/L) for one localisation radius L)

    Output:
        - sum over the held-out time slots of the zero-mean multivariate normal log pdf

    """

    # empirical covariance between grid points (columns), tapered element-wise by phi
    localized_cov = phi * np.cov(res_tr, rowvar=False)

    # the residuals are expected to have zero mean
    zero_mean = np.zeros(phi.shape[0])

    logpdf_per_ts = multivariate_normal.logpdf(
        res_cv,
        mean=zero_mean,
        cov=localized_cov,
        allow_singular=True,
    )

    return np.sum(logpdf_per_ts)

def leave_one_out(L_set,nr_folds,residue_trans,idx_fo_tot,phi,max_gap=250):
    """Select a localisation radius by cross-validated log likelihood.

    The radii in ``L_set`` are tried in order. For each radius the log
    likelihood of every held-out fold (``compute_llh_cv``) is summed over all
    folds, and the radius with the largest sum so far is kept as ``L_sel``.
    The search stops early once the radius just evaluated lies more than
    ``max_gap`` beyond the selected one: by experience, once larger radii stop
    being selected they are not selected again, and stopping early avoids
    singular covariance matrices at large radii.

    Arguments:
        - L_set: increasing sequence of candidate localisation radii
        - nr_folds: unused; kept so existing positional calls keep working
        - residue_trans: residuals, indexed along the first axis by the fold masks
        - idx_fo_tot: dict {fold number (0..n-1): boolean mask of the held-out rows}
        - phi: dict {L: localisation matrix for radius L}
        - max_gap: largest allowed distance between the radius just evaluated
                   and the selected radius before the search stops (default 250,
                   the value that used to be hard-coded)

    Output:
        - dict with keys
            'llh_max': largest summed log likelihood found
            'all':     {L: {fold number: log likelihood of that fold}}
            'sum':     {L: log likelihood summed over folds}
            'L_sel':   selected localisation radius
    """

    def folds_calc(idx_fo,residue_trans,phi,L):
        """Log likelihood of one held-out fold given the remaining rows."""
        res_tot_est = residue_trans[~idx_fo]
        res_tot_fo = residue_trans[idx_fo]

        return compute_llh_cv(res_tot_est,res_tot_fo,phi[L])

    idx_L=0
    L = L_set[idx_L]

    df_llh_cv={}
    # start from -inf rather than a magic threshold (-10000): summed log
    # likelihoods can easily be smaller than any fixed number, in which case
    # no radius would ever be selected
    df_llh_cv['llh_max']=-np.inf
    df_llh_cv['all']={}
    df_llh_cv['sum']={}
    df_llh_cv['L_sel']=L_set[idx_L]

    # `idx_L < len(L_set)` guards against running past the end of L_set when
    # the last radius was evaluated but not selected and is still within
    # max_gap of the selected one (this used to raise IndexError)
    while (idx_L < len(L_set)
           and L-df_llh_cv['L_sel']<=max_gap
           and df_llh_cv['L_sel']<L_set[-1]):
        L = L_set[idx_L]
        print('start with L ',L)
        df_llh_cv['all'][L]={}
        df_llh_cv['sum'][L]=0
        for i_fold_par in tqdm(np.arange(len(idx_fo_tot.keys()))):
            llh_fold = folds_calc(idx_fo_tot[i_fold_par],residue_trans,phi,L)
            df_llh_cv['all'][L][i_fold_par]=llh_fold
            df_llh_cv['sum'][L] += llh_fold

        if df_llh_cv['sum'][L]>df_llh_cv['llh_max']:
            df_llh_cv['L_sel']=L
            df_llh_cv['llh_max']=df_llh_cv['sum'][L]
            print('currently selected L=',df_llh_cv['L_sel'])

        idx_L+=1
    return df_llh_cv

def lin_func(x, a, b):
    """Straight line with slope ``a`` and intercept ``b`` evaluated at ``x``."""
    scaled = a * x
    return scaled + b

In [None]:
# Build the localisation matrices phi[L] for every candidate localisation
# radius L: phi[L][i,j] = gaspari_cohn(geo_dist[i,j]/L), where geo_dist is the
# distance matrix loaded from disk.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
dir_in_geo_dist = '/home/tristan/mesmer/data/'
geo_dist=np.load(dir_in_geo_dist + 'geo_dist.npy')
# shorter candidate list (same values as the L_set used in the cross-validation cell):
#L_set = [1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000,4250,4500,4750,5000,5250,5500] 
L_set = [1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000,4250,4500,4750,5000,5250,5500,6000,6250,6500,7000,7500,8000,8500]
# High localisation radii sometimes do not work because of singular matrices.
# The cross-validation therefore stops once the likelihoods start declining.
# It is not yet certain that this works without issues.

# alternative candidate lists used for specific models:
#L_set = [9000,9250] # for ['MCM-UA-1-0']
#L_set = [10750]
#L_set = [500,750,1000,1250,1500] # for re-doing emulations of INM & IPSL models
phi = {}
for L in tqdm(L_set):
    # one matrix per radius, same shape as the distance matrix
    phi[L] = np.zeros(geo_dist.shape)

    # gaspari_cohn is called once per matrix element
    # NOTE(review): if gaspari_cohn accepts arrays this double loop could be a
    # single call on geo_dist/L -- confirm against tools.processing
    for i in tqdm(np.arange(geo_dist.shape[0])):
        for j in np.arange(geo_dist.shape[1]):
            phi[L][i,j]=gaspari_cohn(geo_dist[i,j]/L)
        # progress message every 1000 rows
        if i % 1000 == 0:
                print('done with L:',L,'i:', i)

In [None]:
# Cross-validation driver: for each model, load the transformed training
# residuals and the AR(1) coefficients, subtract the deterministic AR(1)
# process, and run leave_one_out once per i_mon (12 jobs) to select the
# localisation radius. The results are dumped to llh_cv_all.pkl.
#
# This cell relies on names created elsewhere in the notebook:
#   - idx_l (not defined in this cell; presumably a boolean grid-point mask -- TODO confirm)
#   - phi   (dict of localisation matrices keyed by L; must contain every L in L_set)
model = "Best observations"
models=[model]

df_llh_cv_all={}
# candidate localisation radii handed to leave_one_out
L_set = [1500,1750,2000,2250,2500,2750,3000,3250,3500,3750,4000,4250,4500,4750,5000,5250,5500] 

coeff_0={}
coeff_1={}

train_residue_all_spat={}
train_residue_trans={}
power_trans={}

# used below as the number of leave-one-out folds; each fold mask has this
# length and is applied to the first axis of the residuals, so it must equal
# the length of that axis
nr_years=112

# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
dir_in_data_mod = '/home/tristan/mesmer/output/'

#L_set = [500,750,1000,1250,1500]
#L_set = [9000,9250]

for model in models:
        # The loads below make this cell runnable without the cells that
        # produced these objects.
        # NOTE(review): joblib.load unpickles arbitrary objects -- only load trusted files.
        
        train_residue_trans[model]=joblib.load(dir_in_data_mod+'train_residue_trans.pkl')
        coeffs_temp = joblib.load(dir_in_data_mod+'AR(1)_coeffs.pkl')
        
        # first axis of the stored array: 0 -> constant term, 1 -> lag-1 factor
        coeff_0[model] = coeffs_temp[0,:,:]
        coeff_1[model] = coeffs_temp[1,:,:]
        # loaded but not used further in this cell
        power_trans[model]=joblib.load(dir_in_data_mod+'yeo_johnson_pt_fmin_log.pkl')
        
        # zero array with 120 more rows than the residuals, reshaped to
        # (-1, 12, idx_l.sum()): 120/12 = 10 extra slices along the first axis
        AR_process=np.zeros([train_residue_trans[model].reshape(-1,idx_l.sum()).shape[0]+120,
                         idx_l.sum()]).reshape(-1,12,idx_l.sum())
    
        # deterministic recursion (no noise term): each step is
        # coeff_0 + coeff_1 * previous step; slice t=0 stays zero
        for t in np.arange(1,AR_process.shape[0]):
            for i_mon in range(12):

                if i_mon==0:
                    # the predecessor of i_mon 0 is i_mon 11 of the previous slice
                    AR_process[t,i_mon,:]=coeff_0[model][i_mon,:]+coeff_1[model][i_mon,:]*AR_process[t-1,11,:]
                else:
                     AR_process[t,i_mon,:]=coeff_0[model][i_mon,:]+coeff_1[model][i_mon,:]*AR_process[t,i_mon-1,:]

        # drop the 10 extra leading slices (presumably a spin-up period) so the
        # first axis matches the residuals again
        AR_process= AR_process[10:,:,:]

        
        # residuals with the deterministic AR(1) part removed
        train_residue_all_spat[model]=train_residue_trans[model].reshape(-1,12,idx_l.sum())-AR_process
    
        

        # hard-coded for (very slow) leave-one-out cross-validation at the moment, to get the most out of the data
        nr_ts=nr_years
        nr_folds = nr_ts*1
        print('number folds', nr_folds)
        fold_out_list = np.arange(nr_folds)
        # idx_fo_tot[i] is a boolean mask with a single True at position i
        idx_fo_tot={}
        j=0
        for i in fold_out_list:      
            idx_fo = np.zeros(nr_folds,dtype=bool)
            idx_fo[j:j+1]=True
            idx_fo_tot[i]=idx_fo    
            j+=1    

        # carry out cross-validation to determine the localisation radius L
        print('start with localisation radius for',model)

        # the empty dict is immediately replaced by the list returned by
        # Parallel: one leave_one_out result dict per i_mon (12 entries)
        df_llh_cv_all[model]={}
        df_llh_cv_all[model]=Parallel(n_jobs=12,verbose=10)(delayed(leave_one_out)(L_set,nr_folds,train_residue_all_spat[model][:,i_mon,:],idx_fo_tot,phi) for i_mon in range(12))
        
        dir_out_data_mod = '/home/tristan/mesmer/output/'

        joblib.dump(df_llh_cv_all[model],dir_out_data_mod+'llh_cv_all.pkl')
