# Load packages

In [1]:
## general
import numpy as np
import datetime
from sklearn.externals import joblib
import copy
import cf_units
import xarray as xr
import os
import sys
from tqdm import tqdm_notebook as tqdm
import datetime as dt
import matplotlib as mpl
import math

## statistics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import multivariate_normal # to compute likelihood
from sklearn.impute import SimpleImputer
#from scipy.stats import shapiro  #check normalicy of seasonal trend distribution
from scipy.optimize import curve_fit, fmin, fminbound, minimize, rosen_der, least_squares
from sklearn.preprocessing import StandardScaler
import pickle

##import functions for fitting
from symfit import parameters, variables, Fit
from symfit import pi,sqrt,log,exp,sinh
from symfit import sin, cos


# statistics which aren't all that nice in python
import rpy2.robjects as robjects

## my stuff
sys.path.insert(1,'/home/tristan/mesmer/tools')
#from tools.loading import load_data_single_mod
from tools.processing import AR1_predict, compute_llh_cv,gaspari_cohn
from tools.plotting import TaylorDiagram


## plotting
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy.ma as ma
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import mplotutils as mpu

##for parallelisation
from sklearn.externals.joblib import Parallel, delayed


  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://num

In [2]:
import sklearn
print(sklearn.__version__)

0.20.1


# Define functions

In [3]:
##define function to do fitting on
def season_mimic(x, mon, n=2):
    """
    Build the symbolic harmonic series that is fitted to the monthly values.

    The series is the sum, over the indices i = 0 .. n-1, of
    ``(a*x + b)*sin(pi*i*(mon%12+1)/6) + (c*x + d)*cos(pi*i*(mon%12+1)/6)``
    where every index gets its own parameters ``a{i+1}``, ``b{i+1}``,
    ``c{i+1}`` and ``d{i+1}``. For i = 0 the sine factor is sin(0) and the
    cosine factor is cos(0), so that term is the plain linear trend
    ``c1*x + d1``.

    :param x: Independent variable that scales the amplitudes linearly
    :param mon: Month variable entering the sine / cosine arguments
    :param n: Number of terms in the series (indices 0 .. n-1)
    :return: The symbolic expression of the series
    """
    def _param_family(prefix):
        # One parameter per term, named <prefix>1 .. <prefix>n
        names = [prefix + str(k) for k in range(1, n + 1)]
        return parameters(','.join(names))

    a = _param_family('a')
    b = _param_family('b')
    c = _param_family('c')
    d = _param_family('d')

    # Collect the terms one by one, then add them up
    terms = []
    for i, (ai, bi, ci, di) in enumerate(zip(a, b, c, d)):
        angle = pi*i*(mon%12+1)/6
        terms.append((ai*x + bi)*sin(angle) + (ci*x + di)*cos(angle))

    return sum(terms)

In [4]:
def fit_to_bic(x_train,mon_train,y_all_mon,max_period=10):
    """
    Fit the harmonic model of every order 1 .. max_period to one grid point
    and return the fit with the lowest BIC score.

    Parameters
    ----------
    x_train : 1-D numpy array
        Predictor per time step; time steps where it is NaN are dropped from
        all three inputs.
    mon_train : 1-D numpy array
        Month value per time step, same length as ``x_train``.
    y_all_mon : numpy array
        Target values, one per time step (reshaped to one row per time step).
    max_period : int
        Highest order passed to ``season_mimic``.

    Returns
    -------
    The object returned by ``Fit(...).execute()`` for the selected order.
    """

    #create mask to mask out NaN values
    mask_nan=~np.isnan(x_train)
    # One row per time step. The row count is taken from the input itself so
    # the function does not depend on a notebook-level variable.
    y_all_mon=y_all_mon.reshape(len(mask_nan),-1)[mask_nan,:]
    # NOTE(review): z is handed to symfit as an (n, 1) array while x is (n,);
    # confirm symfit aligns these the way the fit intends.
    x_train=x_train[mask_nan]
    mon_train=mon_train[mask_nan]

    n=len(x_train)
    x, mon, z = variables('x, mon, z')

    bic=np.zeros(max_period)
    fit_results=[]
    for i in range (1,max_period+1):

        # Same penalty term as before; it is a function of the order only.
        num_params=2+(4*i)
        model_dict = {z: season_mimic(x, mon, n=i)}

        if i==1:
            # The order-1 model is fitted without month data.
            fit=Fit(model_dict, x=x_train, z=y_all_mon)
            result=fit.execute()
            prediction=fit.model(x=x_train, **result.params).z
        else:
            fit=Fit(model_dict, x=x_train, mon=mon_train,z=y_all_mon)
            result=fit.execute()
            prediction=fit.model(x=x_train, mon=mon_train,**result.params).z

        mse=mean_squared_error(y_all_mon,prediction)

        # numpy's log: the bare name `log` in this notebook is symfit's
        # symbolic log, which would build an expression instead of a float.
        bic[i-1] = n * np.log(mse) + num_params * np.log(n)
        fit_results.append(result)

    # Return the fit that was actually scored instead of fitting the chosen
    # order a second time (the refit always passed `mon`, even for order 1).
    return fit_results[int(np.argmin(bic))]

In [5]:
def fit_fmin(self, X, y):
    """Estimate the lambda coefficients of every feature.

    For each column of ``X`` the two coefficients returned by
    ``yeo_johnson_optimize_fmin`` are stored; the per-column minimum and
    maximum of ``X`` are stored as well (``inverse_transform_fmin`` uses
    them to bound its output).

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data used to estimate the transformation coefficients.
    y : array-like, shape (n_samples, n_features)
        Values the lambdas depend on; column i of ``y`` is paired with
        column i of ``X``.

    Returns
    -------
    self : object
        The transformer passed in, with ``coeffs_``, ``mins_``, ``maxs_``
        (and ``_scaler`` when ``self.standardize`` is set) assigned.
    """

    X = X.copy()  # force copy so that fit does not change X inplace

    # Iterate over the columns actually present in X rather than over the
    # size of a notebook-level land mask.
    n_features = X.shape[1]
    self.coeffs_ = np.array([
        yeo_johnson_optimize_fmin(self, X[:, i_grid], y[:, i_grid])
        for i_grid in tqdm(range(n_features))
    ])

    self.mins_ = np.amin(X, axis=0)
    self.maxs_ = np.amax(X, axis=0)

    if self.standardize:
        # NOTE(review): the scaler is fitted on the untransformed X, while
        # transform_fmin applies it to the transformed data - confirm intent.
        self._scaler = StandardScaler(copy=True)
        self._scaler.fit(X)

    return self
    
def transform_fmin(self, X, y):
    """Power-transform ``X`` element by element with lambdas derived from ``y``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to be transformed.
    y : array-like, shape (n_samples, n_features)
        Values from which ``get_yeo_johnson_lambdas`` computes one lambda
        per element, using the fitted ``self.coeffs_``.

    Returns
    -------
    X_trans : array-like, shape (n_samples, n_features)
        The transformed data (passed through ``self._scaler`` when
        ``self.standardize`` is set).
    """
    lambdas = get_yeo_johnson_lambdas(self.coeffs_, y)

    X_trans = np.zeros_like(X)
    n_rows, n_cols = lambdas.shape
    for col in range(n_cols):
        with np.errstate(invalid='ignore'):  # hide NaN warnings
            for row in range(n_rows):
                X_trans[row, col] = self._yeo_johnson_transform(X[row, col], lambdas[row, col])

    if not self.standardize:
        return X_trans
    return self._scaler.transform(X_trans)

def inverse_transform_fmin(self, X, y):
    """Undo ``transform_fmin`` element by element.

    The inverse of the Yeo-Johnson transformation is given by::
        if X >= 0 and lambda_ == 0:
            X = exp(X_trans) - 1
        elif X >= 0 and lambda_ != 0:
            X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1
        elif X < 0 and lambda_ != 2:
            X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))
        elif X < 0 and lambda_ == 2:
            X = 1 - exp(-X_trans)

    After inversion every column is limited to the range
    [``self.mins_``, ``self.maxs_``] of that column.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The transformed data.
    y : array-like, shape (n_samples, n_features)
        Values from which the per-element lambdas are computed.

    Returns
    -------
    X : array-like, shape (n_samples, n_features)
        The data on the original scale.
    """
    if self.standardize:
        X = self._scaler.inverse_transform(X)

    X_inv = np.zeros_like(X)
    lambdas = get_yeo_johnson_lambdas(self.coeffs_, y)

    n_rows, n_cols = lambdas.shape
    for col in range(n_cols):
        with np.errstate(invalid='ignore'):  # hide NaN warnings
            for row in range(n_rows):
                X_inv[row, col] = self._yeo_johnson_inverse_transform(X[row, col], lambdas[row, col])

        # Limit the column to the range stored on the transformer
        lower = self.mins_[col]
        upper = self.maxs_[col]
        column = X_inv[:, col]
        column = np.where(column < lower, lower, column)
        X_inv[:, col] = np.where(column > upper, upper, column)

    return X_inv

def yeo_johnson_optimize_fmin(self, x, y):
    """Fit the two coefficients of the y-dependent Yeo-Johnson lambda by MLE.

    The lambda of sample i is ``2 / (1 + c0 * exp(c1 * y[i]))``. The
    coefficients ``(c0, c1)`` are found by minimising the negative
    log-likelihood of the transformed data with SLSQP, with ``c0`` bounded to
    [0, 1] and ``c1`` to [0, 0.1].

    Parameters
    ----------
    self : object providing ``_yeo_johnson_transform(value, lmbda)``
    x : 1-D numpy array
        Observed data of one feature.
    y : 1-D numpy array, same length as ``x``
        Values the lambda depends on.

    Returns
    -------
    numpy array of shape (2,) holding ``(c0, c1)``.
    """

    # NaNs distort the estimate. Drop a sample when either x or y is NaN so
    # that the two arrays stay aligned (masking them independently could
    # pair up the wrong samples or leave arrays of different length).
    valid = ~(np.isnan(x) | np.isnan(y))
    x = x[valid]
    y = y[valid]

    def _neg_log_likelihood(coeff):
        """Return the negative log likelihood of the observed data x as a
        function of the two lambda coefficients."""
        lambdas = 2 / (1 + coeff[0] * np.exp(y * coeff[1]))

        # Each sample has its own lambda, so transform one value at a time
        x_trans = np.zeros_like(x)
        for i, lmbda in enumerate(lambdas):
            x_trans[i] = self._yeo_johnson_transform(x[i], lmbda)

        n_samples = x.shape[0]

        loglike = -n_samples / 2 * np.log(x_trans.var())
        loglike += ((lambdas - 1) * np.sign(x) * np.log1p(np.abs(x))).sum()

        return -loglike

    bounds = [(0, 1), (0, 0.1)]
    first_guess = np.array([0.01, 0.001])

    # No `jac` argument: SLSQP then estimates the gradient of this objective
    # numerically. The previous `jac=rosen_der` supplied the gradient of the
    # Rosenbrock test function, which is unrelated to this likelihood.
    return minimize(_neg_log_likelihood, first_guess, bounds=bounds, method='SLSQP').x

def get_yeo_johnson_lambdas(coeffs,y):
    """Compute one lambda per element of ``y``.

    Column i of ``y`` is combined with the coefficient pair ``coeffs[i]`` as
    ``2 / (1 + coeffs[i][0] * exp(y[:, i] * coeffs[i][1]))``. Values below 0
    are replaced by 0 and values above 2 by 2.

    Parameters
    ----------
    coeffs : sequence of coefficient pairs, one pair per column of ``y``
    y : 2-D numpy array

    Returns
    -------
    numpy array with the shape of ``y``.
    """
    lambdas = np.zeros_like(y)
    for col, (coeff_pair, y_col) in enumerate(zip(coeffs, y.T)):
        lambdas[:, col] = 2 / (1 + coeff_pair[0] * np.exp(y_col * coeff_pair[1]))

    # Limit to the range [0, 2]
    below_range = lambdas < 0
    lambdas = np.where(below_range, 0, lambdas)
    above_range = lambdas > 2
    return np.where(above_range, 2, lambdas)

In [7]:
from sklearn.preprocessing import PowerTransformer

def power_fit(residue,y,fmin=True):
    """Fit a Yeo-Johnson ``PowerTransformer`` to ``residue``.

    ``residue`` is reshaped to 2-D with ``idx_l.sum()`` columns. With
    ``fmin=True`` the transformer is fitted by ``fit_fmin`` (which also takes
    ``y``); otherwise the transformer's own ``fit`` is used and ``y`` is
    ignored. Returns the fitted transformer.
    """
    transformer = PowerTransformer(method='yeo-johnson')
    samples = residue.reshape(-1, idx_l.sum())

    if not fmin:
        return transformer.fit(samples)
    return fit_fmin(transformer, samples, y)
    
def power_transform(mod, residue,y,fmin=True):
    """Apply a fitted power transformer ``mod`` to ``residue``.

    ``residue`` is reshaped to 2-D with ``idx_l.sum()`` columns. With
    ``fmin=True`` the data go through ``transform_fmin`` (which also takes
    ``y``); otherwise through ``mod.transform``. The result is returned as a
    2-D array with ``idx_l.sum()`` columns.
    """
    samples = residue.reshape(-1, idx_l.sum())

    if fmin:
        transformed = transform_fmin(mod, samples, y)
    else:
        transformed = mod.transform(samples)

    return transformed.reshape(-1, idx_l.sum())

def power_inv_transform(mod, residue,y,fmin=True):
    """Apply the inverse of a fitted power transformer ``mod`` to ``residue``.

    ``residue`` is reshaped to 2-D with ``idx_l.sum()`` columns. With
    ``fmin=True`` the data go through ``inverse_transform_fmin`` (which also
    takes ``y``); otherwise through ``mod.inverse_transform``. The result is
    returned as a 2-D array with ``idx_l.sum()`` columns.
    """
    samples = residue.reshape(-1, idx_l.sum())

    if fmin:
        restored = inverse_transform_fmin(mod, samples, y)
    else:
        restored = mod.inverse_transform(samples)

    return restored.reshape(-1, idx_l.sum())

def compute_llh_cv(res_tr,res_cv,phi):
    """ Sum of the log likelihood of held-out residuals under a zero-mean
    multivariate normal whose covariance is estimated from training residuals.

    Keyword arguments:
        - res_tr: residuals of the training part (nr ts x nr gp); the empirical covariance is computed from them. Nans must be removed before
        - res_cv: residuals of the held-out fold, evaluated under the fitted distribution
        - phi: localisation matrix, multiplied element-wise with the empirical covariance

    Output:
        - llh_innov_cv: log likelihood summed over the time slots of res_cv

    """
    # Empirical covariance between grid points (columns are the variables)
    empirical_cov = np.cov(res_tr, rowvar=False)
    localized_cov = phi * empirical_cov

    zero_mean = np.zeros(phi.shape[0])

    log_densities = multivariate_normal.logpdf(
        res_cv,
        mean=zero_mean,
        cov=localized_cov,
        allow_singular=True,
    )
    return np.sum(log_densities)

def leave_one_out(L_set,nr_folds,residue_trans,idx_fo_tot,phi):
    """Select the localisation radius with the highest cross-validated log likelihood.

    The candidates in ``L_set`` are evaluated in order. For each one the log
    likelihood of every fold in ``idx_fo_tot`` is computed with
    ``compute_llh_cv`` and summed over the folds. The search stops once the
    current candidate exceeds the best one so far by more than 250, once the
    last candidate has been selected, or when the candidates are exhausted.

    Parameters
    ----------
    L_set : sequence of candidate radii (assumed ascending - TODO confirm)
    nr_folds : unused; kept so existing calls keep working
    residue_trans : array indexed along its first axis by the fold masks
    idx_fo_tot : dict mapping 0 .. n_folds-1 to the boolean mask of each fold
    phi : mapping from a radius L to its localisation matrix

    Returns
    -------
    dict with keys 'llh_max' (best summed log likelihood), 'L_sel' (selected
    radius), 'all' (per-radius, per-fold values) and 'sum' (per-radius sums).
    """

    def folds_calc(idx_fo,residue_trans,phi,L):
        # Covariance from everything outside the fold, likelihood on the fold
        res_tot_est = residue_trans[~idx_fo]
        res_tot_fo=residue_trans[idx_fo]

        llh_cv=compute_llh_cv(res_tot_est,res_tot_fo,phi[L])

        return llh_cv

    idx_L=0
    L = L_set[idx_L]

    df_llh_cv={}
    # Start from -inf so that the first evaluated candidate is always taken,
    # however negative its summed log likelihood is.
    df_llh_cv['llh_max']=-np.inf
    df_llh_cv['all']={}
    df_llh_cv['sum']={}
    df_llh_cv['L_sel']=L_set[idx_L]

    # The index check prevents reading past the end of L_set when the best
    # candidate is within 250 of the last one.
    while (idx_L<len(L_set)) and (L-df_llh_cv['L_sel']<=250) and (df_llh_cv['L_sel']<L_set[-1]):
        L = L_set[idx_L]
        print('start with L ',L)
        df_llh_cv['all'][L]={}
        df_llh_cv['sum'][L]=0
        for i_fold_par in tqdm(np.arange(len(idx_fo_tot.keys()))):
            df_llh_cv['all'][L][i_fold_par]=folds_calc(idx_fo_tot[i_fold_par],residue_trans,phi,L)
            df_llh_cv['sum'][L] += df_llh_cv['all'][L][i_fold_par]

        if df_llh_cv['sum'][L]>df_llh_cv['llh_max']:
            df_llh_cv['L_sel']=L
            df_llh_cv['llh_max']=df_llh_cv['sum'][L]
            print('currently selected L=',df_llh_cv['L_sel'])

        idx_L+=1
    return df_llh_cv

def lin_func(x, a, b):
    """Evaluate the straight line with slope ``a`` and intercept ``b`` at ``x``."""
    slope_term = a * x
    return slope_term + b

# Load data and train harmonic model

## Load land mask and create land mask index (idx_l)

In [8]:
# Load the land-sea mask dataset as frac_l. dir_in_geo_dist is also the
# directory the later cells read the observation data from.
dir_in_geo_dist = '/home/tristan/mesmer/data/'
frac_l = xr.open_mfdataset(dir_in_geo_dist + 'interim_invariant_lsmask_regrid.nc', combine='by_coords',decode_times=False)

# Independent copy of the mask values taken before the latitude cut below
frac_l_raw = np.squeeze(copy.deepcopy(frac_l.lsm.values))  # land-sea mask of ERA-Interim, bilinearly interpolated

frac_l = frac_l.where(frac_l.lat>-60,0)  # remove Antarctica: everything at or south of -60 latitude is set to 0

# idx_l: boolean land index - every cell with a mask value > 0 counts as land.
# idx_l.sum() is used throughout the notebook as the number of land grid points.
idx_l=np.squeeze(frac_l.lsm.values)>0.0

# lon_pc / lat_pc are the coordinate arrays handed to pcolormesh in the map cells below
lon_pc, lat_pc = mpu.infer_interval_breaks(frac_l.lon, frac_l.lat)

In [9]:
##Start training for monthly downscaling
# Builds the monthly training targets and the annual-mean predictor from the
# observation file, fits the harmonic model to every land grid point in
# parallel and stores the list of fit results in 'seasonal_trend.pkl'.

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

print("calculating seasonal trends")

# prepare the inputs as array
BEST_data = 'obs_data_25.nc'
data_mask = 'interim_invariant_lsmask_regrid.nc'

df_obs = xr.open_mfdataset(dir_in_geo_dist+BEST_data).roll(lon=72) #open observation data

# length of the training period
nr_years = 112
tot_months = 12*nr_years
# index of the first month taken from the temperature record
# (1910 according to the original comment - TODO confirm against the file)
first_month = 720

# Pull the climatology out of the dataset once; doing this inside the loop
# re-read the array for each of the tot_months rows.
clim_land = df_obs.climatology.values[:, idx_l]

# climatology repeated over all years: row i holds the climatology of month i%12
y_ma = np.zeros((tot_months,idx_l.sum()))
y_ma[:] = clim_land[np.arange(tot_months) % 12]

print(y_ma.shape)

# temperature anomalies of the training period on land; the reshape doubles
# as a check that exactly tot_months time steps were selected
data_test = df_obs.temperature.values[first_month:first_month+tot_months,idx_l]
data_test = data_test.reshape(tot_months,idx_l.sum())

#monthly temperature values: anomalies added to the climatology
y_all_mon = np.add(y_ma, data_test)

# subtract the mean of the climatology over its first axis (broadcast over
# all rows) so we are left with the monthly value minus the climatological mean
y_all_mon = y_all_mon - np.mean(clim_land,axis=0)
print(y_all_mon.shape)

# annual averages per grid point; repeated 12 times below, they are the
# predictor x of the harmonic model
y_all = np.mean(y_all_mon.reshape(-1,12,idx_l.sum()),axis=1)

##Get directory to store outputs
dir_out_data_mod = '/home/tristan/mesmer/output/'

print('Output folder selected, training starting now, please be patient... \n Go outside, take a long walk, have some food, take a nap, enjoy your life for a few hours...')

if not os.path.exists(dir_out_data_mod):
    os.makedirs(dir_out_data_mod)
    print('created dir:',dir_out_data_mod)

##prepare training data
months=np.arange(1,13)
mon_train=np.tile(months,y_all.shape[0])

# one fit_to_bic call per land grid point, spread over 12 workers
seasonal_model_exec= Parallel(n_jobs=12)(delayed(fit_to_bic)(np.repeat(y_all[:,i],12,axis=0),
                                                                   mon_train, y_all_mon[:,i])
                                               for i in tqdm(np.arange(idx_l.sum())))

joblib.dump(seasonal_model_exec,dir_out_data_mod+'seasonal_trend.pkl')

print("finito!")

calculating seasonal trends
(1344, 3043)
(1344, 3043)
Output folder selected, training starting now, please be patient... 
 Go outside, take a long walk, have some food, take a nap, enjoy your life for a few hours...


  0%|          | 0/3043 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [10]:
from time import time

##Extract test and train results
# Rebuilds the monthly data, loads the stored harmonic fits, evaluates (or
# loads) the fitted values, derives the training residuals and maps the
# selected order per grid point.
seasonal_mod={}
order_chosen={}

model="BEST observations"
models=[model]

train_results_all={}
train_residue_all={}
y_all_mon={}
y_all={}

# length of the training period
nr_years = 112
tot_months = 12*nr_years

for model in models:

    ########## Rebuild the monthly data (same steps as the training cell, but NaNs are replaced by 0) ##########
    dir_in_data_mod = '/home/tristan/mesmer/data/'

    # prepare the inputs as array
    BEST_data = 'obs_data_25.nc'
    data_mask = 'interim_invariant_lsmask_regrid.nc'

    df_obs = xr.open_mfdataset(dir_in_geo_dist+BEST_data).roll(lon=72) #open observation data

    # Pull the climatology out of the dataset once instead of once per row
    clim_land = df_obs.climatology.values[:,idx_l]

    # climatology repeated over all years: row i holds the climatology of month i%12
    y_ma = np.zeros((tot_months,idx_l.sum()))
    y_ma[:] = clim_land[np.arange(tot_months) % 12]

    # temperature anomalies of the training period: tot_months (= 12 * 112)
    # consecutive months starting at index 720; NaNs are replaced by 0
    data_test = np.nan_to_num(np.array([df_obs.temperature.values[720:2064,idx_l]]))
    data_test = data_test.reshape(tot_months,idx_l.sum())

    print(data_test.shape)

    #monthly temperature values: anomalies added to the climatology
    y_all_mon[model] = np.add(y_ma, data_test)

    # subtract the mean of the climatology over its first axis (broadcast over all rows)
    y_all_mon[model] = y_all_mon[model] - np.mean(clim_land,axis=0)

    ################## UNTIL HERE #########################

    # annual averages per grid point (plain mean: NaNs were already replaced above)
    y_all[model] = np.mean(y_all_mon[model].reshape(-1,12,idx_l.sum()),axis=1)
    print(y_all[model].shape)

    ############ load the trained model and evaluate it ##############################

    nans = np.isnan(y_all_mon[model]).sum()
    print(nans)

    dir_in_data_mod = '/home/tristan/mesmer/output/'
    # NOTE(security): joblib.load unpickles the file; only load files you created yourself.
    seasonal_mod[model]=joblib.load(dir_in_data_mod+'seasonal_trend.pkl')

    if os.path.exists(os.path.join(dir_in_data_mod,'seasonal_training_results.pkl')):
        # fitted values were computed in an earlier run
        train_results_all[model]=joblib.load(dir_in_data_mod+'seasonal_training_results.pkl')

    else:
        train_results_all[model]=np.zeros_like(y_all_mon[model] )
        months=np.arange(1,13)
        mon_train=np.tile(months,y_all[model].shape[0] )

        for i in tqdm(np.arange(idx_l.sum())):
            x_train=np.repeat(y_all[model][:,i],12)
            train_results_all[model][:,i]=seasonal_mod[model][i].model(x=x_train, mon=mon_train,**seasonal_mod[model][i].params).z

        ##store training results
        joblib.dump(train_results_all[model],dir_in_data_mod+'seasonal_training_results.pkl')

    # Residuals are needed by the following cells in both cases, so compute
    # them here rather than only when the cached results exist.
    train_residue_all[model]=np.subtract(y_all_mon[model], train_results_all[model])
    train_residue_all[model]=train_residue_all[model].reshape(-1,12,idx_l.sum())

    ###plot order of harmonics chosen
    order_chosen[model]=np.zeros([idx_l.sum()])

    for i in np.arange(idx_l.sum()):
        fit_result=seasonal_mod[model][i]
        order_chosen[model][i]=int((len(fit_result.params)+2)/4)-1


    fig=plt.figure(figsize=(10,20))
    plt.rcParams.update({'font.size': 12})
    plt.rcParams.update({'mathtext.default':'it'})

    ax=fig.add_subplot(1,1,1,projection=ccrs.Robinson(central_longitude=0))

    # masked 2-D field: ocean cells stay masked, land cells get the order
    y_ma = np.zeros(idx_l.shape)
    y_ma = ma.masked_array(y_ma, mask=idx_l==False)
    # NOTE(review): order_chosen already had 1 subtracted above; confirm the
    # second "-1" here is intended.
    y_ma[idx_l]=order_chosen[model]-1

    print(y_ma.shape)

    mesh_1=ax.pcolormesh(lon_pc, lat_pc, y_ma,  cmap=plt.cm.get_cmap('inferno_r', 7),vmin=0,vmax=7,transform=ccrs.PlateCarree(),rasterized=True)
    ax.set_title('Fourier Series %s'%model,y=1.02,fontsize=14)
    ax.add_feature(cfeature.OCEAN)
    cbar=plt.colorbar(mesh_1,ax=[ax],orientation='horizontal',ticks=(np.arange(8)+0.5),shrink=0.8,pad=0.02,aspect=25)
    cbar.set_label('Order')
    cbar.ax.set_xticklabels(np.arange(8))

    ax.coastlines()

    plt.show()

KeyboardInterrupt: 

In [None]:
# # ## run a Shapiro Wilks test on the training residues to get the p-values for each grid point in each month

from scipy.stats import shapiro
# p_vals[i, g]: Shapiro-Wilk p-value of the training residuals of calendar
# month i at land grid point g (one test per month and grid point, each over
# the nr_years values of that month).
p_vals=np.zeros([12,idx_l.sum()])
for i in range(12):        
    # shapiro(...)[1] is the second element of the test result; the residuals
    # are viewed as (nr_years, 12, n_land) and sliced to one month / grid point.
    p_vals[i,:] = np.hstack(([shapiro(train_residue_all[model].reshape(nr_years,12,idx_l.sum())[:,i,i_grid])[1] for i_grid in np.arange(idx_l.sum())]))

# displayed as the cell output
p_vals.shape

In [None]:
from statsmodels.stats.multitest import multipletests

# One map per calendar month showing at which land grid points the test on
# p_vals is rejected after Benjamini-Hochberg FDR correction (alpha = 0.1).
months=['Jan','Feb','Mar','Apr','May','June','July','Aug','Sept','Oct','Nov','Dec']

white = np.array([248/256, 230/256, 200/256, 1])
blue = np.array([30/256, 70/256, 130/256, 1])
# NOTE(review): this name shadows `matplotlib.colors`, which the import cell
# binds to `colors`; kept because later cells may rely on either meaning.
colors = ['#efefdb', '#e55c30']

# two-colour map: first colour = not rejected (0), second = rejected (1)
mymap = mpl.colors.LinearSegmentedColormap.from_list('my_colormap', colors, N=2)
bounds = [ -1.5, 0.5, 1.5]
norm = mpl.colors.BoundaryNorm(bounds, mymap.N)
n_col = 3
n_row = 4

fs_title=16

fig=plt.figure(figsize=(n_col*14, n_row * 18))

plt.rcParams.update({'font.size': 12})
plt.rcParams.update({'mathtext.default':'it'})

# create a grid for the subplots
grid = plt.GridSpec(n_row*12+3, n_col*15+4, wspace=0.25, hspace=0)

for i in range(12):
    # i_y is the column of the panel within its row of three; it is reset at
    # the start of every row and incremented after each panel
    if i%3==0:
        i_y=0
        ax = plt.subplot(grid[i+1:i+3,i_y:i_y*10+10], projection=ccrs.Robinson(central_longitude=0))

    else:
        ax = plt.subplot(grid[i-(i%3)+1:i-(i%3)+3,i_y*10:i_y*10+10], projection=ccrs.Robinson(central_longitude=0))

    i_y+=1

    # masked 2-D field: ocean cells stay masked, land cells get the reject flag
    y_ma = np.zeros(idx_l.shape)
    y_ma = ma.masked_array(y_ma, mask=idx_l==False)
    reject = multipletests(p_vals[i,:],alpha=0.1,method='fdr_bh')[0]

    y_ma[idx_l] = reject
    # Only `norm` is passed: the BoundaryNorm already maps 0 and 1 to the two
    # colours, and newer Matplotlib versions refuse `norm` combined with
    # `vmin`/`vmax`.
    mesh=ax.pcolormesh(lon_pc, lat_pc, y_ma,  cmap=mymap,transform=ccrs.PlateCarree(), norm=norm, rasterized=True)
    ax.set_title('BEST Obs. %s : %.2f' %(months[i],(reject.sum()/idx_l.sum()*100)) +'%',y=1.02,fontsize=14)
    # helper axes below the row of maps; it becomes the current axes, so the
    # colorbar created after the loop attaches to the last one
    axcbar = plt.subplot(grid[i-(i%3)+3:i-(i%3)+4,5:25])
    ax.coastlines()
    ax.add_feature(cfeature.OCEAN)
    plt.axis('off')


cbar=plt.colorbar(mesh, orientation="horizontal", fraction=0.65,aspect=75, ticks = [0, 1])
cbar.ax.tick_params(labelsize=12)
cbar.set_ticklabels(['not rejected','rejected'])
cbar.set_label('p-value [-]',fontsize=14)

plt.tight_layout()


In [None]:
## Transform variables first
# Fits one power transformer per calendar month to the training residuals,
# transforms the residuals with it and stores both on disk.

power_trans={}
train_residue_trans={}
train_residue_all={}
test_residue_all={}

for model in models:

    print('Training power transformer for,', model)
    # residuals = monthly values minus the fitted harmonic model, viewed as (years, 12, land points)
    train_residue_all[model]=np.subtract(y_all_mon[model], train_results_all[model])
    train_residue_all[model]=train_residue_all[model].reshape(-1,12,idx_l.sum())

    # one transformer per calendar month, fitted one after the other
    power_trans[model]=[]
    for i_mon in tqdm(range(12)):
        power_trans[model].append(power_fit(train_residue_all[model][:,i_mon,:],y_all[model]))

    # apply the 12 transformers in parallel; the result is a list with one array per month
    train_residue_trans[model]=Parallel(n_jobs=12,verbose=10)(delayed(power_transform)(power_trans[model][i_mon],train_residue_all[model][:,i_mon,:],y_all[model]) for i_mon in range(12))

    # put the per-month arrays back into the (years, 12, land points) layout
    temp_residue_trans=np.zeros_like(train_residue_all[model])
    for i_mon in range(12):
        temp_residue_trans[:,i_mon,:]=train_residue_trans[model][i_mon]
    train_residue_trans[model]=temp_residue_trans

    dir_out_data_mod = '/home/tristan/mesmer/output/'

    # a single check is enough for both files written below
    if not os.path.exists(dir_out_data_mod):
        os.makedirs(dir_out_data_mod)
        print('created dir:',dir_out_data_mod)

    joblib.dump(train_residue_trans[model],dir_out_data_mod+'train_residue_trans.pkl')
    joblib.dump(power_trans[model],dir_out_data_mod+'yeo_johnson_pt_fmin_log.pkl')

In [None]:
# Re-creates the transformed training residuals from the files on disk
# (stored fitted values + stored power transformers).

# Same key as used when y_all / y_all_mon were filled; a different spelling
# makes the y_all[model] / y_all_mon[model] lookups below fail.
model = "BEST observations"
models=[model]

temp_residue_trans={}
train_residue_trans={}
# (Re)created here so the cell does not depend on the power-transformer
# training cell having been run in this session.
train_residue_all={}
power_trans={}
dir_out_data_mod = '/home/tristan/mesmer/output/'

for model in tqdm(models):
    # The stored file holds the fitted values of the harmonic model, not the
    # residuals: subtract them from the monthly data and view the result as
    # (years, 12, land points) so it can be indexed per calendar month.
    # NOTE(security): joblib.load unpickles the file; only load files you created yourself.
    train_results = joblib.load(dir_out_data_mod+'seasonal_training_results.pkl')
    train_residue_all[model]=np.subtract(y_all_mon[model], train_results).reshape(-1,12,idx_l.sum())

    temp_residue_trans=np.zeros_like(train_residue_all[model])

    power_trans[model] = joblib.load(dir_out_data_mod+'yeo_johnson_pt_fmin_log.pkl')

    # apply the 12 per-month transformers in parallel
    train_residue_trans[model]=Parallel(n_jobs=12,verbose=10)(delayed(power_transform)(power_trans[model][i_mon],train_residue_all[model][:,i_mon,:],y_all[model]) for i_mon in tqdm(range(12)))

    # put the per-month arrays back into the (years, 12, land points) layout
    for i_mon in range(12):
        temp_residue_trans[:,i_mon,:]=train_residue_trans[model][i_mon]

    train_residue_trans[model]=temp_residue_trans

    if not os.path.exists(dir_out_data_mod):
        os.makedirs(dir_out_data_mod)
        print('created dir:',dir_out_data_mod)

    joblib.dump(train_residue_trans[model],dir_out_data_mod+'train_residue_trans.pkl')