In [3]:
## general
import numpy as np
import datetime
import copy
import cf_units
import xarray as xr
import os
import sys
from tqdm import tqdm_notebook as tqdm
import datetime as dt
import matplotlib as mpl
import math

## statistics
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy import stats
from scipy.stats import multivariate_normal # to compute likelihood
from sklearn.impute import SimpleImputer
from scipy.stats import shapiro  #check normalicy of seasonal trend distribution
from scipy.optimize import curve_fit, fmin, fminbound, minimize, rosen_der, least_squares
from sklearn.preprocessing import StandardScaler
import pickle

##import functions for fitting
from symfit import parameters, variables, Fit
from symfit import pi,sqrt,log,exp,sinh
from symfit import sin, cos


# statistics which aren't all that nice in python
import rpy2.robjects as robjects

## my stuff
sys.path.insert(1,'/home/tristan/mesmer/tools/')
from loading import load_data_single_mod
from processing import AR1_predict, compute_llh_cv,gaspari_cohn
from plotting import TaylorDiagram


## plotting
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import numpy.ma as ma
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import mplotutils as mpu

##for parallelisation
from sklearn.externals.joblib import Parallel, delayed
from sklearn.externals import joblib

ValueError: r_home is None. Try python -m rpy2.situation

In [None]:
def fit_fmin(self, X, y):
    """Fit temperature-dependent Yeo-Johnson coefficients for every feature.

    For each column of ``X`` a coefficient pair is estimated by
    ``yeo_johnson_optimize_fmin``; the pair parameterises the Yeo-Johnson
    lambda as ``2 / (1 + c0 * exp(c1 * y))`` (see that function). The
    per-column minima and maxima of ``X`` are stored as well, because
    ``inverse_transform_fmin`` uses them to clip back-transformed values.

    Parameters
    ----------
    self : transformer object
        Object that receives the fitted attributes. It must provide
        ``standardize`` and ``_yeo_johnson_transform`` (the callers in this
        notebook pass a ``PowerTransformer``).
    X : array-like, shape (n_samples, n_features)
        The data used to estimate the optimal transformation parameters.
    y : array-like, shape (n_samples, n_features)
        Covariate values (indexed column-wise exactly like ``X``) on which
        lambda depends.

    Returns
    -------
    self : object
        The same object with ``coeffs_``, ``mins_``, ``maxs_`` and, when
        ``self.standardize`` is true, ``_scaler`` set.
    """

    X = X.copy()  # force copy so that fit does not change X inplace

    # Take the number of features from the data itself rather than from the
    # notebook-global land mask, so the fit always covers exactly the columns
    # that X contains.
    n_features = X.shape[1]

    self.coeffs_ = np.array([
        yeo_johnson_optimize_fmin(self, X[:, i_grid], y[:, i_grid])
        for i_grid in tqdm(np.arange(n_features))
    ])

    # Observed range per feature; used later to clip inverse-transformed data.
    self.mins_ = np.amin(X, axis=0)
    self.maxs_ = np.amax(X, axis=0)

    if self.standardize:
        self._scaler = StandardScaler(copy=True)
        self._scaler.fit(X)

    return self
    
def transform_fmin(self, X, y):
    """Yeo-Johnson transform every entry of X with its own lambda.

    The lambdas are derived from the fitted ``self.coeffs_`` and the
    covariate ``y`` via ``get_yeo_johnson_lambdas``, so each (sample,
    feature) entry of ``X`` is transformed with an individual lambda.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The data to be transformed using a power transformation.
    y : array-like, shape (n_samples, n_features)
        Covariate values from which the lambdas are computed.

    Returns
    -------
    X_trans : array-like, shape (n_samples, n_features)
        The transformed data (additionally passed through the fitted scaler
        when ``self.standardize`` is true).
    """
    lambdas = get_yeo_johnson_lambdas(self.coeffs_, y)
    X_trans = np.zeros_like(X)

    with np.errstate(invalid='ignore'):  # hide NaN warnings
        for col, col_lambdas in enumerate(lambdas.T):
            for row, lam in enumerate(col_lambdas):
                X_trans[row, col] = self._yeo_johnson_transform(X[row, col], lam)

    if not self.standardize:
        return X_trans
    return self._scaler.transform(X_trans)

def inverse_transform_fmin(self, X, y):
    """Undo the entry-wise Yeo-Johnson transform applied by ``transform_fmin``.

    The inverse of the Yeo-Johnson transformation is given by::

        if X >= 0 and lambda_ == 0:
            X = exp(X_trans) - 1
        elif X >= 0 and lambda_ != 0:
            X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1
        elif X < 0 and lambda_ != 2:
            X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))
        elif X < 0 and lambda_ == 2:
            X = 1 - exp(-X_trans)

    After the back-transformation each feature column is limited to the
    range ``[self.mins_[i], self.maxs_[i]]`` stored on ``self``.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The transformed data.
    y : array-like, shape (n_samples, n_features)
        Covariate values from which the lambdas are computed.

    Returns
    -------
    X_inv : array-like, shape (n_samples, n_features)
        The data on the original scale.
    """
    if self.standardize:
        X = self._scaler.inverse_transform(X)

    X_inv = np.zeros_like(X)
    lambdas = get_yeo_johnson_lambdas(self.coeffs_, y)

    for col, col_lambdas in enumerate(lambdas.T):
        with np.errstate(invalid='ignore'):  # hide NaN warnings
            for row, lam in enumerate(col_lambdas):
                X_inv[row, col] = self._yeo_johnson_inverse_transform(X[row, col], lam)

        # limit the column to the stored range: lower bound first, then upper
        lower = self.mins_[col]
        upper = self.maxs_[col]
        X_inv[:, col] = np.where(X_inv[:, col] < lower, lower, X_inv[:, col])
        X_inv[:, col] = np.where(X_inv[:, col] > upper, upper, X_inv[:, col])

    return X_inv

def yeo_johnson_optimize_fmin(self, x, y):
    """Estimate the coefficients of a covariate-dependent Yeo-Johnson lambda.

    The lambda used for sample ``i`` is ``2 / (1 + c0 * exp(c1 * y[i]))``.
    The coefficient pair ``(c0, c1)`` is found by maximum likelihood with a
    bounded SLSQP minimisation of the negative log likelihood.

    Parameters
    ----------
    self : object
        Must provide ``_yeo_johnson_transform(x, lmbda)``.
    x : 1-D array
        Observed data for one feature.
    y : 1-D array
        Covariate values paired element-wise with ``x``.

    Returns
    -------
    ndarray of shape (2,)
        The optimised coefficients ``[c0, c1]``.
    """

    def _neg_log_likelihood(coeff):
        """Return the negative log likelihood of the observed data x as a
        function of the two lambda coefficients."""
        lambdas = 2 / (1 + coeff[0] * np.exp(y * coeff[1]))

        x_trans = np.zeros_like(x)
        for i, lmbda in enumerate(lambdas):
            x_trans[i] = self._yeo_johnson_transform(x[i], lmbda)

        n_samples = x.shape[0]

        loglike = -n_samples / 2 * np.log(x_trans.var())
        loglike += ((lambdas - 1) * np.sign(x) * np.log1p(np.abs(x))).sum()

        return -loglike

    # The likelihood is influenced by NaNs, so drop them. A sample is dropped
    # from BOTH arrays when either value is missing; filtering x and y
    # independently would break the element-wise pairing (or leave the two
    # arrays with different lengths).
    valid = ~(np.isnan(x) | np.isnan(y))
    x = x[valid]
    y = y[valid]

    # one (min, max) row per coefficient: c0 in [0, 1], c1 in [0, 0.1]
    bounds = np.c_[[0, 0], [1, 0.1]]

    # No `jac` is passed: the gradient of the likelihood is approximated
    # numerically by the optimiser. (Supplying `rosen_der`, the gradient of
    # the unrelated Rosenbrock test function, would steer the search with a
    # wrong gradient.)
    return minimize(
        _neg_log_likelihood,
        np.array([0.01, 0.001]),
        bounds=bounds,
        method='SLSQP',
    ).x

def get_yeo_johnson_lambdas(coeffs,y):
    """Evaluate the Yeo-Johnson lambda for every entry of ``y``.

    Column ``i`` of the result is ``2 / (1 + c0 * exp(y[:, i] * c1))`` with
    ``(c0, c1) = coeffs[i]``. Values below 0 are set to 0 and values above 2
    are set to 2.

    Parameters
    ----------
    coeffs : sequence of coefficient pairs, one per column of ``y``
    y : 2-D array, shape (n_samples, n_features)

    Returns
    -------
    lambdas : array with the shape of ``y``
    """
    lambdas = np.zeros_like(y)

    for col, (coeff_pair, y_col) in enumerate(zip(coeffs, y.T)):
        scale = coeff_pair[0]
        rate = coeff_pair[1]
        lambdas[:, col] = 2 / (1 + scale * np.exp(y_col * rate))

    # restrict to the interval [0, 2]
    lambdas = np.where(lambdas < 0, 0, lambdas)
    return np.where(lambdas > 2, 2, lambdas)

In [2]:
from sklearn.preprocessing import PowerTransformer

def power_fit(residue,y,fmin=True):
    """Fit a Yeo-Johnson power transformer to the residuals.

    ``residue`` is reshaped to ``(-1, idx_l.sum())`` (``idx_l`` is the
    notebook-global land mask) before fitting.

    Parameters
    ----------
    residue : array
        Residuals to fit the transformer on.
    y : array
        Covariate handed to ``fit_fmin``; ignored when ``fmin`` is false.
    fmin : bool, default True
        If true, fit with ``fit_fmin``; otherwise use the standard
        ``PowerTransformer.fit``.

    Returns
    -------
    The fitted transformer.
    """
    residue_2d = residue.reshape(-1, idx_l.sum())
    transformer = PowerTransformer(method='yeo-johnson')

    if not fmin:
        return transformer.fit(residue_2d)
    return fit_fmin(transformer, residue_2d, y)
    
def power_transform(mod, residue,y,fmin=True):
    """Apply a fitted power transformer to the residuals.

    Parameters
    ----------
    mod : fitted transformer (as returned by ``power_fit``)
    residue : array
        Residuals; reshaped to ``(-1, idx_l.sum())`` before transforming.
    y : array
        Covariate handed to ``transform_fmin``; ignored when ``fmin`` is false.
    fmin : bool, default True
        If true, use ``transform_fmin``; otherwise ``mod.transform``.

    Returns
    -------
    Transformed residuals with shape ``(-1, idx_l.sum())``.
    """
    n_land = idx_l.sum()
    residue_2d = residue.reshape(-1, n_land)

    transformed = transform_fmin(mod, residue_2d, y) if fmin else mod.transform(residue_2d)

    return transformed.reshape(-1, n_land)

def power_inv_transform(mod, residue,y,fmin=True):
    """Apply the inverse of a fitted power transformer to the residuals.

    Parameters
    ----------
    mod : fitted transformer (as returned by ``power_fit``)
    residue : array
        Transformed residuals; reshaped to ``(-1, idx_l.sum())`` first.
    y : array
        Covariate handed to ``inverse_transform_fmin``; ignored when ``fmin``
        is false.
    fmin : bool, default True
        If true, use ``inverse_transform_fmin``; otherwise
        ``mod.inverse_transform``.

    Returns
    -------
    Back-transformed residuals with shape ``(-1, idx_l.sum())``.
    """
    n_land = idx_l.sum()
    residue_2d = residue.reshape(-1, n_land)

    if fmin:
        restored = inverse_transform_fmin(mod, residue_2d, y)
    else:
        restored = mod.inverse_transform(residue_2d)

    return restored.reshape(-1, n_land)

def compute_llh_cv(res_tr,res_cv,phi):
    """Sum of log likelihoods of held-out residuals under a localised covariance.

    The empirical covariance of ``res_tr`` is multiplied element-wise with the
    localisation matrix ``phi``; the held-out residuals ``res_cv`` are then
    scored with a zero-mean multivariate normal using that covariance
    (singular covariance matrices are allowed).

    Note: this definition replaces the ``compute_llh_cv`` imported from
    ``processing`` at the top of the notebook.

    Keyword arguments:
        - res_tr: residuals of the training data without the held-out fold
          (nr ts x nr gp). NaNs must be removed beforehand
        - res_cv: residuals of the held-out fold
        - phi: localisation matrix (nr gp x nr gp)

    Output:
        - sum of the log likelihood over the held-out time slots
    """
    # localised covariance estimated from the training residuals
    localised_cov = phi * np.cov(res_tr, rowvar=False)

    # the residuals are scored against a mean of zero
    zero_mean = np.zeros(phi.shape[0])

    log_densities = multivariate_normal.logpdf(
        res_cv, mean=zero_mean, cov=localised_cov, allow_singular=True
    )
    return np.sum(log_densities)

def leave_one_out(L_set,nr_folds,residue_trans,idx_fo_tot,phi):
    """Select the localisation radius with the highest cross-validated log likelihood.

    The radii in ``L_set`` are tried in order. For each radius the log
    likelihood of every held-out fold is computed with ``compute_llh_cv`` and
    summed over the folds. The search stops when the last radius has been
    selected, when the current radius is more than 250 above the selected
    one, or when all radii have been evaluated.

    Parameters
    ----------
    L_set : sequence of localisation radii, in the order they are tried
    nr_folds : unused; the number of folds is taken from ``idx_fo_tot``
    residue_trans : array (nr ts x nr gp) of transformed residuals
    idx_fo_tot : mapping fold number (0..n-1) -> boolean index of the time
        steps held out in that fold
    phi : mapping radius -> localisation matrix for that radius

    Returns
    -------
    df_llh_cv : dict with keys
        - 'llh_max': highest summed log likelihood found
        - 'all': {L: {fold: log likelihood}}
        - 'sum': {L: log likelihood summed over the folds}
        - 'L_sel': the selected radius
    """

    def folds_calc(idx_fo,residue_trans,phi,L):
        """Log likelihood of one held-out fold for radius L."""
        res_tot_est = residue_trans[~idx_fo]
        res_tot_fo = residue_trans[idx_fo]

        return compute_llh_cv(res_tot_est, res_tot_fo, phi[L])

    idx_L = 0
    L = L_set[idx_L]

    df_llh_cv = {}
    # Start from -inf so that the first evaluated radius always becomes the
    # baseline; a fixed finite start value would block every selection when
    # all summed log likelihoods lie below it.
    df_llh_cv['llh_max'] = -np.inf
    df_llh_cv['all'] = {}
    df_llh_cv['sum'] = {}
    df_llh_cv['L_sel'] = L_set[idx_L]

    # Based on experience: once larger radii stop being selected they will not
    # be selected again, so stop early (this also avoids singular matrices).
    # The `idx_L < len(L_set)` guard ends the search once every radius has been
    # tried; without it the loop indexes past the end of L_set whenever the
    # best radius is not the last one and the 250 gap is never exceeded.
    while (idx_L < len(L_set)) and (L-df_llh_cv['L_sel']<=250) and (df_llh_cv['L_sel']<L_set[-1]):
        L = L_set[idx_L]
        print('start with L ',L)
        df_llh_cv['all'][L] = {}
        df_llh_cv['sum'][L] = 0
        for i_fold_par in tqdm(np.arange(len(idx_fo_tot.keys()))):
            df_llh_cv['all'][L][i_fold_par] = folds_calc(idx_fo_tot[i_fold_par],residue_trans,phi,L)
            df_llh_cv['sum'][L] += df_llh_cv['all'][L][i_fold_par]

        if df_llh_cv['sum'][L] > df_llh_cv['llh_max']:
            df_llh_cv['L_sel'] = L
            df_llh_cv['llh_max'] = df_llh_cv['sum'][L]
            print('currently selected L=',df_llh_cv['L_sel'])

        idx_L += 1
    return df_llh_cv

def lin_func(x, a, b):
    """Straight line: slope ``a`` times ``x`` plus intercept ``b``."""
    scaled = a * x
    return scaled + b

In [3]:
# Load the land-sea mask and derive the boolean land index `idx_l` that the
# functions and cells in this notebook use to select land grid points.
dir_in_geo_dist = '/home/tristan/mesmer/data/'
frac_l = xr.open_mfdataset(dir_in_geo_dist + 'interim_invariant_lsmask_regrid.nc', combine='by_coords',decode_times=False)

# copy of the mask values taken BEFORE Antarctica is zeroed out below
frac_l_raw = np.squeeze(copy.deepcopy(frac_l.lsm.values))  # land-sea mask of ERA-Interim, bilinearly interpolated

frac_l = frac_l.where(frac_l.lat>-60,0)  # remove Antarctica: every value at lat <= -60 is set to 0

idx_l=np.squeeze(frac_l.lsm.values)>0.0 # idx_l = boolean land index: every cell with a value > 0 is considered land

lon_pc, lat_pc = mpu.infer_interval_breaks(frac_l.lon, frac_l.lat)  # NOTE(review): presumably cell edges for plotting; lon_pc/lat_pc are not used in the cells shown here - confirm whether this is needed

NameError: name 'xr' is not defined

In [4]:
dir_in_data_mod = '/home/tristan/mesmer/data/'
nr_yrs = 112
nr_months = 12
tot_months = nr_yrs*nr_months

# prepare the inputs as array
BEST_data = 'obs_data_25.nc'
data_mask = 'interim_invariant_lsmask_regrid.nc'

df_obs = xr.open_mfdataset(dir_in_geo_dist+BEST_data).roll(lon=72) #open observation data

n_land = idx_l.sum()

# Pull the land-only monthly climatology (12 x n_land) into memory ONCE.
# Accessing `.values` inside a per-month loop would re-materialise the whole
# climatology field on every iteration.
clim_land = df_obs.climatology.values[:, idx_l]

# climatology repeated for every year: row i holds the climatology of month i % 12
y_ma = np.tile(clim_land, (nr_yrs, 1)).astype(np.float64)

# Temperature anomalies for the analysis window of tot_months (= 1344 months,
# i.e. 112 years), starting at time index 720; missing values become 0.
# NOTE(review): if the record starts in January 1850, index 720 is January 1910
# and the window ends in December 2021 - confirm against the file's time axis.
first_month = 720
data_test = np.nan_to_num(df_obs.temperature.values[first_month:first_month + tot_months, idx_l])
data_test = data_test.reshape(tot_months, n_land)

print(data_test.shape)

# monthly temperatures = climatology + anomalies
y_all_mon = np.add(y_ma, data_test)

# subtract the annual-mean climatology of each grid point (broadcast over all months)
y_all_mon = y_all_mon - np.mean(clim_land, axis=0)

# annual means per grid point (plain mean: NaNs were already replaced by 0 above)
y_all = np.mean(y_all_mon.reshape(-1, nr_months, n_land), axis=1)
print(y_all.shape)

joblib.dump(y_all, dir_in_data_mod+'y_all.pkl')
joblib.dump(y_all_mon, dir_in_data_mod+'y_all_mon.pkl')

NameError: name 'xr' is not defined

In [None]:
# Create nr_emus emulations of the local variability: run the monthly AR(1)
# process in transformed space, drop the spin-up years, back-transform each
# month and store the result.
emu_res={}
nr_emus=500
buffer=10   # number of spin-up years discarded from the start of every emulation
nr_ts=112
dir_in_data_mod = '/home/tristan/mesmer/output/'
dir_out_data_mod = '/home/tristan/mesmer/output/'

##load calibration parameters for local variability module
df_llh_cv_all= joblib.load(dir_in_data_mod+'llh_cv_all.pkl')
coeffs_temp = joblib.load(dir_in_data_mod+'AR(1)_coeffs.pkl')
coeff_0 = coeffs_temp[0,:,:]
coeff_1 = coeffs_temp[1,:,:]
power_trans=joblib.load(dir_in_data_mod+'yeo_johnson_pt_fmin_log.pkl')
train_residue_trans=joblib.load(dir_in_data_mod+'train_residue_trans.pkl')
innov_emu = joblib.load(dir_in_data_mod+'innov_emu.pkl')

start = dt.datetime.now()

for k in np.arange(nr_emus):
    emu_res[k]=np.zeros([nr_ts+buffer,12,idx_l.sum()])
    for t in np.arange(1,emu_res[k].shape[0]):
        for i_mon in range(12):

            if i_mon==0:
                # January follows December of the previous year
                emu_res[k][t,i_mon,:]=coeff_0[i_mon,:]+coeff_1[i_mon,:]*emu_res[k][t-1,11,:]+innov_emu[i_mon][k,t]

            else:
                # every other month follows the previous month of the same year
                emu_res[k][t,i_mon,:]=coeff_0[i_mon,:]+coeff_1[i_mon,:]*emu_res[k][t,i_mon-1,:]+innov_emu[i_mon][k,t]

    # discard the spin-up years
    emu_res[k]=emu_res[k][buffer:,:,:]

    for i_mon in range(12):

        emu_res[k][:,i_mon,:]=power_inv_transform(power_trans[i_mon],emu_res[k][:,i_mon,:],y_all)

joblib.dump(emu_res,dir_out_data_mod+'%i_emulator_innovations_fmin_log.pkl'%(nr_emus))
time_taken = dt.datetime.now() - start

# one placeholder per value, and the call is closed properly
print('time taken to create %i emulations: %s'%(nr_emus, time_taken))

In [None]:
def create_emulations(y_all, innov_emu, power_trans):
    """Create one emulation of the local variability on the original scale.

    A monthly AR(1) process is run in transformed space for
    ``nr_ts + buffer`` years, the first ``buffer`` years are discarded, and
    every month is back-transformed with its own power transformer.

    Relies on the notebook-level names ``nr_ts``, ``buffer``, ``coeff_0``,
    ``coeff_1`` and ``idx_l``.

    Parameters
    ----------
    y_all : array
        Covariate handed to ``power_inv_transform``.
    innov_emu : indexable by month (0..11)
        Innovations of ONE emulation: ``innov_emu[i_mon][t]`` must be the
        innovation of all grid points for month ``i_mon`` of year ``t``.
    power_trans : indexable by month (0..11)
        Fitted power transformer of each month.

    Returns
    -------
    emu_res : array, shape (nr_ts, 12, idx_l.sum())
    """
    n_steps = nr_ts + buffer
    emu_res = np.zeros([n_steps, 12, idx_l.sum()])

    for t in np.arange(1, n_steps):
        for i_mon in range(12):
            if i_mon == 0:
                # January follows December of the previous year
                previous = emu_res[t-1, 11, :]
            else:
                # every other month follows the previous month of the same year
                previous = emu_res[t, i_mon-1, :]

            emu_res[t, i_mon, :] = coeff_0[i_mon, :] + coeff_1[i_mon, :]*previous + innov_emu[i_mon][t]

    # Drop the spin-up years only AFTER the whole series has been generated.
    emu_res = emu_res[buffer:, :, :]

    for i_mon in range(12):
        emu_res[:, i_mon, :] = power_inv_transform(power_trans[i_mon], emu_res[:, i_mon, :], y_all)

    return emu_res

In [None]:
# Create nr_emus emulations by calling create_emulations once per emulation.
emu_res={}
nr_emus=500
buffer=10
nr_ts=112
dir_in_data_mod = '/home/tristan/mesmer/output/'
dir_out_data_mod = '/home/tristan/mesmer/output/'

##load calibration parameters for local variability module
df_llh_cv_all= joblib.load(dir_in_data_mod+'llh_cv_all.pkl')
coeffs_temp = joblib.load(dir_in_data_mod+'AR(1)_coeffs.pkl')
coeff_0 = coeffs_temp[0,:,:]
coeff_1 = coeffs_temp[1,:,:]
power_trans=joblib.load(dir_in_data_mod+'yeo_johnson_pt_fmin_log.pkl')
train_residue_trans=joblib.load(dir_in_data_mod+'train_residue_trans.pkl')
innov_emu = joblib.load(dir_in_data_mod+'innov_emu.pkl')

start = dt.datetime.now()

for k in np.arange(nr_emus):
    # create_emulations indexes its innovations as innov_emu[i_mon][t], so it
    # must receive the innovations of emulation k only.
    innov_emu_k = [innov_emu[i_mon][k] for i_mon in range(12)]
    emu_res[k] = create_emulations(y_all, innov_emu_k, power_trans)

# measure the elapsed time in this cell instead of relying on a value left
# over from another cell
time_taken = dt.datetime.now() - start

print('time taken to create %i emulations: %s'%(nr_emus, time_taken))