# Load data with pickle files

In [1]:
# Load the data
import pickle
import numpy as np
import matplotlib.pyplot as plt
import netCDF4 as netcdf

# Load the pickled dictionary; later cells index it as dic_ssp585[model][member].
# NOTE(review): pickle.load can execute arbitrary code while deserialising,
# so only open pickle files that come from a trusted source.
with open('ssp585_time_series.pkl', 'rb') as f:
    dic_ssp585 = pickle.load(f)

In [2]:
import os 

# Get the list of all files and directories
path = "/net/atmos/data/cmip6-ng/tos/ann/g025"
dir_list = os.listdir(path)

print("Files and directories in '", path, "' :")

list_model = []
list_forcing = []

for file in dir_list:

    file_split = file.split("_")

    # The model name and forcing are read from the 3rd and 4th "_"-separated
    # fields, and a 5th field (the run name) is expected as well. Entries that
    # do not follow this pattern are skipped instead of raising an IndexError.
    if len(file_split) < 5:
        continue

    # extract model and forcing names
    model_name = file_split[2]
    forcing = file_split[3]

    list_model.append(model_name)
    list_forcing.append(forcing)

# unique names, sorted so that the result is reproducible between runs
model_names = sorted(set(list_model))
forcing_names = sorted(set(list_forcing))

Files and directories in ' /net/atmos/data/cmip6-ng/tos/ann/g025 ' :


# Load the observations

In [3]:
import netCDF4 as netcdf

# define the file
file = '/net/h2o/climphys3/simondi/cope-analysis/data/erss/sst_annual_g050_mean_19812014_centered.nc'

# read the dataset; the context manager closes the file handle as soon as the
# arrays below have been copied into memory
with netcdf.Dataset(file,'r') as file2read:

    # load longitude, latitude and the sst field as plain numpy arrays
    lon = np.array(file2read.variables['lon'][:])
    lat = np.array(file2read.variables['lat'][:])
    sst = np.array(file2read.variables['sst'][:])

# define grid: 2D latitude / longitude arrays indexed as [lat, lon]
lat_grid, lon_grid = np.meshgrid(lat, lon, indexing='ij')

# Preprocessing of the data: $(x_{i,t,m}^{p})_{i=1,\ldots,I, t=1,\ldots,T,m=1,\ldots,M, p=1,\ldots,d}$
## $i$: ensemble member (run) index
## $t$: time index
## $m$: model index
## $p$: grid cell index

#### Keep the models with at least 3 ensemble members and downscale the data by a factor of 2 in latitude and longitude (72 x 144 -> 36 x 72 grid cells) with local averaging (to match with ensemble methods)

In [4]:
# Import the submodule explicitly: a bare `import skimage` does not expose
# `skimage.transform` on releases without lazy submodule loading.
import skimage.transform

# first filter out the models that do not contain enough ensemble members
# (only models with at least 3 members are kept)
dic_reduced_ssp585 = {}

for m in list(dic_ssp585.keys()):
    if len(dic_ssp585[m].keys()) > 2:
        dic_reduced_ssp585[m] = dic_ssp585[m].copy()
        for i in dic_ssp585[m].keys():
            # block-average with factors (1,2,2): the first axis is left
            # untouched and the two trailing axes are halved
            dic_reduced_ssp585[m][i] = skimage.transform.downscale_local_mean(dic_reduced_ssp585[m][i],(1,2,2))

### 1) Compute anomalies: $\displaystyle \overline{x}_{i,t,m}^p = x_{i,t,m}^p - \frac{1}{I\,(t_{\mathrm{ref}}^f - t_{\mathrm{ref}}^s)} \sum_{t= t_{\mathrm{ref}}^s}^{t_{\mathrm{ref}}^f} \sum_{i=1}^I x_{i,t,m}^p$

In [5]:
# Anomalies: for each model, remove the reference climatology, i.e. the time
# mean over indices 131..163 averaged over all ensemble members.
dic_processed_ssp585 = {}

import numpy as np

for m in dic_reduced_ssp585:
    processed_members = dic_reduced_ssp585[m].copy()
    dic_processed_ssp585[m] = processed_members
    n_members = len(processed_members)

    # contribution of each member to the ensemble-mean reference climatology
    member_contributions = [
        np.nanmean(processed_members[i][131:164,:,:],axis=0) / n_members
        for i in dic_reduced_ssp585[m]
    ]

    # accumulate in member order; stays at 0 when there is no member
    if member_contributions:
        mean_ref_ensemble = sum(member_contributions[1:], member_contributions[0])
    else:
        mean_ref_ensemble = 0

    # subtract the same reference map from every member
    for i in processed_members:
        processed_members[i] = processed_members[i] - mean_ref_ensemble


  mean_ref_ensemble = np.nanmean(dic_processed_ssp585[m][i][131:164,:,:],axis=0)/ len(dic_processed_ssp585[m])
  mean_ref_ensemble += np.nanmean(dic_processed_ssp585[m][i][131:164,:,:],axis=0)/ len(dic_processed_ssp585[m])


### 2) Compute the forced response: 
#### - Mean over space: $\displaystyle y_{i,t,m} = \frac{1}{P} \sum_{p=1}^P x_{i,t,m}^p$
#### - Mean over ensemble members: $\displaystyle \overline{y}_{t,m} = \frac{1}{I} \sum_{i=1}^I y_{i,t,m}$
#### - Assign the ensemble mean as the forced response of every ensemble member: $y_{i,t,m} := \overline{y}_{t,m}$
#### - Centering with respect to a given reference period: $\displaystyle y_{i,t,m} = y_{i,t,m} - \frac{1}{t_{\mathrm{ref}}^f - t_{\mathrm{ref}}^s} \sum_{t= t_{\mathrm{ref}}^s}^{t_{\mathrm{ref}}^f} \overline{y}_{t,m}$

In [6]:
# Forced response: spatial mean of each member, averaged over the ensemble,
# then centred on its mean over time indices 131..163.
dic_forced_response_ssp585 = dict({})

for m in dic_reduced_ssp585:
    forced_members = dic_reduced_ssp585[m].copy()
    dic_forced_response_ssp585[m] = forced_members
    n_members = len(forced_members)

    # per-member spatial mean (NaN cells ignored), pre-divided by the ensemble size
    member_contributions = [
        np.nanmean(forced_members[i],axis=(1, 2)) / n_members
        for i in forced_members
    ]

    # accumulate in member order; stays at 0 when there is no member
    if member_contributions:
        mean_spatial_ensemble = sum(member_contributions[1:], member_contributions[0])
    else:
        mean_spatial_ensemble = 0

    # every member receives its own copy of the centred ensemble-mean series
    for i in forced_members:
        reference_level = np.mean(mean_spatial_ensemble[131:164])
        forced_members[i] = mean_spatial_ensemble - reference_level

## Now we can use the data to run some simple regression models

In [7]:
# Dimensions of the regression problem: number of time steps in the
# reference slice (131:164) and size of the downscaled grid.
time_period = 33
grid_lat_size = 36
grid_lon_size = 72

# one forced-response series per model
y_forced_response = np.zeros((len(dic_forced_response_ssp585.keys()), time_period))
# one (time, lat, lon) predictor cube per model; the last axis is longitude
# (it was previously allocated with grid_lat_size by mistake)
x_predictor = np.zeros((len(dic_forced_response_ssp585.keys()), time_period, grid_lat_size, grid_lon_size))

In [8]:
# Restrict targets and predictors to the reference slice (indices 131..163),
# stored as nested dictionaries [model][member].
y_forced_response = {}
x_predictor = {}

for m in dic_processed_ssp585:
    y_forced_response[m] = {}
    x_predictor[m] = {}

    forced_m = dic_forced_response_ssp585[m]
    processed_m = dic_processed_ssp585[m]

    for i in forced_m:
        # target: forced-response series; predictor: anomaly maps
        y_forced_response[m][i] = forced_m[i][131:164]
        x_predictor[m][i] = processed_m[i][131:164,:,:]

In [9]:
# Spread between ensemble members: variance / standard deviation across the
# members of the spatial-mean anomaly, averaged over the 33 reference time steps.
vars_ssp585 = {}
std_ssp585 = {}
for m in dic_reduced_ssp585:
    members_m = dic_processed_ssp585[m]

    # rows: ensemble members, columns: time steps of the reference slice
    arr_tmp = np.array(
        [np.nanmean(members_m[i][131:164,:,:],axis=(1,2)) for i in members_m],
        dtype=np.float64,
    ).reshape(len(members_m), 33)

    vars_ssp585[m] = np.mean(np.var(arr_tmp,axis=0))
    std_ssp585[m] = np.mean(np.std(arr_tmp,axis=0))

# Define training set

In [10]:
import torch 

# Convert predictors and targets to float64 torch tensors, with NaNs replaced
# by zeros and each map flattened to a vector of grid cells.
x_train = {}
y_train = {}

for m in dic_reduced_ssp585:
    x_train[m] = {}
    y_train[m] = {}

    for i in dic_processed_ssp585[m]:

        predictor = x_predictor[m][i]
        n_time, n_lat, n_lon = predictor.shape

        flat_predictor = np.nan_to_num(predictor).reshape(n_time, n_lat*n_lon)
        x_train[m][i] = torch.from_numpy(flat_predictor).to(torch.float64)

        response = np.nan_to_num(y_forced_response[m][i])
        y_train[m][i] = torch.from_numpy(response).to(torch.float64)

        # flat indices of NaN / valid cells in the first time step; these are
        # overwritten on every iteration, so the last member processed wins
        first_step_is_nan = np.isnan(predictor[0,:,:].ravel())
        nans_idx = np.where(first_step_is_nan)[0]
        notnans_idx = np.where(~first_step_is_nan)[0]

In [48]:
# For every model, pick the regularisation value with the lowest mean RMSE.
# NOTE(review): `lambda_range` and `rmse` are not defined in the visible part
# of this notebook; they must come from cells that are not shown here.
best_lambda = {}
best_rmse = {}

for m in x_train:
    # mean of the RMSE values stored for each candidate lambda
    test_rmse = np.array(
        [
            np.array(list(rmse[m][idx_lambda].values())).mean()
            for idx_lambda in range(lambda_range.shape[0])
        ],
        dtype=np.float64,
    )

    # find minimum
    best_rmse[m] = np.min(test_rmse)
    best_lambda[m] = lambda_range[np.argmin(test_rmse)]

In [None]:
def plot_betas(x,y,vars,lon_size,lat_size,lambda_):
    """Fit one ridge regression per model and plot the coefficient maps.

    Parameters
    ----------
    x, y : dict
        Predictors and targets keyed by model name; `x[m]` and `y[m]` are
        passed unchanged to `train_single_ridge_regression`.
    vars : dict
        Per-model value passed to the training routine as `vars[m]`.
    lon_size, lat_size : int
        Grid dimensions; each coefficient vector has `lon_size*lat_size`
        entries and is reshaped to `(lat_size, lon_size)` for plotting.
    lambda_ : float
        Regularisation value forwarded to the training routine.

    Returns
    -------
    torch.Tensor
        Tensor of shape `(len(x), lon_size*lat_size)`, one row per model.

    Notes
    -----
    Relies on the notebook globals `train_single_ridge_regression`,
    `nans_idx`, `lon_grid`, `lat_grid` and `dic_processed_ssp585`.
    """
    model_keys = list(x.keys())
    n_models = len(model_keys)

    beta = torch.zeros(n_models,lon_size*lat_size)

    for idx_m,m in enumerate(model_keys):
        # beta[idx_m,:] =  ridge_estimator(x[m],y[m],vars[m],lambda_)
        beta[idx_m,:] =  train_single_ridge_regression(x[m],y[m],vars[m],lon_size,lat_size,lambda_,nbEpochs=100,verbose=True)

    # plot the beta map of each model; keep the 6x5 layout but add rows when
    # there are more than 30 models so that indexing the axes cannot overflow
    n_cols = 5
    n_rows = max(6, -(-n_models // n_cols))
    fig, axs = plt.subplots(n_rows,n_cols, figsize=(15,10), facecolor='w', edgecolor='k')
    fig.subplots_adjust(hspace = 2.0, wspace=1.0)

    axs = axs.ravel()

    im0 = None
    for idx_m, m in enumerate(model_keys):

        # work on a detached copy so the returned tensor is left untouched
        beta_tmp = beta[idx_m,:].detach().clone()
        # blank out the cells flagged as NaN in the predictors
        beta_tmp[nans_idx] = float('nan')
        beta_tmp = beta_tmp.numpy().reshape(lat_size,lon_size)

        axs[idx_m].set_title(m+ ' ('+ str(len(dic_processed_ssp585[m].keys())) +') ')
        im0 = axs[idx_m].pcolormesh(lon_grid,lat_grid,beta_tmp,vmin=-0.00,vmax = 0.01)

    # all panels share the same colour limits: one colorbar on the last panel
    if im0 is not None:
        plt.colorbar(im0, ax=axs[n_models-1], shrink=0.5)

    # remove the unused axes of the grid
    for idx_unused in range(n_models,axs.size):
        fig.delaxes(axs[idx_unused])

    fig.tight_layout()
    plt.show()

    return beta

In [None]:
# load betas


In [None]:
# plot_betas_descent(x_train,y_train,vars_ssp585,grid_lon_size,grid_lat_size,1.0,nbEpochs=5000,verbose=False)

# Analysis of the betas

### 1) PCA on the betas

In [None]:
# Low-rank PCA (6 components, no centring) of the coefficient matrix with
# pytorch, followed by the projection of each row of `beta` on the components.
pca_factors = torch.pca_lowrank(beta, q=6, center=False, niter=10)
U, S, V = pca_factors
proj_first_comp = beta @ V[:, :6]

In [None]:
# Scree plot of the squared singular values.
# NOTE(review): the divisor 25 is hard-coded; presumably it stands for the
# number of models minus one - confirm against beta.shape[0].
fig, ax_eig = plt.subplots()
ax_eig.plot(S**2/25)
ax_eig.set_ylim((0.0,0.002))
ax_eig.set_xlim((0.0,6))
ax_eig.set_title('Eigenvalues')
plt.show()

In [None]:
# Scatter the models in the plane spanned by two PCA components.
comp_x = 0
comp_y = 1

fig, ax = plt.subplots()
coords_x = proj_first_comp[:,comp_x]
coords_y = proj_first_comp[:,comp_y]
ax.scatter(coords_x,coords_y)

# label every point with its model name, slightly shifted from the marker;
# this assumes the rows of the projection follow the key order of dic_reduced_ssp585
for idx_m, m in enumerate(dic_reduced_ssp585):
    label_xy = (proj_first_comp[idx_m,comp_x]+.003, proj_first_comp[idx_m,comp_y]+.003)
    ax.annotate(m, label_xy)

plt.show()

In [None]:
# plot the spatial map of each of the 6 PCA components (columns of V)
fig, axs = plt.subplots(2,3, figsize=(15,10), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = 2.0, wspace=1.0)

axs = axs.ravel()

for k in range(6):

    panel = axs[k]

    # copy the component, blank the NaN-flagged cells and fold it back on the grid
    component_map = V[:,k].detach().clone()
    component_map[nans_idx] = float('nan')
    component_map = component_map.detach().numpy().reshape(grid_lat_size,grid_lon_size)

    panel.set_title('Component '+ str(k))
    im0 = panel.pcolormesh(lon_grid,lat_grid,component_map,vmin=-0.00,vmax = 0.1)

    plt.colorbar(im0, ax=panel, shrink=0.5)

fig.tight_layout()
plt.show()

### 2) Hierarchical clustering (within cluster variance based)

#### Ward based hierarchical clustering: minimize the total within cluster variance 

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, leaves_list

# Ward linkage on the rows of `beta` (one row per model), then a dendrogram
# labelled with the model names.
models = list(dic_reduced_ssp585)
beta_matrix = beta.detach().numpy()
Z1 = linkage(beta_matrix, 'ward')
leaves_tmp = leaves_list(Z1)

# model names in the leaf order of the linkage
labels_tmp = list(map(lambda leaf: models[int(leaf)], leaves_tmp))


fig = plt.figure(figsize=(25, 10))
dn = dendrogram(Z1,labels = models)

#### plot the model variance to check how it impacts the clustering

In [None]:
# collect the internal variance of each model in dendrogram-leaf order
# (no softmax is applied: the stored variances are plotted as they are)
M = len(list(dic_reduced_ssp585.keys()))
gamma = torch.zeros(M)
ordered_betas = [models[int(i)] for i in dn['leaves']]

for idx, m in enumerate(ordered_betas):
    gamma[idx] = vars_ssp585[m]

# plot the model variances
fig, ax = plt.subplots()
weights = list(gamma.detach().numpy())

# Use the leaf-ordered names directly as bar categories so that each bar is
# labelled with the model its height belongs to. The previous version drew the
# bars against the unordered names and then overwrote the tick labels.
ax.bar(ordered_betas, weights,label='Model variance')
ax.set_ylabel(r'Internal variances')
ax.set_title('cmip6 models')
ax.legend()
ax.tick_params(axis='x', labelrotation=-90)
plt.tight_layout()
plt.show()

### Plot the betas with respect to a given ordering

In [None]:
# plot the beta maps in the leaf-based ordering
leaf_order = list(dn['leaves'])
n_panels = len(leaf_order)

# keep the 6x5 layout, but add rows when there are more than 30 maps so that
# indexing the axes cannot overflow
n_cols = 5
n_rows = max(6, -(-n_panels // n_cols))
fig, axs = plt.subplots(n_rows,n_cols, figsize=(15,10), facecolor='w', edgecolor='k')
fig.subplots_adjust(hspace = 2.0, wspace=1.0)

axs = axs.ravel()

for idx,i in enumerate(leaf_order):

    m = models[int(i)]
    # copy the row of this model, blank the NaN-flagged cells and fold it back on the grid
    beta_tmp = beta[int(i),:].detach().clone()
    beta_tmp[nans_idx] = float('nan')
    beta_tmp = beta_tmp.detach().numpy().reshape(grid_lat_size,grid_lon_size)

    axs[idx].set_title(m)
    im0 = axs[idx].pcolormesh(lon_grid,lat_grid,beta_tmp,vmin=-0.00,vmax = 0.01)

# all panels share the same colour limits: one colorbar on the last panel
plt.colorbar(im0, ax=axs[idx], shrink=0.5)

# remove the unused axes of the grid
for i in range(n_panels,axs.size):
    fig.delaxes(axs[i])

fig.tight_layout()
plt.show()