In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
from typing import Dict, Optional, List, Callable, Tuple, Union

import wandb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error

### Train and test dataloading

In [2]:
input_dir = '/home/mila/v/venkatesh.ramesh/scratch/causal_data/inputs/input4mips'
target_dir = '/home/mila/v/venkatesh.ramesh/scratch/causal_data/targets/CMIP6'

models = ['NorESM2-LM']
fire_type = 'all-fires'
variables = ['pr']
train_experiments = [ "ssp585", "ssp126", "ssp370"] 
test_experiments = ["ssp245"]
input_gases = ['BC_sum', 'CH4_sum', 'CO2_sum', 'SO2_sum']
total_ensembles = 1 #-1 for all

In [3]:
# Check which variable to load and prepare the data for it.
# Can reuse the data-prep code from the dataloader.

In [4]:
def load_data(mode: str = 'train') -> tuple[np.ndarray, np.ndarray]:
    
    X = get_input_data(input_dir, mode)
    y = get_output_data(target_dir, mode)
    return X, y


def load_data_npz(path: str): #If np data already exists
    X_train, y_train = np.load(os.path.join(base_dir, ''))
    X_test, y_test = np.load(os.path.join(base_dir, ''))
    return X_train, y_train, X_test, y_test


def get_input_data(path: str, mode: str):
    BC = []
    CH4 = []
    CO2 = []
    SO2 = []
    
    if mode == 'train':      
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
        
    for exp in experiments:
        for gas in input_gases:
            var_dir = os.path.join(path, exp, gas, 'map_250_km/mon')
            files = glob.glob(var_dir + '/**/*.nc', recursive=True)

            for f in files:
                if gas == 'BC_sum' and fire_type in f:
                    BC.append(f)
            for f in files:
                if gas == 'CH4_sum' and fire_type in f:
                    CH4.append(f)
            for f in files:
                if gas == 'BC_sum' and fire_type in f:
                    SO2.append(f)
            for f in files:
                if gas == 'CO2_sum':
                    CO2.append(f)

    BC_data = xr.open_mfdataset(BC, concat_dim='time', combine='nested').compute().to_array().to_numpy()
    CH4_data = xr.open_mfdataset(CH4, concat_dim='time', combine='nested').compute().to_array().to_numpy()
    CO2_data = xr.open_mfdataset(CO2, concat_dim='time', combine='nested').compute().to_array().to_numpy()
    SO2_data = xr.open_mfdataset(SO2, concat_dim='time', combine='nested').compute().to_array().to_numpy()

    merged_data = np.concatenate((BC_data, CH4_data, CO2_data, SO2_data), axis=0)
    return merged_data


def get_output_data(path: str, mode: str):
    nc_files = []
    
    if mode == 'train':
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
        
    for mod in models:

        model_dir = os.path.join(path, mod)
        ensembles = os.listdir(model_dir)

        if total_ensembles == 1:
            ensembles = ensembles[0]
        
        exp_counter = 0
        for exp in experiments:
            for var in variables:
                var_dir = os.path.join(path, mod, ensembles, exp, var, '250_km/mon')
                files = glob.glob(var_dir + '/**/*.nc', recursive=True)
                nc_files += files
        
            if exp_counter == 0:
                dataset = xr.open_mfdataset(nc_files).compute().to_array().to_numpy()
        
            else: #concatenate dataset in time dimension
                other_experiment = xr.open_mfdataset(nc_files).compute().to_array().to_numpy()
                dataset = np.concatenate((dataset, other_experiment), axis=1)
                
                
            exp_counter += 1
    
    return dataset

In [5]:
X_train, y_train = load_data('train')

In [6]:
X_test, y_test = load_data('test')

In [7]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(4, 3096, 96, 144) (1, 3096, 96, 144) (4, 1032, 96, 144) (1, 1032, 96, 144)


### Calculating statistics

In [8]:
X_stat = X_train.copy()
print(X_stat.shape)

vars_mean = np.mean(X_stat, axis=(1, 2, 3))
vars_std = np.std(X_stat, axis=(1, 2, 3))

(4, 3096, 96, 144)


In [14]:
np.min(vars_std) + 1e-40

1.915785021941452e-12

In [12]:
# vars_mean = np.expand_dims(vars_mean, (1, 2, 3))
# vars_std = np.expand_dims(vars_std, (1, 2, 3))

print(vars_mean.shape, vars_std.shape)

print(vars_mean, vars_std)

input_stats = np.concatenate((np.expand_dims(vars_mean, (-1, 1, 2, 3)), np.expand_dims(vars_std, (-1, 1, 2, 3))), axis=-1)

input_stats

x = np.array([3.1055716e-13, 1.9952876e-11, 2.7081224e-09, 3.1055716e-13])

x = np.expand_dims(x, (1, 2, 3))
print(x.shape)

X_norm = (X_stat - vars_mean)/ vars_std

print(X_norm.shape)

y_train.shape

out_mean = np.mean(y_train, axis=(1, 2, 3))
out_std = np.mean(y_train, axis=(1, 2, 3))

print(out_mean, out_std)

y_norm = (y_train - out_mean)/(out_std)

# (v - v.min()) / (v.max() - v.min())

z = np.zeros((258, 12, 4, 96, 144))

z = np.moveaxis(z, 2, 0)
print(z.shape)

z_max = np.min(z, (1, 2, 3, 4))

print(z_max.shape)

### RF Parameters & HyperParameters (same as climatebench)

In [259]:
#**parameters & hyperparameters

RSCV= True
path_output='output_path/output.nc'

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 5)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5,55, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 25]
# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 8, 12]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

### Data split

In [285]:
reg0 = RandomForestRegressor(random_state=0)
rf_random0 = RandomizedSearchCV(estimator = reg0, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, n_jobs = -1)

In [281]:
X_test = np.moveaxis(X_test, 0, 1)
y_test = np.moveaxis(y_test, 0, 1)

print(X_test.shape, y_test.shape)

(1032, 4, 96, 144) (1032, 1, 96, 144)


In [282]:
X_test = X_test.reshape(X_test.shape[0], -1)
y_test = y_test.reshape(y_test.shape[0], -1)

In [283]:
print(X_test.shape, y_test.shape)

(1032, 55296) (1032, 13824)


In [286]:
rf_pr = rf_random0.fit(X_test, y_test)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=  17.7s
[CV] END bootstrap=False, max_depth=35, max_features=sqrt, min_samples_leaf=8, min_samples_split=25, n_estimators=150; total time=  11.9s
[CV] END bootstrap=False, max_depth=15, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=150; total time= 4.0min
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=200; total time= 5.4min


In [288]:
print(rf_pr.best_params_)

{'n_estimators': 250, 'min_samples_split': 15, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}
[CV] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time=  22.5s
[CV] END bootstrap=False, max_depth=35, max_features=sqrt, min_samples_leaf=8, min_samples_split=25, n_estimators=150; total time=  16.1s
[CV] END bootstrap=False, max_depth=15, max_features=auto, min_samples_leaf=4, min_samples_split=15, n_estimators=150; total time= 4.3min
[CV] END bootstrap=False, max_depth=50, max_features=auto, min_samples_leaf=8, min_samples_split=5, n_estimators=200; total time= 5.7min
[CV] END bootstrap=False, max_depth=None, max_features=auto, min_samples_leaf=12, min_samples_split=15, n_estimators=100; total time= 2.7min
[CV] END bootstrap=False, max_depth=55, max_features=sqrt, min_samples_leaf=4, min_samples_split=15, n_estimators=100; total time=   8.1s
[CV] END bootstrap=False, max_depth=55, m

In [290]:
print(X_train.shape, y_train.shape)

(4, 3096, 96, 144) (1, 3096, 96, 144)


In [291]:
X_train = np.moveaxis(X_train, 0, 1)
y_train = np.moveaxis(y_train, 0, 1)

print(X_train.shape, y_train.shape)

X_train = X_train.reshape(X_train.shape[0], -1)
y_train = y_train.reshape(y_train.shape[0], -1)

print(X_train.shape, y_train.shape)

(3096, 4, 96, 144) (3096, 1, 96, 144)
(3096, 55296) (3096, 13824)


In [294]:
y_pred = rf_pr.predict(X_train)

In [295]:
y_pred.shape

(3096, 13824)

In [None]:
y_pred = y_pred.reshape(3096, 1, 96, 144)

In [303]:
rmse = mean_squared_error(y_train, y_pred, squared=False)

In [304]:
rmse

1.8836844003928504e-05

### Training and testing

In [292]:
# reg0 = RandomForestRegressor(random_state=0)
# reg1 = RandomForestRegressor(random_state=0)
# reg2 = RandomForestRegressor(random_state=0)
# reg3 = RandomForestRegressor(random_state=0)

# if(RSCV==False):
#     rf_tas = reg0.fit(X_train_tas,y_train_tas)
#     rf_pr = reg1.fit(X_train_pr,y_train_pr)
#     rf_pr90 = reg2.fit(X_train_pr90,y_train_pr90)
#     rf_dtr = reg3.fit(X_train_dtr,y_train_dtr)
# else:
#     rf_random0 = RandomizedSearchCV(estimator = reg0, param_distributions = random_grid, n_iter = 29, cv = 3, verbose=2, n_jobs = -1)
#     rf_random1 = RandomizedSearchCV(estimator = reg1, param_distributions = random_grid, n_iter = 29, cv = 3, verbose=2, n_jobs = -1)
#     rf_random2 = RandomizedSearchCV(estimator = reg2, param_distributions = random_grid, n_iter = 29, cv = 3, verbose=2, n_jobs = -1)
#     rf_random3 = RandomizedSearchCV(estimator = reg3, param_distributions = random_grid, n_iter = 29, cv = 3, verbose=2, n_jobs = -1)

#     #n_iter = 29
    
#     rf_tas = rf_random0.fit(X_train_tas,y_train_tas)
#     rf_pr = rf_random1.fit(X_train_pr,y_train_pr)
#     rf_pr90 = rf_random2.fit(X_train_pr90,y_train_pr90)
#     rf_dtr = rf_random3.fit(X_train_dtr,y_train_dtr)