In [8]:
import xarray as xr
import numpy as np
import pandas as pd
import os
import glob
from typing import Dict, Optional, List, Callable, Tuple, Union

import wandb
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV
from eofs.xarray import Eof

### Train and test data loading

In [156]:
# --- Data locations -------------------------------------------------------
# Root directory handed to get_input_data() (forcing / emission inputs).
input_dir = '/home/mila/v/venkatesh.ramesh/scratch/causal_data/inputs/input4mips'
# Root directory handed to get_output_data() (climate-model targets).
target_dir = '/home/mila/v/venkatesh.ramesh/scratch/causal_data/targets/CMIP6'

# --- Selection of what to load ---------------------------------------------
# Climate models whose sub-directories of target_dir are read.
models = ['NorESM2-LM']
# Substring that an input file path must contain to be selected.
fire_type = 'all-fires'
# Target variables to load for every model/experiment.
variables = ['pr']
# Experiments used when mode == 'train'; the trailing list is the fuller set
# that was previously considered.
train_experiments = ["ssp126"] #[ "ssp585", "ssp126", "ssp370"]
# Experiments used when mode == 'test'.
test_experiments = ["ssp245"]
# Input forcing directories that are scanned under each experiment.
input_gases = ['BC_sum', 'CH4_sum', 'CO2_sum', 'SO2_sum']
# Number of ensemble members to load per model; -1 is meant to select all.
total_ensembles = 1 #-1 for all

In [157]:
# TODO: check which variable should be loaded and prepare the data for it.
# The data-preparation code from the dataloader can be reused here.

In [167]:
def load_data(mode: str = 'train') -> tuple[np.ndarray, np.ndarray]:
    """Load the inputs and targets for one data split.

    Args:
        mode: Which split to load; forwarded unchanged to
            ``get_input_data`` and ``get_output_data``.

    Returns:
        ``(X, y)`` where ``X`` comes from ``get_input_data(input_dir, mode)``
        and ``y`` from ``get_output_data(target_dir, mode)``.
    """
    inputs = get_input_data(input_dir, mode)
    targets = get_output_data(target_dir, mode)
    return inputs, targets


def load_data_npz(path: str, train_file: str = 'train.npz', test_file: str = 'test.npz'):
    """Load pre-computed train and test arrays from ``.npz`` archives.

    The earlier version referenced an undefined ``base_dir``, ignored ``path``
    and tuple-unpacked the ``np.load`` result directly (which, for an ``.npz``
    archive, yields the stored key names rather than the arrays).

    Args:
        path: Directory that contains both archives.
        train_file: File name of the training archive inside ``path``.
        test_file: File name of the test archive inside ``path``.
            NOTE(review): the original left both file names blank, so the
            defaults here are placeholders -- adjust to the real names.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as numpy arrays.

    Raises:
        ValueError: If an archive does not hold exactly two arrays.
    """
    def _read_pair(filename: str):
        # Each archive must store exactly two arrays, in the order (X, y).
        with np.load(os.path.join(path, filename)) as archive:
            names = archive.files
            if len(names) != 2:
                raise ValueError(
                    f"{filename!r} must contain exactly two arrays (X, y), "
                    f"found {len(names)}: {names}"
                )
            # Indexing materialises the arrays, so they outlive the archive.
            return archive[names[0]], archive[names[1]]

    X_train, y_train = _read_pair(train_file)
    X_test, y_test = _read_pair(test_file)
    return X_train, y_train, X_test, y_test


def get_input_data(path: str, mode: str) -> np.ndarray:
    """Load the forcing inputs for one split as a single numpy array.

    For every experiment of the requested split and every entry of
    ``input_gases`` the directory ``<path>/<exp>/<gas>/map_250_km/mon`` is
    searched recursively for ``*.nc`` files. BC, CH4 and SO2 files are kept
    only if their path contains ``fire_type``; CO2 files are kept
    unconditionally.

    Fixes over the earlier version: SO2 files are now selected from the
    ``SO2_sum`` directory (a copy-paste slip filled the SO2 channel with BC
    files), an unknown ``mode`` raises ``ValueError`` instead of an
    ``UnboundLocalError``, and the return annotation matches what is returned.

    Args:
        path: Root directory of the input data.
        mode: ``'train'`` (uses ``train_experiments``) or ``'test'``
            (uses ``test_experiments``).

    Returns:
        The BC, CH4, CO2 and SO2 arrays concatenated along axis 0, in that
        order.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
        FileNotFoundError: If no file was found for one of the four gases.
    """
    if mode == 'train':
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
    else:
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

    # Insertion order fixes the channel order of the returned array.
    files_by_gas = {'BC_sum': [], 'CH4_sum': [], 'CO2_sum': [], 'SO2_sum': []}

    for exp in experiments:
        for gas in input_gases:
            if gas not in files_by_gas:
                # Gases other than the four handled here are ignored.
                continue
            var_dir = os.path.join(path, exp, gas, 'map_250_km/mon')
            files = glob.glob(var_dir + '/**/*.nc', recursive=True)
            if gas == 'CO2_sum':
                # NOTE(review): CO2 is taken without the fire_type filter, as
                # before -- presumably its files carry no fire tag; confirm.
                files_by_gas[gas].extend(files)
            else:
                files_by_gas[gas].extend(f for f in files if fire_type in f)

    arrays = []
    for gas, gas_files in files_by_gas.items():
        if not gas_files:
            raise FileNotFoundError(
                f"no input files found for {gas!r} (mode={mode!r}) under {path!r}"
            )
        arrays.append(xr.open_mfdataset(gas_files).compute().to_array().to_numpy())

    return np.concatenate(arrays, axis=0)


def get_output_data(path: str, mode: str) -> np.ndarray:
    """Load the climate-model targets for one split as a numpy array.

    For every model in ``models``, every selected ensemble directory, every
    experiment of the split and every entry of ``variables``, the directory
    ``<path>/<model>/<ensemble>/<exp>/<var>/250_km/mon`` is searched
    recursively for ``*.nc`` files; all matches are opened together.

    Fixes over the earlier version: any ``total_ensembles`` value other than 1
    passed a list to ``os.path.join`` and raised ``TypeError``; ensemble
    directories are now sorted so the selection is reproducible
    (``os.listdir`` returns entries in arbitrary order); an unknown ``mode``
    raises ``ValueError``; the return annotation matches what is returned.

    Args:
        path: Root directory of the target data.
        mode: ``'train'`` (uses ``train_experiments``) or ``'test'``
            (uses ``test_experiments``).

    Returns:
        The opened datasets converted with ``.to_array().to_numpy()``.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'test'``.
        FileNotFoundError: If no target file was found.
    """
    if mode == 'train':
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
    else:
        raise ValueError(f"mode must be 'train' or 'test', got {mode!r}")

    nc_files = []
    for mod in models:
        model_dir = os.path.join(path, mod)
        ensembles = sorted(os.listdir(model_dir))

        # A negative total_ensembles (documented as -1) keeps every ensemble.
        if total_ensembles >= 0:
            ensembles = ensembles[:total_ensembles]

        # NOTE(review): with several ensembles all files go into one
        # open_mfdataset call -- confirm that they combine as intended.
        for ens in ensembles:
            for exp in experiments:
                for var in variables:
                    var_dir = os.path.join(model_dir, ens, exp, var, '250_km/mon')
                    nc_files += glob.glob(var_dir + '/**/*.nc', recursive=True)

    if not nc_files:
        raise FileNotFoundError(
            f"no target files found (mode={mode!r}) under {path!r}"
        )

    return xr.open_mfdataset(nc_files).compute().to_array().to_numpy()

In [168]:
# The variables are named *_train, so load the training experiments
# (the previous call passed 'test', which put the test split in them).
X_train, y_train = load_data('train')
print(X_train.shape, y_train.shape)

(4, 1032, 96, 144) (1, 1032, 96, 144)


### RF parameters & hyperparameters (same as ClimateBench)

In [162]:
# Parameters and hyperparameter search space for the random forests.

# When True the forests are tuned with a randomized search; otherwise they
# are fitted with default hyperparameters.
RSCV = True
path_output = 'output_path/output.nc'

# Number of trees in the random forest: 100, 150, ..., 300.
n_estimators = [int(x) for x in np.linspace(start=100, stop=300, num=5)]
# Number of features to consider at every split. 1.0 (all features) replaces
# the former 'auto' option, which meant the same thing for regressors but is
# no longer accepted by recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 55, plus None (unlimited).
max_depth = [int(x) for x in np.linspace(5, 55, num=11)]
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [5, 10, 15, 25]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [4, 8, 12]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Search space handed to RandomizedSearchCV.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [None]:
# Fit one random forest per target (tas, pr, pr90, dtr), either directly or
# through a randomized hyperparameter search over `random_grid`.
#
# NOTE(review): X_train_tas / y_train_tas (and the pr, pr90, dtr variants)
# are not defined in any cell visible here -- they must be created before
# this cell can run.

reg0 = RandomForestRegressor(random_state=0)
reg1 = RandomForestRegressor(random_state=0)
reg2 = RandomForestRegressor(random_state=0)
reg3 = RandomForestRegressor(random_state=0)

if not RSCV:
    rf_tas = reg0.fit(X_train_tas, y_train_tas)
    rf_pr = reg1.fit(X_train_pr, y_train_pr)
    rf_pr90 = reg2.fit(X_train_pr90, y_train_pr90)
    rf_dtr = reg3.fit(X_train_dtr, y_train_dtr)
else:
    # Settings shared by all four searches. random_state seeds the sampling
    # of hyperparameter candidates so a re-run draws the same candidates.
    search_kwargs = dict(
        param_distributions=random_grid,
        n_iter=29,
        cv=3,
        verbose=2,
        n_jobs=-1,
        random_state=0,
    )

    rf_random0 = RandomizedSearchCV(estimator=reg0, **search_kwargs)
    rf_random1 = RandomizedSearchCV(estimator=reg1, **search_kwargs)
    rf_random2 = RandomizedSearchCV(estimator=reg2, **search_kwargs)
    rf_random3 = RandomizedSearchCV(estimator=reg3, **search_kwargs)

    rf_tas = rf_random0.fit(X_train_tas, y_train_tas)
    rf_pr = rf_random1.fit(X_train_pr, y_train_pr)
    rf_pr90 = rf_random2.fit(X_train_pr90, y_train_pr90)
    rf_dtr = rf_random3.fit(X_train_dtr, y_train_dtr)