In [7]:
import pandas as pd
from netCDF4 import Dataset
import xarray as xr
import h5py
import numpy as np
import os
import os.path
import copy
import warnings
from siphon import catalog
from sklearn.tree import DecisionTreeRegressor
from dask.distributed import Client, LocalCluster
overwrite = False

warnings.filterwarnings(
    "ignore",
    message="The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
    category=FutureWarning,
)

In [8]:
experiments = [
               '1pctCO2', 'abrupt-4xCO2', 'historical', # CMIP
               'hist-GHG', 'hist-aer', # DAMIP
               'ssp370', 'ssp370-lowNTCF', 'ssp585' # ScenarioMIP
]

In [9]:
def forge_output(experiment):
    tas_prior = (xr.open_dataset(f'data_store/tas_means/NorESM2-LM_{experiment}_tas.nc')).sel(member='r1i1p1f1')['tas']
    pr_prior = (xr.open_dataset(f'data_store/pr_means/NorESM2-LM_{experiment}_pr.nc')).sel(member='r1i1p1f1')['pr']
    pr_perc_prior = (xr.open_dataset(f'data_store/pr_perc/NorESM2-LM_{experiment}_pr_p90_grid_annual_total_mm.nc')).sel(member='r1i1p1f1')['pr_p90_grid_annual_total_mm']

    tas_al, pr_al, prp90_al = xr.align(tas_prior, pr_prior, pr_perc_prior, join='inner')    
    toreturn = xr.Dataset({"tas": tas_al, "pr": pr_al, "pr_p90": prp90_al})
    
    return (np.array(toreturn.to_dataarray())).T

def forge_input(experiment):
    input_prior = xr.open_dataset(f'input_data/inputs_{experiment}.nc')
    
    try:
        weights = np.cos(np.deg2rad(input_prior['latitude'])).rename('weights')
        SO2_gm = input_prior['SO2'].weighted(weights).mean(['latitude','longitude'], skipna=True)
        BC_gm  = input_prior['BC'] .weighted(weights).mean(['latitude','longitude'], skipna=True)
        input_prior = input_prior.drop_vars(['SO2','BC']).assign(SO2=SO2_gm, BC=BC_gm)
        return (np.array(input_prior.to_dataarray())).T
    except: 
        return (np.array(input_prior.to_dataarray())).T

def mean_first3_available(da):
    if "member" not in da.dims:
        return da
    avail = da["member"].astype(str).values  
    sel = [m for m in MEMBERS3 if m in avail]
    if sel:
        return da.sel(member=sel, drop=True).mean("member", skipna=True)
    # none of the named members exist → take first up to 3
    return da.isel(member=slice(0, 3)).mean("member", skipna=True)

In [10]:
MODEL = "NorESM2-LM"
DATA_BASE = "data_store"
MEMBERS3 = ["r1i1p1f1", "r2i1p1f1", "r3i1p1f1"]

# build outputs

cols = ["tas", "pr", "pr_p90"]
full_output = pd.DataFrame(columns=cols + ["experiment", "year"])

for experiment in experiments:
    arr = forge_output(experiment)  

    tas = xr.open_dataset(f"{DATA_BASE}/tas_means/{MODEL}_{experiment}_tas.nc")["tas"]
    pr  = xr.open_dataset(f"{DATA_BASE}/pr_means/{MODEL}_{experiment}_pr.nc")["pr"]
    p90 = xr.open_dataset(
        f"{DATA_BASE}/pr_perc/{MODEL}_{experiment}_pr_p90_grid_annual_total_mm.nc"
    )["pr_p90_grid_annual_total_mm"]

    tas = mean_first3_available(tas)
    pr  = mean_first3_available(pr)
    p90 = mean_first3_available(p90)

    a, b, c = xr.align(tas, pr, p90, join="inner")
    coord = "year" if "year" in a.coords else "time"
    years_out = a[coord].values
    years_out = years_out.astype(int)

    df = pd.DataFrame(arr, columns=cols)
    df["experiment"] = experiment
    df["year"] = years_out
    full_output = pd.concat([full_output, df], ignore_index=True)

# build inputs

cols = ["CO2", "SO2", "CH4", "BC"]
full_input = pd.DataFrame(columns=cols + ["experiment", "year"])

for experiment in experiments:
    arr = forge_input(experiment)  # shape: (n_years_in, 4)

    cand1 = os.path.join("input_data", f"inputs_{experiment}.nc")
    cand2 = os.path.join("input_data", "inputs_ssp370.nc")
    in_path = cand1 if os.path.exists(cand1) else cand2
    ds_in = xr.open_dataset(in_path)

    years_in = ds_in["time"].values
    years_in = years_in.astype(int)

    df = pd.DataFrame(arr, columns=cols)
    df["experiment"] = experiment
    df["year"] = years_in

    years_out_set = set(
        full_output.loc[full_output["experiment"] == experiment, "year"].astype(int)
    )
    df = df[df["year"].isin(years_out_set)]

    full_input = pd.concat([full_input, df], ignore_index=True)

keys = (
    full_output[["experiment", "year"]]
    .merge(full_input[["experiment", "year"]], on=["experiment", "year"], how="inner")
    .drop_duplicates()
)

full_output = keys.merge(full_output, on=["experiment", "year"], how="left")
full_input  = keys.merge(full_input,  on=["experiment", "year"], how="left")

print("checking for experiment alignment: ", (full_input['experiment'] == full_output['experiment']).mean())
print("checking for year alignment: ", (full_input['year'] == full_output['year']).mean())

full_output.drop(columns=['experiment', 'year'], inplace=True)
full_input.drop(columns=['experiment', 'year'], inplace=True)

print("full_output:", full_output.shape)
print("full_input :", full_input.shape)

checking for experiment alignment:  1.0
checking for year alignment:  1.0
full_output: (1344, 3)
full_input : (1344, 4)


In [11]:
### BUILDING MODELS

In [12]:
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [13]:
rf = RandomForestRegressor(
    n_estimators=1200,
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=2,
    n_jobs=-2,
    random_state=42
)
rf.fit(full_input, full_output)


0,1,2
,n_estimators,1200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
test_inputs = xr.open_dataset("/glade/u/home/nchu/test_data/inputs_ssp245.nc")
weights = np.cos(np.deg2rad(test_inputs['latitude'])).rename('weights')
SO2_gm = test_inputs['SO2'].weighted(weights).mean(['latitude','longitude'], skipna=True)
BC_gm  = test_inputs['BC'] .weighted(weights).mean(['latitude','longitude'], skipna=True)
test_inputs = test_inputs.drop_vars(['SO2','BC']).assign(SO2=SO2_gm, BC=BC_gm)
test_inputs = (np.array(test_inputs.to_dataarray())).T
print(test_inputs.shape)

test_outputs = xr.open_dataset("/glade/u/home/nchu/test_data/outputs_ssp245.nc").sel(member=1, drop=True)
test_outputs = test_outputs.drop_vars("diurnal_temperature_range")
weights_out = np.cos(np.deg2rad(test_outputs['lat'])).rename('weights')
tas_gm = test_outputs['tas'].weighted(weights_out).mean(['lat','lon'], skipna=True)
pr_gm  = test_outputs['pr'] .weighted(weights_out).mean(['lat','lon'], skipna=True)
pr90_gm = test_outputs['pr90'].weighted(weights_out).mean(['lat','lon'], skipna=True)
test_outputs = test_outputs.drop_vars(['tas','pr','pr90']).assign(tas=tas_gm, pr=pr_gm, pr90=pr90_gm)
test_outputs = test_outputs[['tas','pr','pr90']]
test_outputs['tas'] = test_outputs['tas'] + pred[0][0]
test_outputs['pr'] = test_outputs['pr'] + pred[0][1]
test_outputs['pr90'] = test_outputs['pr90'] + pred[0][2]
test_output = (np.array(test_outputs.to_dataarray())).T
print(test_output.shape)


(86, 4)
(86, 3)


In [39]:
pred = rf.predict(test_inputs) 



In [44]:
rmse  = np.sqrt(np.mean((test_output - pred)**2, axis=0))
rng = np.ptp(pred, axis=0)   
nrmse = rmse / np.where(rng == 0, 1, rng)
per_target = pd.DataFrame({"NRMSE": nrmse, "RMSE": rmse})
print(per_target)

print("Overall NRMSE (avg)        :", nrmse.mean())
print("Overall RMSE (avg)        :", rmse.mean())


      NRMSE          RMSE
0  0.339852  8.701839e-01
1  0.181349  2.438889e-07
2  0.447577  3.352252e+01
Overall NRMSE (avg)        : 0.32292600384713566
Overall RMSE (avg)        : 11.46423521371621


In [41]:
pred[0][1]

3.3182910763271524e-05