In [3]:
import pandas as pd
import numpy as np
from joblib import dump, load
import random

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from scipy.stats import ttest_rel, friedmanchisquare
from statsmodels.stats.multicomp import pairwise_tukeyhsd

Helper Functions

In [4]:
# initial random seed
seed = 42

In [5]:
def compute_metrics(y_true, y_pred):
    """Compute RMSE, MAE, and R2."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

In [6]:
def compute_stability(model, X, y, noise_level=0.01, n_trials=5):
    """
    Add Gaussian noise to numeric features and measure average relative RMSE change.
    noise_level is fraction of std-dev of each feature.
    """
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    base_rmse = np.sqrt(mean_squared_error(y, model.predict(X)))
    rel_changes = []
    
    for _ in range(n_trials):
        Xp = X.copy()
        noise = np.random.normal(0, noise_level * Xp[numeric_cols].std(), 
                                 size=Xp[numeric_cols].shape)
        Xp[numeric_cols] += noise
        rp = model.predict(Xp)
        rmse_p = np.sqrt(mean_squared_error(y, rp))
        rel_changes.append((rmse_p - base_rmse) / base_rmse)
    
    # Return average relative change (lower = more stable)
    return np.mean(rel_changes)

# Model Definition

In [7]:
# models = {
#     'Ridge': Ridge(random_state=42, alpha=1.0), 
#     'RandomForest': RandomForestRegressor(n_estimators=100, random_state=seed),
#     'XGBoost': XGBRegressor(
#         n_estimators=100, 
#         eval_metric='rmse', 
#         random_state=seed,
#         learning_rate=0.1,
#         ),
#     'MLP': MLPRegressor(
#         hidden_layer_sizes=(128, 64, 32),
#         activation='relu',
#         solver='lbfgs',
#         learning_rate='adaptive',
#         learning_rate_init=1e-3,
#         alpha=1e-4,
#         early_stopping=True,
#         validation_fraction=0.1,
#         max_iter=300,
#         random_state=seed
#         )
# }

def init_models(seed):
    models = {
        'Ridge': Ridge(random_state=42, alpha=1.0), 
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=seed),
        'XGBoost': XGBRegressor(
            n_estimators=100, 
            eval_metric='rmse', 
            random_state=seed,
            learning_rate=0.1,
            ),
        'MLP': MLPRegressor(
            hidden_layer_sizes=(128, 64, 32),
            activation='relu',
            solver='lbfgs',
            learning_rate='adaptive',
            learning_rate_init=1e-3,
            alpha=1e-4,
            early_stopping=True,
            validation_fraction=0.1,
            max_iter=300,
            random_state=seed
            )
    }

    return models

In [None]:
param_grids = {
    'Ridge': {
        'alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
        },
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
        },
    'XGBoost': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 6],
        'subsample': [0.8, 1.0],
        },
    'MLP': {
        # deeper, tapered architectures
        'hidden_layer_sizes': [
            (128, 64, 32),
            (256, 128, 64, 32)
        ],
        'alpha': [1e-5, 1e-4, 1e-3],
        'learning_rate_init': [1e-4, 1e-3],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'lbfgs'],
        'early_stopping': [True],
        'n_iter_no_change': [20],
        'max_iter': [500]
    }
}

In [9]:
datasets = load('..\data\experimental\experiment_datasets_3.joblib')  # Load datasets from joblib file

# Manual Training

In [6]:
train_df = datasets['baselines']['full']['within_sample']['within_sample']['train']
test_df = datasets['baselines']['full']['within_sample']['within_sample']['test']

train_df.shape, test_df.shape

((291, 82), (73, 82))

In [7]:
X_train = train_df.drop(columns=['totalEsg'])
y_train = train_df['totalEsg']
X_test  = test_df.drop(columns=['totalEsg'])
y_test  = test_df['totalEsg']

In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((291, 81), (291,), (73, 81), (73,))

In [9]:
model = models['Ridge']  # Choose the model you want to use

In [10]:
model

In [11]:
model.fit(X_train, y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [12]:
# predict
y_pred = model.predict(X_test)

In [13]:
# compute metrics
rmse, mae, r2 = compute_metrics(y_test, y_pred)
stability = compute_stability(model, X_test, y_test,
                            noise_level=0.01, n_trials=5)

In [14]:
manual_results = {
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2,
            'Stability': stability
        }

In [15]:
pd.DataFrame(manual_results, index=[0])

Unnamed: 0,RMSE,MAE,R2,Stability
0,44.795695,15.571981,-32.245542,-0.001084


# Iterative Training

In [10]:
def train_and_evaluate(model_name, model, X_train, y_train, X_test, y_test, param_grid):
    """Train model with grid search and evaluate performance"""
    # Prepare grid search
    grid = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=3,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )
    
    # Train model
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    
    # Make predictions
    y_pred = best_model.predict(X_test)
    
    # Compute metrics
    rmse, mae, r2 = compute_metrics(y_test, y_pred)
    stability = compute_stability(best_model, X_test, y_test, 
                                noise_level=0.01, n_trials=5)
    
    # # Feature importance (if applicable)
    # if hasattr(best_model, "feature_importances_"):
    #     importances = best_model.feature_importances_
    # elif hasattr(best_model, "coef_"):
    #     importances = best_model.coef_.ravel()
    # else:
    #     perm = permutation_importance(
    #         best_model, X_test, y_test,
    #         n_repeats=5, random_state=0, n_jobs=-1
    #     )
    #     importances = perm.importances_mean
    
    return {
        'model': model_name,
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'Stability': stability,
        # 'Feature_names': list(X_train.columns),
        # 'Feature_importances': importances.tolist(),
    }

In [11]:
def prepare_data(df, target_col='totalEsg'):
    """Separate features and target"""
    X = df.drop(columns=[target_col])
    y = df[target_col]
    return X, y

In [12]:
def process_dataset(train_df, test_df, models, param_grids, scenario_info):
    """Process a single dataset with all models"""
    results = []
    
    # Prepare data
    X_train, y_train = prepare_data(train_df)
    X_test, y_test = prepare_data(test_df)
    
    # print(f"Data shapes - Train: {train_df.shape}, Test: {test_df.shape}")
    
    # Train and evaluate all models
    for model_name, model in models.items():
        result = train_and_evaluate(
            model_name, model, 
            X_train, y_train, 
            X_test, y_test,
            param_grids[model_name]
        )
        results.append({**scenario_info, **result})
    
    return results

In [13]:
datasets['baselines']['full']['region_holdout']['europe_n_central_asia']['train'].shape

(275, 54)

In [14]:
results = []
n_runs = 5 # number of runs for statistical comparison

In [15]:
import warnings

warnings.filterwarnings("ignore", message="Ill-conditioned matrix", category=UserWarning)

try:
    from sklearn.utils._testing import LinAlgWarning
except ImportError:
    from scipy.linalg import LinAlgWarning
except ImportError:
     LinAlgWarning = UserWarning # Fallback if LinAlgWarning is not easily accessible

warnings.filterwarnings("ignore", category=LinAlgWarning)

In [16]:
datasets['diversified']['median_balanced']['region_holdout']['east_asia_n_pacific']['train'].isna().any().any()

False

In [17]:
%%time
for run in range(n_runs):
    seed = random.randint(0, 100)  # Random seed for each run
    models = init_models(seed)  # Initialize models with the new seed
    print(f"Run {run + 1}/{n_runs} with seed {seed}")
    for scenario_type, scenario_dict in datasets.items():        # e.g., 'baselines', 'diversified'
        for scenario_name, splits in scenario_dict.items():      # e.g., 'full', 'constrained', 'max_balanced', etc.
            for split_type, data_group in splits.items():        # 'within_sample', 'region_holdout', 'size_holdout'
                # if split_type == 'original_data':
                #     continue  # skip original data entries

                # Handle different split types
                if split_type == 'within_sample':
                    data = data_group['within_sample'] # for within_sample, the data structure is one level deeper

                    scenario_info = {
                        'run': run,
                        'scenario_type': scenario_type,
                        'scenario': scenario_name,
                        'split': split_type,
                        'context': 'within_sample',
                    }
                    
                    results.extend(process_dataset(
                        data['train'], 
                        data['test'], 
                        models,
                        param_grids,
                        scenario_info
                    ))
                
                elif split_type in ['region_holdout', 'size_holdout']:
                    for context_name, data in data_group.items():
                        if 'train' not in data or 'test' not in data:
                            continue # skip if train/test not available

                        scenario_info = {
                            'run': run,
                            'scenario_type': scenario_type,
                            'scenario': scenario_name,
                            'split': split_type,
                            'context': context_name
                        }
                        
                        results.extend(process_dataset(
                            data['train'],
                            data['test'],
                            models,
                            param_grids,
                            scenario_info
                        ))

results_df = pd.DataFrame(results)

Run 1/5 with seed 69


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Run 2/5 with seed 92


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

Run 3/5 with seed 98


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Run 4/5 with seed 90




Run 5/5 with seed 73


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


CPU times: total: 56min 34s
Wall time: 1h 35min 15s


In [18]:
results_df = pd.DataFrame(results)

In [19]:
results_df.to_csv('..\data\experimental\experiment_results_3_new.csv', index=False)

In [None]:
results_df.head(25)

Unnamed: 0,run,scenario_type,scenario,split,context,model,RMSE,MAE,R2,Stability
0,0,baselines,full,within_sample,within_sample,Ridge,36.05126,13.17664,-20.53286,-0.000886
1,0,baselines,full,within_sample,within_sample,RandomForest,6.853874,5.183024,0.221725,0.04208
2,0,baselines,full,within_sample,within_sample,XGBoost,6.95985,5.228697,0.1974713,0.032714
3,0,baselines,full,within_sample,within_sample,MLP,199588300.0,32125020.0,-659980300000000.0,0.000613
4,0,baselines,full,region_holdout,east_asia_n_pacific,Ridge,122.1359,67.28334,-261.9003,0.000968
5,0,baselines,full,region_holdout,east_asia_n_pacific,RandomForest,8.878998,7.331653,-0.3894165,0.045592
6,0,baselines,full,region_holdout,east_asia_n_pacific,XGBoost,8.221527,6.583589,-0.1912682,0.048606
7,0,baselines,full,region_holdout,east_asia_n_pacific,MLP,15440440000.0,1993592000.0,-4.20169e+18,1.4e-05
8,0,baselines,full,region_holdout,europe_n_central_asia,Ridge,7.365533,5.843953,0.05464359,-0.000264
9,0,baselines,full,region_holdout,europe_n_central_asia,RandomForest,6.038049,4.663827,0.3646976,0.07304


In [20]:
results_df.shape

(700, 10)

In [21]:
results_df['R2'].max()

0.8804426332854435

In [22]:
# results_df = pd.read_csv('..\data\experimental\experiment_results_3_huh.csv')

In [23]:
results_df

Unnamed: 0,run,scenario_type,scenario,split,context,model,RMSE,MAE,R2,Stability
0,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247
1,0,baselines,full,within_sample,within_sample,RandomForest,7.141293,5.471655,0.155082,0.028876
2,0,baselines,full,within_sample,within_sample,XGBoost,7.430083,5.558425,0.085364,0.028490
3,0,baselines,full,within_sample,within_sample,MLP,8.933511,6.513849,-0.322224,0.000515
4,0,baselines,full,region_holdout,east_asia_n_pacific,Ridge,8.262882,6.642976,-0.203283,-0.000083
...,...,...,...,...,...,...,...,...,...,...
695,4,diversified,median_balanced,size_holdout,Mid-Cap,MLP,8.368405,6.460525,-0.290265,0.000418
696,4,diversified,median_balanced,size_holdout,Small-Cap,Ridge,9.533633,7.213626,-0.135753,-0.000024
697,4,diversified,median_balanced,size_holdout,Small-Cap,RandomForest,9.046370,6.749570,-0.022623,-0.019296
698,4,diversified,median_balanced,size_holdout,Small-Cap,XGBoost,8.889949,6.638005,0.012435,-0.012236


# Statistical Comparisons

In [24]:
results_df[results_df.scenario_type=='diversified'].scenario.unique()

array(['max_balanced', 'median_balanced'], dtype=object)

In [25]:
mask_base = (
    (results_df.scenario_type=='baselines') &
    (results_df.scenario=='full') &
    (results_df.split=='within_sample') &
    (results_df.context=='within_sample') &
    (results_df.model=='MLP')
)

mask_div = (
    (results_df.scenario_type=='diversified') &
    (results_df.scenario=='median_balanced') &
    (results_df.split=='within_sample') &
    (results_df.context=='within_sample') &
    (results_df.model=='MLP')
)

In [26]:
results_df[mask_base].sort_values('run')['RMSE']

3       8.933511
143     9.809793
283    12.238902
423     9.915267
563     8.664135
Name: RMSE, dtype: float64

In [27]:
results_df[mask_div].sort_values('run')['RMSE']

103    6.470807
243    7.871145
383    5.852423
523    6.536421
663    6.767249
Name: RMSE, dtype: float64

In [28]:
def paired_t_tests(results, baseline_scenario='full', diversified_scenarios=None):
    """
    Perform paired t-tests comparing baseline 'baselines' vs each diversified scenario.
    Returns a DataFrame of test statistics and p-values for each combination of:
    split, context, model, diversified_scenario.
    """
    if diversified_scenarios is None:
        diversified_scenarios = results[results.scenario_type=='diversified']['scenario'].unique()
    
    rows = []
    for split in results['split'].unique():
        for context in results[results['split']==split]['context'].unique():
            for model in results['model'].unique():
                # baseline RMSEs
                mask_base = (
                    (results.scenario_type=='baselines') &
                    (results.scenario=='full') &
                    (results.split==split) &
                    (results.context==context) &
                    (results.model==model)
                )
                base_vals = results[mask_base].sort_values('run')['RMSE'].values
                for div_scenario in diversified_scenarios:
                    mask_div = (
                        (results.scenario_type=='diversified') &
                        (results.scenario==div_scenario) &
                        (results.split==split) &
                        (results.context==context) &
                        (results.model==model)
                    )
                    div_vals = results[mask_div].sort_values('run')['RMSE'].values
                    # require equal-length arrays
                    if len(base_vals)==len(div_vals) and len(base_vals)>1:
                        stat, p = ttest_rel(base_vals, div_vals)
                        rows.append({
                            'split': split,
                            'context': context,
                            'model': model,
                            'diversified_scenario': div_scenario,
                            't_stat': stat,
                            'p_value': p
                        })
    return pd.DataFrame(rows)

paired_results_df = paired_t_tests(results_df)

  return hypotest_fun_in(*args, **kwds)


In [29]:
paired_results_df

Unnamed: 0,split,context,model,diversified_scenario,t_stat,p_value
0,within_sample,within_sample,Ridge,max_balanced,inf,0.000000e+00
1,within_sample,within_sample,Ridge,median_balanced,inf,0.000000e+00
2,within_sample,within_sample,RandomForest,max_balanced,78.786996,1.555488e-07
3,within_sample,within_sample,RandomForest,median_balanced,32.629927,5.259838e-06
4,within_sample,within_sample,XGBoost,max_balanced,58.042328,5.276100e-07
...,...,...,...,...,...,...
75,size_holdout,Small-Cap,RandomForest,median_balanced,-70.636104,2.406934e-07
76,size_holdout,Small-Cap,XGBoost,max_balanced,-7.596347,1.611210e-03
77,size_holdout,Small-Cap,XGBoost,median_balanced,-27.500271,1.039883e-05
78,size_holdout,Small-Cap,MLP,max_balanced,-0.153955,8.851006e-01


In [30]:
# ----------------------------------------
# 2. Friedman + Tukey HSD: Multiple-model comparison
#    on diversified within-sample 'full' (i.e., context=='within_sample')
# ----------------------------------------

def multi_model_comparison(results, scenario_type='diversified', scenario_name='max_balanced'):
    """
    Perform Friedman test across models on within-sample for a given diversified scenario,
    followed by Tukey HSD for post-hoc pairwise comparisons.
    Returns a tuple: (friedman_stat, friedman_p, tukey_df)
    """
    # Filter for within-sample context
    mask = (
        (results.scenario_type==scenario_type) &
        (results.scenario==scenario_name) &
        (results.split=='within_sample')
    )
    df_sub = results[mask]
    
    # Pivot: rows=runs, columns=models
    pivot = df_sub.pivot(index='run', columns='model', values='RMSE')
    models = pivot.columns.tolist()
    
    # Friedman test
    data = [pivot[m].values for m in models]
    stat, p = friedmanchisquare(*data)
    
    # Tukey HSD
    tukey = pairwise_tukeyhsd(endog=df_sub['RMSE'], groups=df_sub['model'], alpha=0.05)
    tukey_df = pd.DataFrame(data=tukey._results_table.data[1:], 
                            columns=tukey._results_table.data[0])
    
    return stat, p, tukey_df

# Apply multi-model comparison for each diversified scenario_name
multi_results = []
for scenario_name in results_df[results_df.scenario_type=='diversified']['scenario'].unique():
    stat, p, tukey_df = multi_model_comparison(results_df, scenario_type='diversified', scenario_name=scenario_name)
    # store summary
    summary = {
        'scenario': scenario_name,
        'friedman_stat': stat,
        'friedman_p': p
    }
    multi_results.append(summary)
    # display Tukey results per scenario
    # tools.display_dataframe_to_user(f"Tukey HSD Post-hoc ({scenario_name})", tukey_df)

multi_results_df = pd.DataFrame(multi_results)

In [33]:
multi_results_df

Unnamed: 0,scenario,friedman_stat,friedman_p
0,max_balanced,15.0,0.001817
1,median_balanced,12.6,0.005587


In [34]:
tukey_df

Unnamed: 0,group1,group2,meandiff,p-adj,lower,upper,reject
0,MLP,RandomForest,-1.1702,0.0011,-1.8716,-0.4688,True
1,MLP,Ridge,-0.1557,0.9191,-0.8572,0.5457,False
2,MLP,XGBoost,-1.5869,0.0,-2.2883,-0.8855,True
3,RandomForest,Ridge,1.0145,0.0039,0.3131,1.7159,True
4,RandomForest,XGBoost,-0.4167,0.3558,-1.1181,0.2847,False
5,Ridge,XGBoost,-1.4312,0.0001,-2.1326,-0.7298,True


In [35]:
# Extract within-sample RMSE per scenario/model
within = results_df[results_df['split'] == 'within_sample'][['scenario_type', 'scenario', 'model', 'RMSE']]
within = within.rename(columns={'RMSE': 'RMSE_within'})

# Merge to get RMSE_within alongside all rows
merged = results_df.merge(within, on=['scenario_type', 'scenario', 'model'], how='left')

# Compute Cross-Context Generalization Score:
#    Transfer Ratio = (RMSE_holdout / RMSE_within)
#    For within-sample rows, set NaN
merged['TransferRatio'] = np.where(
    merged['split'] == 'within_sample',
    np.nan,
    merged['RMSE_within'] / merged['RMSE']
)

In [36]:
merged

Unnamed: 0,run,scenario_type,scenario,split,context,model,RMSE,MAE,R2,Stability,RMSE_within,TransferRatio
0,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247,7.610812,
1,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247,7.610812,
2,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247,7.610812,
3,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247,7.610812,
4,0,baselines,full,within_sample,within_sample,Ridge,7.610812,5.862182,0.040328,0.000247,7.610812,
...,...,...,...,...,...,...,...,...,...,...,...,...
3495,4,diversified,median_balanced,size_holdout,Small-Cap,MLP,11.664123,9.064744,-0.700087,-0.000265,6.470807,0.554762
3496,4,diversified,median_balanced,size_holdout,Small-Cap,MLP,11.664123,9.064744,-0.700087,-0.000265,7.871145,0.674817
3497,4,diversified,median_balanced,size_holdout,Small-Cap,MLP,11.664123,9.064744,-0.700087,-0.000265,5.852423,0.501746
3498,4,diversified,median_balanced,size_holdout,Small-Cap,MLP,11.664123,9.064744,-0.700087,-0.000265,6.536421,0.560387


In [37]:
merged.groupby('model').agg(metric=('R2','std'))

Unnamed: 0_level_0,metric
model,Unnamed: 1_level_1
MLP,1.430159
RandomForest,0.287858
Ridge,0.307946
XGBoost,0.280969


In [38]:
merged.groupby(['scenario_type', 'scenario', 'split', 'context', 'model']).agg(
        RMSE_mean=('RMSE', 'mean'),
        RMSE_std=('RMSE', 'std'),
        MAE_mean=('MAE', 'mean'),
        MAE_std=('MAE', 'std'),
        R2_mean=('R2', 'mean'),
        R2_std=('R2', 'std'),
        Stability_mean=('Stability', 'mean'),
        Stability_std=('Stability', 'std'),
        TransferRatio_mean=('TransferRatio', 'mean'),
        TransferRatio_std=('TransferRatio', 'std')
    )

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,RMSE_mean,RMSE_std,MAE_mean,MAE_std,R2_mean,R2_std,Stability_mean,Stability_std,TransferRatio_mean,TransferRatio_std
scenario_type,scenario,split,context,model,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
baselines,constrained,region_holdout,europe_n_central_asia,MLP,13.050503,2.826647,10.648506,2.937194,-2.083765,1.394610,1.910552e-05,0.000206,0.938396,0.313625
baselines,constrained,region_holdout,europe_n_central_asia,RandomForest,6.538190,0.034804,5.062671,0.081441,0.259334,0.007893,3.243965e-02,0.007024,0.984382,0.018992
baselines,constrained,region_holdout,europe_n_central_asia,Ridge,7.074343,0.000000,5.761852,0.000000,0.132903,0.000000,-3.908724e-07,0.000106,0.950238,0.000000
baselines,constrained,region_holdout,europe_n_central_asia,XGBoost,6.783723,0.157804,5.249376,0.258356,0.202268,0.037221,2.196815e-02,0.006727,0.910356,0.034664
baselines,constrained,region_holdout,north_america,MLP,10.427466,1.380349,7.984294,1.004880,-1.355518,0.658738,-1.738065e-04,0.000592,1.145372,0.338502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
diversified,median_balanced,size_holdout,Small-Cap,XGBoost,9.091902,0.122919,6.638524,0.008282,-0.033124,0.027875,-1.251065e-02,0.008258,0.562431,0.020553
diversified,median_balanced,within_sample,within_sample,MLP,6.699609,0.672956,3.569660,0.263369,0.171788,0.170560,5.851265e-04,0.001170,,
diversified,median_balanced,within_sample,within_sample,RandomForest,5.529381,0.133804,3.203790,0.062676,0.440947,0.027075,1.744932e-01,0.025014,,
diversified,median_balanced,within_sample,within_sample,Ridge,6.543861,0.000000,5.351673,0.000000,0.217428,0.000000,4.478148e-05,0.000431,,


In [39]:
agg_df = (
    merged
    .groupby(['scenario_type', 'scenario', 'split', 'context', 'model'])
    .agg(
        RMSE_mean=('RMSE', 'mean'),
        RMSE_std=('RMSE', 'std'),
        MAE_mean=('MAE', 'mean'),
        MAE_std=('MAE', 'std'),
        R2_mean=('R2', 'mean'),
        R2_std=('R2', 'std'),
        Stability_mean=('Stability', 'mean'),
        Stability_std=('Stability', 'std'),
        TransferRatio_mean=('TransferRatio', 'mean'),
        TransferRatio_std=('TransferRatio', 'std')
    )
    .reset_index()
)

In [40]:
agg_df

Unnamed: 0,scenario_type,scenario,split,context,model,RMSE_mean,RMSE_std,MAE_mean,MAE_std,R2_mean,R2_std,Stability_mean,Stability_std,TransferRatio_mean,TransferRatio_std
0,baselines,constrained,region_holdout,europe_n_central_asia,MLP,13.050503,2.826647,10.648506,2.937194,-2.083765,1.394610,1.910552e-05,0.000206,0.938396,0.313625
1,baselines,constrained,region_holdout,europe_n_central_asia,RandomForest,6.538190,0.034804,5.062671,0.081441,0.259334,0.007893,3.243965e-02,0.007024,0.984382,0.018992
2,baselines,constrained,region_holdout,europe_n_central_asia,Ridge,7.074343,0.000000,5.761852,0.000000,0.132903,0.000000,-3.908724e-07,0.000106,0.950238,0.000000
3,baselines,constrained,region_holdout,europe_n_central_asia,XGBoost,6.783723,0.157804,5.249376,0.258356,0.202268,0.037221,2.196815e-02,0.006727,0.910356,0.034664
4,baselines,constrained,region_holdout,north_america,MLP,10.427466,1.380349,7.984294,1.004880,-1.355518,0.658738,-1.738065e-04,0.000592,1.145372,0.338502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,diversified,median_balanced,size_holdout,Small-Cap,XGBoost,9.091902,0.122919,6.638524,0.008282,-0.033124,0.027875,-1.251065e-02,0.008258,0.562431,0.020553
136,diversified,median_balanced,within_sample,within_sample,MLP,6.699609,0.672956,3.569660,0.263369,0.171788,0.170560,5.851265e-04,0.001170,,
137,diversified,median_balanced,within_sample,within_sample,RandomForest,5.529381,0.133804,3.203790,0.062676,0.440947,0.027075,1.744932e-01,0.025014,,
138,diversified,median_balanced,within_sample,within_sample,Ridge,6.543861,0.000000,5.351673,0.000000,0.217428,0.000000,4.478148e-05,0.000431,,


In [42]:
agg_df.to_csv('..\data\experimental\gg_experiment_results_3_agg.csv', index=False)