In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Helper Functions

In [2]:
def compute_metrics(y_true, y_pred):
    """Compute RMSE, MAE, and R2."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return rmse, mae, r2

In [3]:
def compute_stability(model, X, y, noise_level=0.01, n_trials=5):
    """
    Add Gaussian noise to numeric features and measure average relative RMSE change.
    noise_level is fraction of std-dev of each feature.
    """
    numeric_cols = X.select_dtypes(include=[np.number]).columns
    base_rmse = np.sqrt(mean_squared_error(y, model.predict(X)))
    rel_changes = []
    
    for _ in range(n_trials):
        Xp = X.copy()
        noise = np.random.normal(0, noise_level * Xp[numeric_cols].std(), 
                                 size=Xp[numeric_cols].shape)
        Xp[numeric_cols] += noise
        rp = model.predict(Xp)
        rmse_p = np.sqrt(mean_squared_error(y, rp))
        rel_changes.append((rmse_p - base_rmse) / base_rmse)
    
    # Return average relative change (lower = more stable)
    return np.mean(rel_changes)

# Model Definition

In [4]:
models = {
    'Ridge': Ridge(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, use_label_encoder=False, eval_metric='rmse', random_state=42),
    'MLP': MLPRegressor(hidden_layer_sizes=(64, 64),    # implemented a simple MLP with sklearn for easy compatibility with the rest of the code
                        activation='relu',
                        solver='adam',
                        max_iter=200,
                        random_state=42)
}

In [5]:
datasets = load('..\data\experimental\experiment_datasets.joblib')  # Load datasets from joblib file

# Manual Training

In [6]:
train_df = datasets['baselines']['full']['within_sample']['within_sample']['train']
test_df = datasets['baselines']['full']['within_sample']['within_sample']['test']

train_df.shape, test_df.shape

((291, 83), (73, 83))

In [7]:
X_train = train_df.drop(columns=['totalEsg'])
y_train = train_df['totalEsg']
X_test  = test_df.drop(columns=['totalEsg'])
y_test  = test_df['totalEsg']

In [8]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((291, 82), (291,), (73, 82), (73,))

In [9]:
model = models['Ridge']  # Choose the model you want to use

In [10]:
model

In [11]:
model.fit(X_train, y_train)

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


In [12]:
# predict
y_pred = model.predict(X_test)

In [13]:
# compute metrics
rmse, mae, r2 = compute_metrics(y_test, y_pred)
stability = compute_stability(model, X_test, y_test,
                            noise_level=0.01, n_trials=5)

In [14]:
manual_results = {
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2,
            'Stability': stability
        }

In [15]:
pd.DataFrame(manual_results, index=[0])

Unnamed: 0,RMSE,MAE,R2,Stability
0,44.795695,15.571981,-32.245542,0.000953


# Iterative Training

In [7]:
datasets['diversified']['median_balanced']['region_holdout']['east_asia_n_pacific']['train'].head(5)

Unnamed: 0,Issuance Of Capital Stock,marketCap,price_to_earnings,Purchase Of Business,Share Issued,Change In Payable,Change In Inventory,Sale Of Business,Gains Losses Not Affecting Retained Earnings,Cash Flow From Continuing Financing Activities,...,region_north_america,region_south_asia,region_sub_saharan_africa,companySize_Large-Cap,companySize_Mid-Cap,companySize_Small-Cap,sentiment_ratio_all,sentiment_ratio_extremes,weighted_sentiment_avg,weighted_sentiment_norm_conf
0,0.687094,-0.228332,0.052014,-0.036783,0.167647,0.451035,0.088759,0.327382,-0.001519,0.245693,...,1.0,0.0,0.0,1.0,0.0,0.0,0.5,1.0,0.336655,0.448924
1,5.961003,-0.068157,-0.201522,0.105802,0.472271,-19.300383,-0.131484,-2.791721,-0.174742,7.729399,...,1.0,0.0,0.0,1.0,0.0,0.0,0.16129,0.555556,0.146451,0.189129
2,0.099041,-0.132534,-0.049257,-0.312445,0.476636,0.060062,-0.131484,-2.791721,-0.002753,1.394286,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,12.522439,-0.066962,-0.000455,-0.064886,0.010446,-0.418846,0.062938,0.040166,-0.017643,-0.214733,...,1.0,0.0,0.0,1.0,0.0,0.0,0.459459,1.0,0.348449,0.458072
4,0.810894,-0.027947,-0.212082,-0.474454,1.014458,-4.430383,1.15548,0.401591,-0.017643,-0.214733,...,1.0,0.0,0.0,1.0,0.0,0.0,-0.487179,-0.655172,-0.344822,-0.515513


In [25]:
datasets['baselines']['full']['region_holdout']['europe_n_central_asia']['train'].shape

(275, 83)

In [23]:
datasets['baselines']['full']['region_holdout']['europe_n_central_asia'].keys()

dict_keys(['train', 'test', 'train_size', 'test_size'])

In [30]:
results = []

In [31]:
%%time
for scenario_type, scenario_dict in datasets.items():        # e.g., 'baselines', 'diversified'
    for scenario_name, splits in scenario_dict.items():      # e.g., 'full', 'constrained', 'max_balanced', etc.
        for split_type, data_group in splits.items():        # 'within_sample', 'region_holdout', 'size_holdout'
            if split_type == 'original_data':
                continue  # Skip original data entries
            
            if split_type == 'region_holdout':
                continue # TEMP TO CHECK SMTH

            # For within-sample, we have one group; for others, multiple contexts
            # contexts = {'within_sample': data_group} if split_type == 'within_sample' else data_group

            # for context_name, data in contexts.items():
            #     print(f"Scenario: {scenario_type}, {scenario_name}, Split: {split_type}, Context: {context_name}")

            # Handle different split types
            if split_type == 'within_sample':
                # For within_sample, data structure is one level deeper
                data = data_group['within_sample']
                print(f"Scenario: {scenario_type}, {scenario_name}, Split: {split_type}")
                if 'train' not in data or 'test' not in data:
                    continue
                    
                train_df = data['train']
                test_df = data['test']

                print(train_df.shape, test_df.shape)

                # Separate features and target
                X_train = train_df.drop(columns=['totalEsg'])
                y_train = train_df['totalEsg']
                X_test  = test_df.drop(columns=['totalEsg'])
                y_test  = test_df['totalEsg']
                
                # Train and evaluate models
                for model_name, model in models.items():
                    # train
                    model.fit(X_train, y_train)

                    # predict
                    y_pred = model.predict(X_test)

                    # compute metrics
                    rmse, mae, r2 = compute_metrics(y_test, y_pred)
                    stability = compute_stability(model, X_test, y_test,
                                                noise_level=0.01, n_trials=5)
                    
                    results.append({
                        'scenario_type': scenario_type,
                        'scenario': scenario_name,
                        'split': split_type,
                        'context': 'within_sample',
                        'model': model_name,
                        'RMSE': rmse,
                        'MAE': mae,
                        'R2': r2,
                        'Stability': stability
                    })
            
            else:
                # For region_holdout and size_holdout, process each context
                for context_name, data in data_group.items():
                    print(f"Scenario: {scenario_type}, {scenario_name}, Split: {split_type}, Context: {context_name}")
                    if 'train' not in data or 'test' not in data:
                        continue
                        
                    train_df = data['train']
                    test_df = data['test']

                    print(train_df.shape, test_df.shape)

                    # Separate features and target
                    X_train = train_df.drop(columns=['totalEsg'])
                    y_train = train_df['totalEsg']
                    X_test  = test_df.drop(columns=['totalEsg'])
                    y_test  = test_df['totalEsg']
                    
                    # Train and evaluate models
                    for model_name, model in models.items():
                        # train
                        model.fit(X_train, y_train)

                        # predict
                        y_pred = model.predict(X_test)

                        # compute metrics
                        rmse, mae, r2 = compute_metrics(y_test, y_pred)
                        stability = compute_stability(model, X_test, y_test,
                                                  noise_level=0.01, n_trials=5)
                        
                        results.append({
                            'scenario_type': scenario_type,
                            'scenario': scenario_name,
                            'split': split_type,
                            'context': context_name,
                            'model': model_name,
                            'RMSE': rmse,
                            'MAE': mae,
                            'R2': r2,
                            'Stability': stability
                        })

results_df = pd.DataFrame(results)

Scenario: baselines, full, Split: within_sample
(291, 83) (73, 83)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Scenario: baselines, full, Split: size_holdout, Context: Large-Cap
(251, 83) (113, 83)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Scenario: baselines, full, Split: size_holdout, Context: Mid-Cap
(249, 83) (115, 83)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Scenario: baselines, full, Split: size_holdout, Context: Small-Cap
(228, 83) (136, 83)


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Scenario: baselines, constrained, Split: within_sample
(48, 83) (13, 83)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Scenario: baselines, constrained, Split: size_holdout, Context: Large-Cap
(0, 83) (61, 83)


ValueError: Found array with 0 sample(s) (shape=(0, 82)) while a minimum of 1 is required by Ridge.

In [32]:
pd.DataFrame(results)

Unnamed: 0,scenario_type,scenario,split,context,model,RMSE,MAE,R2,Stability
0,baselines,full,within_sample,within_sample,Ridge,44.7957,15.57198,-32.24554,-0.000373
1,baselines,full,within_sample,within_sample,RandomForest,6.850229,5.258253,0.2225525,0.042465
2,baselines,full,within_sample,within_sample,XGBoost,6.895373,4.74277,0.2122718,0.116201
3,baselines,full,within_sample,within_sample,MLP,281718300.0,45344380.0,-1314895000000000.0,0.000187
4,baselines,full,size_holdout,Large-Cap,Ridge,181.1717,37.03211,-479.2172,0.000313
5,baselines,full,size_holdout,Large-Cap,RandomForest,6.867475,5.280148,0.3099963,0.11856
6,baselines,full,size_holdout,Large-Cap,XGBoost,6.786575,4.822111,0.3261574,0.150631
7,baselines,full,size_holdout,Large-Cap,MLP,35426360000.0,3625109000.0,-1.836159e+19,6.4e-05
8,baselines,full,size_holdout,Mid-Cap,Ridge,82.17587,21.69888,-113.668,-0.00082
9,baselines,full,size_holdout,Mid-Cap,RandomForest,6.384345,5.021419,0.3078721,0.198161
