In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import csv

GAP_THRESHOLD = 0.15
TEST_RANDOM_STATES = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

def mean_relative_error(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return 0.0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mre = mean_relative_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'mre': mre,
        'r2': r2
    }

def run_model_with_seed(random_state):
    # Read pattern data
    pattern = []
    with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
        fhd_csv = csv.reader(fhd)
        for line in fhd_csv:
            pattern.append(line)
    pattern = np.array(pattern, dtype='float64')

    # Preprocess pattern data
    pattern = np.where(np.isinf(pattern), np.nan, pattern)
    pattern = np.nan_to_num(pattern, nan=np.nanmean(pattern) if not np.isnan(np.nanmean(pattern)) else 0)
    min_vals = np.min(pattern, axis=0)
    max_vals = np.max(pattern, axis=0)
    range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
    pattern_normalized = (pattern - min_vals) / range_vals
    scaler = StandardScaler()
    pattern_scaled = scaler.fit_transform(pattern_normalized)

    # Read label data
    label_data = []
    with open('4-label_r1.csv', 'r', encoding='utf-8-sig') as fhl:
        fhl_csv = csv.reader(fhl)
        for line in fhl_csv:
            label_data.append(line)
    label_data = np.array(label_data, dtype='float64')
    groups = label_data[:, 0]
    label_c = label_data[:, 1]
    label_c = np.exp(label_c)

    # Split train/test groups by Group
    unique_groups = np.unique(groups)
    train_groups, test_groups = train_test_split(
        unique_groups,
        test_size=99/491,
        random_state=random_state
    )
    train_mask = np.isin(groups, train_groups)
    test_mask = np.isin(groups, test_groups)
    X_train_raw = pattern_scaled[train_mask]
    y_train = label_c[train_mask]
    X_test_raw = pattern_scaled[test_mask]
    y_test = label_c[test_mask]

    # PCA dimensionality reduction (first 4 components)
    pca = PCA(n_components=12)
    X_train_pca_all = pca.fit_transform(X_train_raw)
    X_test_pca_all = pca.transform(X_test_raw)
    X_train_pca = X_train_pca_all[:, :4]
    X_test_pca = X_test_pca_all[:, :4]

    # Model training and evaluation
    best_params = {
        'n_estimators': 100,
        'max_depth': 20,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_features': 'sqrt'
    }
    model = RandomForestRegressor(
        **best_params,
        bootstrap=True,
        random_state=random_state,
        n_jobs=-1
    )
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_pca, y_train, 
        test_size=0.2, 
        random_state=random_state
    )
    model.fit(X_tr, y_tr)

    # Calculate predictions and metrics
    y_val_pred = model.predict(X_val)
    y_test_pred = model.predict(X_test_pca)
    val_metrics = calculate_metrics(y_val, y_val_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)

    return {
        'random_state': random_state,
        'val_r2': round(val_metrics['r2'], 6),
        'test_r2': round(test_metrics['r2'], 6),
        'val_mae': round(val_metrics['mae'], 6),
        'test_mae': round(test_metrics['mae'], 6)
    }

# Main process: Robustness validation only
robustness_results = []
for rs in TEST_RANDOM_STATES:
    res = run_model_with_seed(rs)
    robustness_results.append(res)

# Convert to DataFrame for display and export
robustness_df = pd.DataFrame(robustness_results)

# Export robustness results to Excel
with pd.ExcelWriter('RF_PCA_4D_Results-GI-Robustness.xlsx', engine='openpyxl') as writer:
    robustness_df.to_excel(writer, sheet_name='Robustness_Validation_Results', index=False)

# Display results in Jupyter Lab
print("===================== Random Forest Robustness Validation Summary =====================")
display(robustness_df)

# Calculate and display statistical metrics for robustness
val_r2_mean = robustness_df['val_r2'].mean()
val_r2_std = robustness_df['val_r2'].std()
test_r2_mean = robustness_df['test_r2'].mean()
test_r2_std = robustness_df['test_r2'].std()
val_mae_mean = robustness_df['val_mae'].mean()
val_mae_std = robustness_df['val_mae'].std()
test_mae_mean = robustness_df['test_mae'].mean()
test_mae_std = robustness_df['test_mae'].std()

print(f"\n--- Robustness Statistical Metrics ---")
print(f"Validation R2 Mean: {val_r2_mean:.6f} | Standard Deviation: {val_r2_std:.6f}")
print(f"Test R2 Mean: {test_r2_mean:.6f} | Standard Deviation: {test_r2_std:.6f}")
print(f"Validation MAE Mean: {val_mae_mean:.6f} | Standard Deviation: {val_mae_std:.6f}")
print(f"Test MAE Mean: {test_mae_mean:.6f} | Standard Deviation: {test_mae_std:.6f}")
print(f"(Smaller standard deviation indicates better model robustness)")

print(f"\nüìÅ Robustness results exported to: RF_PCA_4D_Results-GI-Robustness.xlsx")
print("===================== Random Forest Robustness Validation Completed =====================")

# Display key robustness stats as DataFrame
stats_df = pd.DataFrame({
    'Metric': ['Validation R2 Mean', 'Validation R2 Std', 'Test R2 Mean', 'Test R2 Std',
               'Validation MAE Mean', 'Validation MAE Std', 'Test MAE Mean', 'Test MAE Std'],
    'Value': [val_r2_mean, val_r2_std, test_r2_mean, test_r2_std,
              val_mae_mean, val_mae_std, test_mae_mean, test_mae_std]
})
display(stats_df)



Unnamed: 0,random_state,val_r2,test_r2,val_mae,test_mae
0,10,0.93794,0.883103,0.058283,0.065497
1,20,0.908978,0.900709,0.062329,0.06595
2,30,0.936892,0.904339,0.056324,0.063396
3,40,0.906355,0.856252,0.066156,0.084432
4,50,0.849794,0.93447,0.070732,0.069438
5,60,0.902731,0.915833,0.074036,0.069422
6,70,0.894144,0.887367,0.077928,0.059218
7,80,0.910916,0.882542,0.064031,0.075162
8,90,0.861724,0.892518,0.064849,0.080163
9,100,0.87903,0.897525,0.066253,0.069217



--- Robustness Statistical Metrics ---
Validation R2 Mean: 0.898850 | Standard Deviation: 0.028857
Test R2 Mean: 0.895466 | Standard Deviation: 0.021025
Validation MAE Mean: 0.066092 | Standard Deviation: 0.006676
Test MAE Mean: 0.070189 | Standard Deviation: 0.007715
(Smaller standard deviation indicates better model robustness)

üìÅ Robustness results exported to: RF_PCA_4D_Results-GI-Robustness.xlsx


Unnamed: 0,Metric,Value
0,Validation R2 Mean,0.89885
1,Validation R2 Std,0.028857
2,Test R2 Mean,0.895466
3,Test R2 Std,0.021025
4,Validation MAE Mean,0.066092
5,Validation MAE Std,0.006676
6,Test MAE Mean,0.070189
7,Test MAE Std,0.007715
