In [4]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import csv

np.random.seed(42)
GAP_THRESHOLD = 0.15

# Read pattern data
pattern = []
with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
    fhd_csv = csv.reader(fhd)
    for line in fhd_csv:
        pattern.append(line)
pattern = np.array(pattern, dtype='float64')

# Preprocess pattern data
pattern = np.where(np.isinf(pattern), np.nan, pattern)
pattern = np.nan_to_num(pattern, nan=np.nanmean(pattern) if not np.isnan(np.nanmean(pattern)) else 0)

min_vals = np.min(pattern, axis=0)
max_vals = np.max(pattern, axis=0)
range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
pattern_normalized = (pattern - min_vals) / range_vals

scaler = StandardScaler()
pattern_scaled = scaler.fit_transform(pattern_normalized)

# Read label data
label_data = []
with open('output-IMS10.csv', 'r', encoding='utf-8-sig') as fhl:
    fhl_csv = csv.reader(fhl)
    for line in fhl_csv:
        label_data.append(line)
label_data = np.array(label_data, dtype='float64')

groups = label_data[:, 0]
label_c = label_data[:, 1]
label_c = np.exp(label_c)

unique_groups = np.unique(groups)

# Define evaluation metrics functions
def mean_relative_error(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return 0.0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mre = mean_relative_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'mre': mre,
        'r2': r2
    }

# Set up robustness test parameters
test_random_states = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
robustness_results = []

best_params = {
    'n_estimators': 50,
    'max_depth': 20,
    'min_samples_split': 10,
    'min_samples_leaf': 2,
    'max_features': 'sqrt'
}

# Run robustness test with different random states (no real-time output)
for rs in test_random_states:
    # Split train/test groups
    train_groups, test_groups = train_test_split(
        unique_groups,
        test_size=99/491,
        random_state=rs
    )
    train_mask = np.isin(groups, train_groups)
    test_mask = np.isin(groups, test_groups)
    
    # Prepare training and testing data
    X_train_raw = pattern_scaled[train_mask]
    y_train = label_c[train_mask]
    X_test_raw = pattern_scaled[test_mask]
    y_test = label_c[test_mask]
    
    # PCA dimensionality reduction
    pca = PCA(n_components=12)
    X_train_pca_all = pca.fit_transform(X_train_raw)
    X_test_pca_all = pca.transform(X_test_raw)
    X_train_pca = X_train_pca_all[:, :6]
    X_test_pca = X_test_pca_all[:, :6]
    
    # Split training set into train/validation
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_train_pca, y_train, 
        test_size=0.2, 
        random_state=rs
    )
    
    # Train Random Forest model
    model = RandomForestRegressor(
        **best_params,
        bootstrap=True,
        random_state=rs,
        n_jobs=-1
    )
    model.fit(X_tr, y_tr)
    
    # Predict and calculate metrics
    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_train_pca)
    y_test_pred = model.predict(X_test_pca)
    
    val_metrics = calculate_metrics(y_val, y_val_pred)
    train_metrics = calculate_metrics(y_train, y_train_pred)
    test_metrics = calculate_metrics(y_test, y_test_pred)
    
    # Record results (only keep core R2 fields)
    result = {
        'random_state': rs,
        'val_r2': round(val_metrics['r2'], 6),
        'test_r2': round(test_metrics['r2'], 6),
        'train_r2': round(train_metrics['r2'], 6)
    }
    robustness_results.append(result)

# Convert results to DataFrame for display and export
robustness_df = pd.DataFrame(robustness_results)

# Display summary results in Jupyter Lab
print("===================== Robustness Validation Summary =====================")
display(robustness_df)

# Calculate and display statistical metrics
val_r2_mean = robustness_df['val_r2'].mean()
val_r2_std = robustness_df['val_r2'].std()
test_r2_mean = robustness_df['test_r2'].mean()
test_r2_std = robustness_df['test_r2'].std()

print(f"\n--- Robustness Statistical Metrics ---")
print(f"Validation R2 Mean: {val_r2_mean:.6f} | Standard Deviation: {val_r2_std:.6f}")
print(f"Test R2 Mean: {test_r2_mean:.6f} | Standard Deviation: {test_r2_std:.6f}")
print(f"(Smaller standard deviation indicates better robustness to random_state changes)")

# Export results to Excel
try:
    with pd.ExcelWriter('RF_PCA_6D_Results-IMS-Robustness.xlsx', engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
        robustness_df.to_excel(writer, sheet_name='Robustness_Various_Random_State', index=False)
except FileNotFoundError:
    with pd.ExcelWriter('RF_PCA_6D_Results-IMS-Robustness.xlsx', engine='openpyxl') as writer:
        robustness_df.to_excel(writer, sheet_name='Robustness_Various_Random_State', index=False)

print(f"\nüìÅ Robustness validation results saved to: RF_PCA_6D_Results-IMS-Robustness.xlsx")
print("===================== Robustness Validation Completed =====================")

# Display key statistical metrics as a DataFrame for better visualization
stats_df = pd.DataFrame({
    'Metric': ['Validation R2 Mean', 'Validation R2 Std', 'Test R2 Mean', 'Test R2 Std'],
    'Value': [val_r2_mean, val_r2_std, test_r2_mean, test_r2_std]
})
display(stats_df)



Unnamed: 0,random_state,val_r2,test_r2,train_r2
0,10,0.815974,0.629834,0.876209
1,20,0.705361,0.675637,0.867968
2,30,0.778673,0.693342,0.870134
3,40,0.717716,0.725722,0.851738
4,50,0.719767,0.768871,0.863365
5,60,0.771174,0.809033,0.858584
6,70,0.746505,0.707226,0.862561
7,80,0.750748,0.756815,0.847513
8,90,0.72588,0.613635,0.870202
9,100,0.759172,0.78066,0.866925



--- Robustness Statistical Metrics ---
Validation R2 Mean: 0.749097 | Standard Deviation: 0.033735
Test R2 Mean: 0.716078 | Standard Deviation: 0.064562
(Smaller standard deviation indicates better robustness to random_state changes)

üìÅ Robustness validation results saved to: RF_PCA_6D_Results-IMS-Robustness.xlsx


Unnamed: 0,Metric,Value
0,Validation R2 Mean,0.749097
1,Validation R2 Std,0.033735
2,Test R2 Mean,0.716078
3,Test R2 Std,0.064562
