In [1]:
import numpy as np
from sklearn.decomposition import PCA
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import csv
import random
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
random.seed(42)

pattern = []
with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
    fhd_csv = csv.reader(fhd)
    for line in fhd_csv:
        pattern.append(line)
pattern = np.array(pattern, dtype='float64')

pattern = np.where(np.isinf(pattern), np.nan, pattern)
mean_val = np.nanmean(pattern) if not np.isnan(np.nanmean(pattern)) else 0
pattern = np.nan_to_num(pattern, nan=mean_val)

min_vals = np.min(pattern, axis=0)
max_vals = np.max(pattern, axis=0)
range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
pattern_normalized = (pattern - min_vals) / range_vals

scaler = StandardScaler()
pattern_scaled = scaler.fit_transform(pattern_normalized)

label_data = []
with open('output-IMS10.csv', 'r', encoding='utf-8-sig') as fhl:
    fhl_csv = csv.reader(fhl)
    for line in fhl_csv:
        label_data.append(line)
label_data = np.array(label_data, dtype='float64')

groups = label_data[:, 0]
label_c = label_data[:, 1]
label_c = np.exp(label_c)

unique_groups = np.unique(groups)
train_groups, test_groups = train_test_split(
    unique_groups,
    test_size=99/491,
    random_state=42
)
train_mask = np.isin(groups, train_groups)
test_mask = np.isin(groups, test_groups)

X_train_raw = pattern_scaled[train_mask]
y_train = label_c[train_mask]
X_test_raw = pattern_scaled[test_mask]
y_test = label_c[test_mask]

pca = PCA(n_components=12)
X_train_pca_all = pca.fit_transform(X_train_raw)
X_test_pca_all = pca.transform(X_test_raw)

param_space = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

TRIALS_PER_PCA = 5
GAP_THRESHOLD = 0.15
results_list = []
all_pred_results = []

def build_reg_model(params):
    model = XGBRegressor(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1
    )
    return model

def mean_relative_error(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return 0.0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mre = mean_relative_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {
        'mae': mae,
        'mse': mse,
        'rmse': rmse,
        'mre': mre,
        'r2': r2
    }

for n_components in range(1, 13):
    X_train_pca = X_train_pca_all[:, :n_components]
    X_test_pca = X_test_pca_all[:, :n_components]
    
    trial_results = []
    
    for trial in range(TRIALS_PER_PCA):
        current_params = {
            'n_estimators': random.choice(param_space['n_estimators']),
            'max_depth': random.choice(param_space['max_depth']),
            'learning_rate': random.choice(param_space['learning_rate']),
            'subsample': random.choice(param_space['subsample']),
            'colsample_bytree': random.choice(param_space['colsample_bytree'])
        }
        
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train_pca, y_train, 
            test_size=0.2, 
            random_state=42
        )
        
        model = build_reg_model(current_params)
        model.fit(X_tr, y_tr)
        
        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)
        y_train_pred = model.predict(X_train_pca)
        y_test_pred = model.predict(X_test_pca)
        
        tr_metrics = calculate_metrics(y_tr, y_tr_pred)
        val_metrics = calculate_metrics(y_val, y_val_pred)
        train_metrics = calculate_metrics(y_train, y_train_pred)
        test_metrics = calculate_metrics(y_test, y_test_pred)
        
        train_val_gap = abs(train_metrics['r2'] - val_metrics['r2'])
        train_test_gap = abs(train_metrics['r2'] - test_metrics['r2'])
        is_valid = (train_val_gap < GAP_THRESHOLD) and (train_test_gap < GAP_THRESHOLD)
        
        trial_results.append({
            'params': current_params,
            'tr_r2': tr_metrics['r2'],
            'tr_mae': tr_metrics['mae'],
            'tr_mse': tr_metrics['mse'],
            'tr_mre': tr_metrics['mre'],
            'tr_rmse': tr_metrics['rmse'],
            'val_r2': val_metrics['r2'],
            'val_mae': val_metrics['mae'],
            'val_mse': val_metrics['mse'],
            'val_mre': val_metrics['mre'],
            'val_rmse': val_metrics['rmse'],
            'train_r2': train_metrics['r2'],
            'train_mae': train_metrics['mae'],
            'train_mse': train_metrics['mse'],
            'train_mre': train_metrics['mre'],
            'train_rmse': train_metrics['rmse'],
            'test_r2': test_metrics['r2'],
            'test_mae': test_metrics['mae'],
            'test_mse': test_metrics['mse'],
            'test_mre': test_metrics['mre'],
            'test_rmse': test_metrics['rmse'],
            'train_val_gap': train_val_gap,
            'train_test_gap': train_test_gap,
            'is_valid': is_valid
        })
    
    valid_trials = [t for t in trial_results if t['is_valid']]
    
    if valid_trials:
        valid_trials_sorted = sorted(valid_trials, key=lambda x: x['test_r2'], reverse=True)
        best_trial = valid_trials_sorted[0]
    else:
        trial_results_sorted = sorted(trial_results, key=lambda x: x['train_val_gap'] + x['train_test_gap'])
        best_trial = trial_results_sorted[0]
    
    best_params = best_trial['params']
    best_train_r2 = best_trial['train_r2']
    best_train_mae = best_trial['train_mae']
    best_train_mse = best_trial['train_mse']
    best_train_mre = best_trial['train_mre']
    best_train_rmse = best_trial['train_rmse']
    best_val_r2 = best_trial['val_r2']
    best_val_mae = best_trial['val_mae']
    best_val_mse = best_trial['val_mse']
    best_val_mre = best_trial['val_mre']
    best_val_rmse = best_trial['val_rmse']
    best_test_r2 = best_trial['test_r2']
    best_test_mae = best_trial['test_mae']
    best_test_mse = best_trial['test_mse']
    best_test_mre = best_trial['test_mre']
    best_test_rmse = best_trial['test_rmse']
    best_train_val_gap = best_trial['train_val_gap']
    best_train_test_gap = best_trial['train_test_gap']
    is_valid = best_trial['is_valid']
    
    results_list.append({
        "PCA_Components": n_components,
        "Best_Params": str(best_params),
        "Train_R2": round(best_train_r2, 6),
        "Train_MAE": round(best_train_mae, 6),
        "Train_MSE": round(best_train_mse, 6),
        "Train_MRE": round(best_train_mre, 6),
        "Train_RMSE": round(best_train_rmse, 6),
        "Val_R2": round(best_val_r2, 6),
        "Val_MAE": round(best_val_mae, 6),
        "Val_MSE": round(best_val_mse, 6),
        "Val_MRE": round(best_val_mre, 6),
        "Val_RMSE": round(best_val_rmse, 6),
        "Test_R2": round(best_test_r2, 6),
        "Test_MAE": round(best_test_mae, 6),
        "Test_MSE": round(best_test_mse, 6),
        "Test_MRE": round(best_test_mre, 6),
        "Test_RMSE": round(best_test_rmse, 6),
        "Train_Val_Gap": round(best_train_val_gap, 6),
        "Train_Test_Gap": round(best_train_test_gap, 6),
        "Is_Valid": is_valid
    })
    
    X_tr, X_val, y_tr, y_val, tr_idx_in_train, val_idx_in_train = train_test_split(
        X_train_pca, y_train, np.arange(len(X_train_pca)),
        test_size=0.2, random_state=42
    )
    val_model = build_reg_model(best_params)
    val_model.fit(X_tr, y_tr)
    y_val_pred = val_model.predict(X_val)
    
    train_global_idx = np.where(train_mask)[0]
    val_global_idx = train_global_idx[val_idx_in_train]
    
    final_model = build_reg_model(best_params)
    final_model.fit(X_train_pca, y_train)
    
    y_train_pred = final_model.predict(X_train_pca)
    y_test_pred = final_model.predict(X_test_pca)
    
    pca_pred = np.full(len(pattern_scaled), np.nan)
    pca_pred[train_mask] = y_train_pred
    pca_pred[test_mask] = y_test_pred
    pca_pred[val_global_idx] = y_val_pred
    
    for idx in range(len(pattern_scaled)):
        if idx in val_global_idx:
            data_set = "Validation Set"
        elif groups[idx] in train_groups:
            data_set = "Training Set"
        else:
            data_set = "Test Set"
        all_pred_results.append({
            "Original_Index": idx + 1,
            "PCA_Dimensions": n_components,
            "Group": groups[idx],
            "True_Label": label_c[idx],
            "Dataset_Type": data_set,
            "Predicted_Label": pca_pred[idx]
        })

with pd.ExcelWriter('XGB_PCA_12D_Tuning_Results-IMS.xlsx', engine='openpyxl') as writer:
    results_df = pd.DataFrame(results_list)
    results_df.to_excel(writer, sheet_name='PCA_Dimension_Performance', index=False)
    
    pred_df = pd.DataFrame(all_pred_results)
    pred_df = pred_df.sort_values(by=['Original_Index', 'PCA_Dimensions'])
    pred_df.to_excel(writer, sheet_name='All_Dimension_Predictions', index=False)

print("\n===================== Summary of All PCA Dimension Tuning Results =====================")
summary_cols = ['PCA_Components',
                'Train_R2', 'Train_MAE', 'Train_MSE', 'Train_MRE', 'Train_RMSE',
                'Val_R2', 'Val_MAE', 'Val_MSE', 'Val_MRE', 'Val_RMSE',
                'Test_R2', 'Test_MAE', 'Test_MSE', 'Test_MRE', 'Test_RMSE',
                'Train_Val_Gap', 'Train_Test_Gap', 'Is_Valid']
print(results_df[summary_cols])

valid_results = results_df[results_df['Is_Valid'] == True]
if not valid_results.empty:
    best_pca_dim = valid_results.loc[valid_results['Test_R2'].idxmax()]
    print(f"\n‚ú® Optimal PCA dimension meeting gap criteria: {best_pca_dim['PCA_Components']}")
else:
    best_pca_dim = results_df.loc[results_df['Test_R2'].idxmax()]
    print(f"\n‚ö†Ô∏è No dimension meets gap criteria, selecting dimension with highest Test R¬≤: {best_pca_dim['PCA_Components']}")

print(f"\n„ÄêComplete Metrics of Optimal Dimension„Äë")
print(f"[Training Set] R¬≤: {best_pca_dim['Train_R2']:.6f} | MAE: {best_pca_dim['Train_MAE']:.6f} | MSE: {best_pca_dim['Train_MSE']:.6f} | MRE: {best_pca_dim['Train_MRE']:.6f} | RMSE: {best_pca_dim['Train_RMSE']:.6f}")
print(f"[Validation Set] R¬≤: {best_pca_dim['Val_R2']:.6f} | MAE: {best_pca_dim['Val_MAE']:.6f} | MSE: {best_pca_dim['Val_MSE']:.6f} | MRE: {best_pca_dim['Val_MRE']:.6f} | RMSE: {best_pca_dim['Val_RMSE']:.6f}")
print(f"[Test Set] R¬≤: {best_pca_dim['Test_R2']:.6f} | MAE: {best_pca_dim['Test_MAE']:.6f} | MSE: {best_pca_dim['Test_MSE']:.6f} | MRE: {best_pca_dim['Test_MRE']:.6f} | RMSE: {best_pca_dim['Test_RMSE']:.6f}")
print(f"Train-Validation Gap: {best_pca_dim['Train_Val_Gap']:.6f} | Train-Test Gap: {best_pca_dim['Train_Test_Gap']:.6f} | Is Valid: {best_pca_dim['Is_Valid']}")
print(f"   Optimal Parameters: {best_pca_dim['Best_Params']}")
print(f"\nüìÅ All results exported to: XGB_PCA_12D_Tuning_Results-IMS.xlsx")


    PCA_Components  Train_R2  Train_MAE  Train_MSE  Train_MRE  Train_RMSE  \
0                1  0.154337   0.063283   0.006201   0.067395    0.078744   
1                2  0.419905   0.051988   0.004253   0.055203    0.065218   
2                3  0.918197   0.017267   0.000600   0.017912    0.024491   
3                4  0.610276   0.042219   0.002858   0.044785    0.053456   
4                5  0.895974   0.019953   0.000763   0.020724    0.027618   
5                6  0.925513   0.016663   0.000546   0.017363    0.023370   
6                7  0.931682   0.015679   0.000501   0.016280    0.022381   
7                8  0.735737   0.034325   0.001938   0.036282    0.044019   
8                9  0.478582   0.049275   0.003823   0.052201    0.061832   
9               10  0.402143   0.053124   0.004384   0.056382    0.066209   
10              11  0.586119   0.043670   0.003035   0.046297    0.055088   
11              12  0.856519   0.024748   0.001052   0.025945    0.032435  