In [1]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import csv
import random
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)
random.seed(42)

pattern = []
with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
    fhd_csv = csv.reader(fhd)
    for line in fhd_csv:
        pattern.append(line)
pattern = np.array(pattern, dtype='float64')

pattern = np.where(np.isinf(pattern), np.nan, pattern)
mean_val = np.nanmean(pattern) if not np.isnan(np.nanmean(pattern)) else 0
pattern = np.nan_to_num(pattern, nan=mean_val)

min_vals = np.min(pattern, axis=0)
max_vals = np.max(pattern, axis=0)
range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
pattern_normalized = (pattern - min_vals) / range_vals

scaler = StandardScaler()
pattern_scaled = scaler.fit_transform(pattern_normalized)

label_data = []
with open('output-IMS10.csv', 'r', encoding='utf-8-sig') as fhl:
    fhl_csv = csv.reader(fhl)
    for line in fhl_csv:
        label_data.append(line)
label_data = np.array(label_data, dtype='float64')

groups = label_data[:, 0]
label_c = label_data[:, 1]
label_c = np.exp(label_c)

unique_groups = np.unique(groups)
train_groups, test_groups = train_test_split(
    unique_groups,
    test_size=99/491,
    random_state=42
)
train_mask = np.isin(groups, train_groups)
test_mask = np.isin(groups, test_groups)

X_train_raw = pattern_scaled[train_mask]
y_train = label_c[train_mask]
X_test_raw = pattern_scaled[test_mask]
y_test = label_c[test_mask]

pca = PCA(n_components=12)
X_train_pca_all = pca.fit_transform(X_train_raw)
X_test_pca_all = pca.transform(X_test_raw)

param_space = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [200, 500, 1000]
}

TRIALS_PER_PCA = 5
GAP_THRESHOLD = 0.15
results_list = []
all_pred_results = []

def build_reg_model(params):
    model = MLPRegressor(
        hidden_layer_sizes=params['hidden_layer_sizes'],
        activation=params['activation'],
        solver=params['solver'],
        alpha=params['alpha'],
        learning_rate=params['learning_rate'],
        max_iter=params['max_iter'],
        random_state=42,
        verbose=False,
        early_stopping=True,
        n_iter_no_change=10
    )
    return model

def mean_relative_error(y_true, y_pred):
    mask = y_true != 0
    if np.sum(mask) == 0:
        return 0.0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask]))

for n_components in range(1, 13):
    X_train_pca = X_train_pca_all[:, :n_components]
    X_test_pca = X_test_pca_all[:, :n_components]
    
    trial_results = []
    
    for trial in range(TRIALS_PER_PCA):
        current_params = {
            'hidden_layer_sizes': random.choice(param_space['hidden_layer_sizes']),
            'activation': random.choice(param_space['activation']),
            'solver': random.choice(param_space['solver']),
            'alpha': random.choice(param_space['alpha']),
            'learning_rate': random.choice(param_space['learning_rate']),
            'max_iter': random.choice(param_space['max_iter'])
        }
        
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train_pca, y_train, 
            test_size=0.2, 
            random_state=42
        )
        
        model = build_reg_model(current_params)
        model.fit(X_tr, y_tr)
        
        y_tr_pred = model.predict(X_tr)
        y_val_pred = model.predict(X_val)
        y_train_pred = model.predict(X_train_pca)
        y_test_pred = model.predict(X_test_pca)
        
        tr_r2 = r2_score(y_tr, y_tr_pred)
        val_r2 = r2_score(y_val, y_val_pred)
        train_r2 = r2_score(y_train, y_train_pred)
        test_r2 = r2_score(y_test, y_test_pred)
        
        train_val_gap = abs(train_r2 - val_r2)
        train_test_gap = abs(train_r2 - test_r2)
        
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_rmse = np.sqrt(train_mse)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_mre = mean_relative_error(y_train, y_train_pred)
        
        val_mse = mean_squared_error(y_val, y_val_pred)
        val_rmse = np.sqrt(val_mse)
        val_mae = mean_absolute_error(y_val, y_val_pred)
        val_mre = mean_relative_error(y_val, y_val_pred)
        
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_rmse = np.sqrt(test_mse)
        test_mae = mean_absolute_error(y_test, y_test_pred)
        test_mre = mean_relative_error(y_test, y_test_pred)
        
        trial_results.append({
            'params': current_params,
            'tr_r2': tr_r2,
            'val_r2': val_r2,
            'train_r2': train_r2,
            'test_r2': test_r2,
            'train_val_gap': train_val_gap,
            'train_test_gap': train_test_gap,
            'train_mse': train_mse,
            'train_rmse': train_rmse,
            'train_mae': train_mae,
            'train_mre': train_mre,
            'val_mse': val_mse,
            'val_rmse': val_rmse,
            'val_mae': val_mae,
            'val_mre': val_mre,
            'test_mse': test_mse,
            'test_rmse': test_rmse,
            'test_mae': test_mae,
            'test_mre': test_mre,
            'is_valid': (train_val_gap < GAP_THRESHOLD) and (train_test_gap < GAP_THRESHOLD)
        })
    
    valid_trials = [t for t in trial_results if t['is_valid']]
    
    if valid_trials:
        valid_trials_sorted = sorted(valid_trials, key=lambda x: x['test_r2'], reverse=True)
        best_trial = valid_trials_sorted[0]
    else:
        trial_results_sorted = sorted(trial_results, key=lambda x: x['train_val_gap'] + x['train_test_gap'])
        best_trial = trial_results_sorted[0]
    
    best_params = best_trial['params']
    best_train_r2 = best_trial['train_r2']
    best_val_r2 = best_trial['val_r2']
    best_test_r2 = best_trial['test_r2']
    best_train_val_gap = best_trial['train_val_gap']
    best_train_test_gap = best_trial['train_test_gap']
    best_train_mse = best_trial['train_mse']
    best_train_rmse = best_trial['train_rmse']
    best_train_mae = best_trial['train_mae']
    best_train_mre = best_trial['train_mre']
    best_val_mse = best_trial['val_mse']
    best_val_rmse = best_trial['val_rmse']
    best_val_mae = best_trial['val_mae']
    best_val_mre = best_trial['val_mre']
    best_test_mse = best_trial['test_mse']
    best_test_rmse = best_trial['test_rmse']
    best_test_mae = best_trial['test_mae']
    best_test_mre = best_trial['test_mre']
    is_valid = best_trial['is_valid']
    
    results_list.append({
        "PCA_Components": n_components,
        "Best_Params": str(best_params),
        "Train_R2": round(best_train_r2, 6),
        "Val_R2": round(best_val_r2, 6),
        "Test_R2": round(best_test_r2, 6),
        "Train_Val_Gap": round(best_train_val_gap, 6),
        "Train_Test_Gap": round(best_train_test_gap, 6),
        "Is_Valid": is_valid,
        "Train_MSE": round(best_train_mse, 6),
        "Train_MAE": round(best_train_mae, 6),
        "Train_MRE": round(best_train_mre, 6),
        "Train_RMSE": round(best_train_rmse, 6),
        "Val_MSE": round(best_val_mse, 6),
        "Val_MAE": round(best_val_mae, 6),
        "Val_MRE": round(best_val_mre, 6),
        "Val_RMSE": round(best_val_rmse, 6),
        "Test_MSE": round(best_test_mse, 6),
        "Test_MAE": round(best_test_mae, 6),
        "Test_MRE": round(best_test_mre, 6),
        "Test_RMSE": round(best_test_rmse, 6)
    })
    
    final_model = build_reg_model(best_params)
    final_model.fit(X_train_pca, y_train)
    
    y_train_pred = final_model.predict(X_train_pca)
    y_test_pred = final_model.predict(X_test_pca)
    
    pca_pred = np.full(len(pattern_scaled), np.nan)
    pca_pred[train_mask] = y_train_pred
    pca_pred[test_mask] = y_test_pred
    
    for idx in range(len(pattern_scaled)):
        all_pred_results.append({
            "Original_Index": idx + 1,
            "PCA_Dimensions": n_components,
            "Group": groups[idx],
            "True_Label": label_c[idx],
            "Dataset_Type": "Training Set" if groups[idx] in train_groups else "Test Set",
            "Predicted_Label": pca_pred[idx]
        })

with pd.ExcelWriter('ANN_PCA_12D_Tuning_Results-IMS.xlsx', engine='openpyxl') as writer:
    results_df = pd.DataFrame(results_list)
    results_df.to_excel(writer, sheet_name='PCA_Dimension_Performance', index=False)
    
    pred_df = pd.DataFrame(all_pred_results)
    pred_df = pred_df.sort_values(by=['Original_Index', 'PCA_Dimensions'])
    pred_df.to_excel(writer, sheet_name='All_Dimension_Predictions', index=False)

print("\n===================== Summary of All PCA Dimension Tuning Results =====================")
summary_cols = ['PCA_Components', 'Train_R2', 'Val_R2', 'Test_R2', 
                'Train_MSE', 'Train_MAE', 'Train_MRE', 'Train_RMSE',
                'Val_MSE', 'Val_MAE', 'Val_MRE', 'Val_RMSE',
                'Test_MSE', 'Test_MAE', 'Test_MRE', 'Test_RMSE',
                'Train_Val_Gap', 'Train_Test_Gap', 'Is_Valid']
print(results_df[summary_cols])

valid_results = results_df[results_df['Is_Valid'] == True]
if not valid_results.empty:
    best_pca_dim = valid_results.loc[valid_results['Test_R2'].idxmax()]
    print(f"\n‚ú® Optimal PCA dimension meeting gap criteria: {best_pca_dim['PCA_Components']}")
else:
    best_pca_dim = results_df.loc[results_df['Test_R2'].idxmax()]
    print(f"\n‚ö†Ô∏è No dimension meets gap criteria, selecting dimension with highest Test R¬≤: {best_pca_dim['PCA_Components']}")

print(f"   Training R¬≤: {best_pca_dim['Train_R2']:.6f} | Validation R¬≤: {best_pca_dim['Val_R2']:.6f} | Test R¬≤: {best_pca_dim['Test_R2']:.6f}")
print(f"   Train-Validation Gap: {best_pca_dim['Train_Val_Gap']:.6f} | Train-Test Gap: {best_pca_dim['Train_Test_Gap']:.6f}")
print(f"   [Training Set] MSE: {best_pca_dim['Train_MSE']:.6f} | MAE: {best_pca_dim['Train_MAE']:.6f} | MRE: {best_pca_dim['Train_MRE']:.6f} | RMSE: {best_pca_dim['Train_RMSE']:.6f}")
print(f"   [Validation Set] MSE: {best_pca_dim['Val_MSE']:.6f} | MAE: {best_pca_dim['Val_MAE']:.6f} | MRE: {best_pca_dim['Val_MRE']:.6f} | RMSE: {best_pca_dim['Val_RMSE']:.6f}")
print(f"   [Test Set] MSE: {best_pca_dim['Test_MSE']:.6f} | MAE: {best_pca_dim['Test_MAE']:.6f} | MRE: {best_pca_dim['Test_MRE']:.6f} | RMSE: {best_pca_dim['Test_RMSE']:.6f}")
print(f"   Optimal Parameters: {best_pca_dim['Best_Params']}")
print(f"\nüìÅ All results exported to: ANN_PCA_12D_Tuning_Results-IMS.xlsx")


    PCA_Components  Train_R2    Val_R2   Test_R2  Train_MSE  Train_MAE  \
0                1  0.119914  0.189200  0.158713   0.006453   0.063732   
1                2 -0.158805 -0.229462 -0.191415   0.008497   0.075091   
2                3  0.761468  0.549403  0.424486   0.001749   0.031264   
3                4  0.754035  0.719771  0.490491   0.001803   0.032490   
4                5  0.741117  0.707358  0.617405   0.001898   0.031749   
5                6  0.763705  0.735254  0.618986   0.001733   0.030576   
6                7  0.454907  0.359225  0.279075   0.003997   0.049945   
7                8  0.744158  0.726735  0.551324   0.001876   0.032492   
8                9  0.129201  0.098737  0.125262   0.006385   0.063661   
9               10  0.822968  0.786892  0.642134   0.001298   0.026754   
10              11  0.686375  0.639433  0.499658   0.002300   0.036835   
11              12  0.755552  0.706856  0.595708   0.001792   0.032631   

    Train_MRE  Train_RMSE   Val_MSE 