In [1]:
import numpy as np
import pandas as pd
import csv
import random
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)
random.seed(42)

pattern = []
with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
    reader = csv.reader(fhd)
    for line in reader:
        pattern.append(line)
pattern = np.array(pattern, dtype='float64')

pattern = np.where(np.isinf(pattern), np.nan, pattern)
pattern = np.nan_to_num(pattern, nan=np.nanmean(pattern))
min_vals = np.min(pattern, axis=0)
max_vals = np.max(pattern, axis=0)
range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
pattern = (pattern - min_vals) / range_vals

label_data = []
with open('label-2.csv', 'r', encoding='utf-8-sig') as fhl:
    reader = csv.reader(fhl)
    for line in reader:
        label_data.append(line)
label_data = np.array(label_data, dtype='float64')
groups = label_data[:, 0]
labels = label_data[:, 1]

unique_groups = np.unique(groups)
train_groups, test_groups = train_test_split(
    unique_groups,
    test_size=99/491,
    random_state=42
)

train_mask = np.isin(groups, train_groups)
test_mask = np.isin(groups, test_groups)

X_train_raw = pattern[train_mask]
y_train = labels[train_mask]
X_test_raw = pattern[test_mask]
y_test = labels[test_mask]

print(f"Number of training samples: {len(X_train_raw)}")
print(f"Number of test samples: {len(X_test_raw)}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

pca = PCA(n_components=12)
X_train_pca_all = pca.fit_transform(X_train_scaled)
X_test_pca_all = pca.transform(X_test_scaled)

def build_model(params):
    model = RandomForestClassifier(
        n_estimators=params['n_estimators'],
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        random_state=42,
        n_jobs=-1
    )
    return model

param_space = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

results_list = []
all_pred_results = []
TRIALS_PER_PCA = 5

original_sample_meta = []
for idx in range(len(pattern)):
    original_sample_meta.append({
        "Original_Index": idx + 1,
        "Group": groups[idx],
        "True_Label": labels[idx],
        "Dataset": "Training" if groups[idx] in train_groups else "Testing"
    })
original_meta_df = pd.DataFrame(original_sample_meta)

for n_components in range(1, 13):
    print(f"Processing PCA Components: {n_components} ...")
    
    X_train_pca = X_train_pca_all[:, :n_components]
    X_test_pca = X_test_pca_all[:, :n_components]
    
    best_val_acc = 0.0
    best_model = None
    best_params = {}
    
    for trial in range(TRIALS_PER_PCA):
        current_params = {
            'n_estimators': random.choice(param_space['n_estimators']),
            'max_depth': random.choice(param_space['max_depth']),
            'min_samples_split': random.choice(param_space['min_samples_split']),
            'min_samples_leaf': random.choice(param_space['min_samples_leaf'])
        }
        
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train_pca, y_train, 
            test_size=0.2, 
            random_state=42
        )
        
        model = build_model(current_params)
        model.fit(X_tr, y_tr)
        
        val_pred = model.predict(X_val)
        val_acc = accuracy_score(y_val, val_pred)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model 
            best_params = current_params

    if best_model is None:
        continue

    y_train_pred = best_model.predict(X_train_pca)
    y_test_pred = best_model.predict(X_test_pca)
    
    accuracy = accuracy_score(y_test, y_test_pred)
    precision = precision_score(y_test, y_test_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_test_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_test_pred, average='binary', zero_division=0)
    
    print(f"  -> Best Params: {best_params} | Test Acc: {accuracy:.4f}")

    results_list.append({
        "PCA_Components": n_components,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1_Score": round(f1, 4),
        "Best_Params": str(best_params)
    })
    
    pca_pred = np.full(len(pattern), np.nan)
    pca_pred[train_mask] = y_train_pred
    pca_pred[test_mask] = y_test_pred
    
    for idx in range(len(pattern)):
        all_pred_results.append({
            "Original_Index": idx + 1,
            "PCA_Dimensions": n_components,
            "Group": groups[idx],
            "True_Label": labels[idx],
            "Dataset": "Training" if groups[idx] in train_groups else "Testing",
            "Predicted_Label": pca_pred[idx]
        })

with pd.ExcelWriter('rf_pca_results-2.xlsx', engine='openpyxl') as writer:
    results_df = pd.DataFrame(results_list)
    results_df.to_excel(writer, sheet_name='Performance_Summary', index=False)
    
    pred_df = pd.DataFrame(all_pred_results)
    pred_df = pred_df.sort_values(by=['Original_Index', 'PCA_Dimensions'])
    pred_df.to_excel(writer, sheet_name='Prediction_Results', index=False)

print("\nAll results have been exported to rf_pca_results-2.xlsx!")
print("\nModel performance metrics for different PCA dimensions (after hyperparameter tuning):")
print(results_df)

Number of training samples: 390
Number of test samples: 101
Processing PCA Components: 1 ...
  -> Best Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 4} | Test Acc: 0.7426
Processing PCA Components: 2 ...
  -> Best Params: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 1} | Test Acc: 0.7822
Processing PCA Components: 3 ...
  -> Best Params: {'n_estimators': 200, 'max_depth': None, 'min_samples_split': 10, 'min_samples_leaf': 2} | Test Acc: 0.9307
Processing PCA Components: 4 ...
  -> Best Params: {'n_estimators': 300, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 4} | Test Acc: 0.9109
Processing PCA Components: 5 ...
  -> Best Params: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2} | Test Acc: 0.9505
Processing PCA Components: 6 ...
  -> Best Params: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4} | Test Acc: 0.9307