In [1]:
import numpy as np
import pandas as pd
import csv
import random
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score, log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split as sk_train_test_split

np.random.seed(42)
random.seed(42)

pattern = []
with open('4-pattern1.csv', 'r', encoding='utf-8-sig') as fhd:
    reader = csv.reader(fhd)
    for line in reader:
        pattern.append(line)
pattern = np.array(pattern, dtype='float64')

pattern = np.where(np.isinf(pattern), np.nan, pattern)
pattern = np.nan_to_num(pattern, nan=np.nanmean(pattern))
min_vals = np.min(pattern, axis=0)
max_vals = np.max(pattern, axis=0)
range_vals = np.where(max_vals - min_vals == 0, 1, max_vals - min_vals)
pattern = (pattern - min_vals) / range_vals

label_data = []
with open('label-2.csv', 'r', encoding='utf-8-sig') as fhl:
    reader = csv.reader(fhl)
    for line in reader:
        label_data.append(line)
label_data = np.array(label_data, dtype='float64')
groups = label_data[:, 0]
labels = label_data[:, 1]

unique_groups = np.unique(groups)
train_groups, test_groups = train_test_split(
    unique_groups,
    test_size=99/491,
    random_state=42
)

train_mask = np.isin(groups, train_groups)
test_mask = np.isin(groups, test_groups)

X_train_raw = pattern[train_mask]
y_train = labels[train_mask]
X_test_raw = pattern[test_mask]
y_test = labels[test_mask]

print(f"Number of training samples: {len(X_train_raw)}")
print(f"Number of test samples: {len(X_test_raw)}")

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

pca = PCA(n_components=12)
X_train_pca_all = pca.fit_transform(X_train_scaled)
X_test_pca_all = pca.transform(X_test_scaled)

def train_xgb_model(X_train, y_train, X_val, y_val, params):
    model = xgb.XGBClassifier(
        max_depth=params['max_depth'],
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        gamma=params['gamma'],
        objective='binary:logistic',
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False if hasattr(xgb.XGBClassifier, 'use_label_encoder') else None
    )
    
    model.fit(
        X_train, y_train,
        verbose=0
    )
    
    y_val_pred_proba = model.predict_proba(X_val)[:, 1]
    val_loss = log_loss(y_val, y_val_pred_proba)
    
    return model, val_loss

param_space = {
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2]
}

results_list = []
TRIALS_PER_PCA = 5

for n_components in range(1, 13):
    print(f"Processing PCA Components: {n_components} ...")
    
    X_train_pca = X_train_pca_all[:, :n_components]
    X_test_pca = X_test_pca_all[:, :n_components]
    
    best_val_loss = float('inf')
    best_model = None
    best_params = {}
    
    for trial in range(TRIALS_PER_PCA):
        current_params = {
            'max_depth': random.choice(param_space['max_depth']),
            'learning_rate': random.choice(param_space['learning_rate']),
            'n_estimators': random.choice(param_space['n_estimators']),
            'subsample': random.choice(param_space['subsample']),
            'colsample_bytree': random.choice(param_space['colsample_bytree']),
            'gamma': random.choice(param_space['gamma'])
        }
        
        X_tr, X_val, y_tr, y_val = sk_train_test_split(
            X_train_pca, y_train,
            test_size=0.2,
            random_state=42
        )
        
        model, val_loss = train_xgb_model(X_tr, y_tr, X_val, y_val, current_params)
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model 
            best_params = current_params

    if best_model is None:
        continue

    y_pred_proba = best_model.predict_proba(X_test_pca)[:, 1]
    y_pred = (y_pred_proba > 0.5).astype(int)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary', zero_division=0)
    recall = recall_score(y_test, y_pred, average='binary', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='binary', zero_division=0)
    
    print(f"  -> Best Params: {best_params} | Test Acc: {accuracy:.4f}")

    results_list.append({
        "Components": n_components,
        "Accuracy": round(accuracy, 4),
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1 Score": round(f1, 4),
        "Best_Params": str(best_params)
    })

results_df = pd.DataFrame(results_list)
print("\nModel performance metrics for different PCA dimensions (after hyperparameter tuning):")
print(results_df)

# results_df.to_csv('pca_tuned_performance.csv', index=False, encoding='utf-8-sig')

Number of training samples: 390
Number of test samples: 101
Processing PCA Components: 1 ...
  -> Best Params: {'max_depth': 3, 'learning_rate': 0.01, 'n_estimators': 300, 'subsample': 0.8, 'colsample_bytree': 0.7, 'gamma': 0} | Test Acc: 0.8218
Processing PCA Components: 2 ...
  -> Best Params: {'max_depth': 4, 'learning_rate': 0.05, 'n_estimators': 200, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.1} | Test Acc: 0.8119
Processing PCA Components: 3 ...
  -> Best Params: {'max_depth': 6, 'learning_rate': 0.2, 'n_estimators': 200, 'subsample': 0.9, 'colsample_bytree': 0.9, 'gamma': 0.2} | Test Acc: 0.8911
Processing PCA Components: 4 ...
  -> Best Params: {'max_depth': 4, 'learning_rate': 0.1, 'n_estimators': 100, 'subsample': 0.7, 'colsample_bytree': 0.7, 'gamma': 0.1} | Test Acc: 0.9208
Processing PCA Components: 5 ...
  -> Best Params: {'max_depth': 7, 'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.7, 'colsample_bytree': 0.9, 'gamma': 0.2} | Test Acc: 0.9505
Proce