# Notebook 5: Hyperparameter Tuning

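**Purpose**: Hyperparameter search for the classical models, plus fixed, pre-defined hyperparameter configurations for the neural and QML models (which are trained and scored in Notebooks 7 and 8).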

**Inputs**:
- `pca_train.csv` from Notebook 4
- `pca_test.csv` from Notebook 4

**Outputs**:
- `best_params.json` → `results/`
- `tuning_results.csv` → `results/`
- `tuning_classical_scores.png` → `figures/`

---

In [1]:
# Imports
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import json
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.mixture import GaussianMixture
from sklearn.metrics import make_scorer, f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Reproducibility: seed NumPy's global RNG; RANDOM_SEED is also handed to the
# estimators and the CV splitter in later cells.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Tuning configuration
TARGET_COLUMN = 'Class'
CV_FOLDS = 3  # Kept small so the searches finish faster

# Project layout: the project root is the parent of the current working directory
BASE_DIR = Path('.').resolve().parent
FEATURES_DIR = BASE_DIR.joinpath('data', 'features')
RESULTS_DIR = BASE_DIR.joinpath('results')

# Echo the settings that determine the CV splits
print(f"Random Seed: {RANDOM_SEED}", f"CV Folds: {CV_FOLDS}", sep="\n")

Random Seed: 42
CV Folds: 3


## 1. Load Data

In [2]:
# Read the PCA train/test splits listed as inputs from Notebook 4
train_df, test_df = (
    pd.read_csv(FEATURES_DIR / csv_name)
    for csv_name in ('pca_train.csv', 'pca_test.csv')
)


def _split_features_target(frame):
    """Return (feature matrix, label vector) as NumPy arrays for one split."""
    feature_frame = frame.drop(columns=[TARGET_COLUMN])
    return feature_frame.values, frame[TARGET_COLUMN].values


X_train, y_train = _split_features_target(train_df)
X_test, y_test = _split_features_target(test_df)

# Quick sanity report on the training split (row count, width, label counts)
print(f"Training samples: {X_train.shape[0]}")
print(f"Features: {X_train.shape[1]}")
print(f"Class distribution: {np.bincount(y_train)}")

Training samples: 1600
Features: 10
Class distribution: [1520   80]


## 2. Classical Models Tuning

In [3]:
# Define parameter grids for classical models.
# Each entry pairs an unfitted base estimator ('model') with the grid of
# hyperparameter values ('params') explored for it by the grid-search cell.
#
# The SVC estimators are created WITHOUT probability=True: the search is scored
# with F1 on predicted labels, and enabling probability estimates makes every
# single fit run an extra internal 5-fold cross-validation for calibration.
# That multiplies tuning time without changing the predictions being scored.
classical_param_grids = {
    'SVM_Linear': {
        'model': SVC(kernel='linear', random_state=RANDOM_SEED),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'class_weight': [None, 'balanced']
        }
    },
    'SVM_RBF': {
        'model': SVC(kernel='rbf', random_state=RANDOM_SEED),
        'params': {
            'C': [0.1, 1, 10],
            'gamma': ['scale', 'auto', 0.01, 0.1],
            'class_weight': [None, 'balanced']
        }
    },
    'Logistic_Regression': {
        'model': LogisticRegression(random_state=RANDOM_SEED, max_iter=1000),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l2'],
            'class_weight': [None, 'balanced']
        }
    },
    'Random_Forest': {
        'model': RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5],
            'class_weight': [None, 'balanced']
        }
    }
}

print(f"Classical models to tune: {list(classical_param_grids.keys())}")

Classical models to tune: ['SVM_Linear', 'SVM_RBF', 'Logistic_Regression', 'Random_Forest']


In [4]:
# Exhaustive grid search over every classical model.
# Results are collected in two places: classical_best_params (model -> best
# parameter dict) and tuning_results (one flat record per model).
classical_best_params = {}
tuning_results = []

# One stratified, shuffled splitter shared by all searches
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_SEED)

for name, spec in classical_param_grids.items():
    print(f"\nTuning {name}...")

    # fit() returns the fitted search object, so build and fit in one expression
    grid_search = GridSearchCV(
        estimator=spec['model'],
        param_grid=spec['params'],
        scoring='f1',
        cv=cv,
        n_jobs=-1,
        verbose=0,
    ).fit(X_train, y_train)

    classical_best_params[name] = grid_search.best_params_

    record = {
        'model': name,
        'category': 'classical',
        'best_score': grid_search.best_score_,
        'best_params': str(grid_search.best_params_),
    }
    tuning_results.append(record)

    print(f"  Best F1: {grid_search.best_score_:.4f}")
    print(f"  Best Params: {grid_search.best_params_}")


Tuning SVM_Linear...


KeyboardInterrupt: 

In [None]:
# Isolation Forest (unsupervised, different approach)
print("\nTuning Isolation Forest...")

# Each setting is fit on X_train without labels and then scored by the F1 of
# its predicted anomalies against y_train on that same data, i.e. the score is
# in-sample and not cross-validated.
# Alias kept so later cells that call `f1` keep working.
from sklearn.metrics import f1_score as f1

# Start below any attainable F1 so the first configuration is always recorded.
# Starting at 0 with a strict '>' would leave best_if_params empty whenever
# every configuration scores exactly 0.
best_if_score = float('-inf')
best_if_params = {}

for n_estimators in [50, 100, 200]:
    for contamination in [0.05, 0.1, 0.15]:
        iso = IsolationForest(
            n_estimators=n_estimators,
            contamination=contamination,
            random_state=RANDOM_SEED,
            n_jobs=-1
        )
        iso.fit(X_train)
        y_pred = (iso.predict(X_train) == -1).astype(int)  # -1 = anomaly
        score = f1(y_train, y_pred)

        if score > best_if_score:
            best_if_score = score
            best_if_params = {'n_estimators': n_estimators, 'contamination': contamination}

classical_best_params['Isolation_Forest'] = best_if_params

# Drop any record left by an earlier run of this cell so that re-running it
# does not add a duplicate Isolation_Forest row.
tuning_results[:] = [
    record for record in tuning_results if record['model'] != 'Isolation_Forest'
]
tuning_results.append({
    'model': 'Isolation_Forest',
    'category': 'classical',
    'best_score': best_if_score,
    'best_params': str(best_if_params)
})

print(f"  Best F1: {best_if_score:.4f}")
print(f"  Best Params: {best_if_params}")

In [None]:
# GMM tuning
print("\nTuning Gaussian Mixture Model...")

# Samples whose log-likelihood falls in the lowest GMM_ANOMALY_PERCENTILE
# percent are flagged as anomalies.
GMM_ANOMALY_PERCENTILE = 5

# Start below any attainable F1 so the first configuration is always recorded.
# Starting at 0 with a strict '>' would leave best_gmm_params empty whenever
# every configuration scores exactly 0.
best_gmm_score = float('-inf')
best_gmm_params = {}

for n_components in [2, 3, 5, 10]:
    for covariance_type in ['full', 'diag']:
        gmm = GaussianMixture(
            n_components=n_components,
            covariance_type=covariance_type,
            random_state=RANDOM_SEED
        )
        gmm.fit(X_train)
        # Per-sample log-likelihood under the fitted mixture (in-sample)
        scores = gmm.score_samples(X_train)
        threshold = np.percentile(scores, GMM_ANOMALY_PERCENTILE)
        y_pred = (scores < threshold).astype(int)
        # Use the f1_score imported at the top of the notebook, so this cell
        # does not depend on an alias created inside another cell.
        score = f1_score(y_train, y_pred)

        if score > best_gmm_score:
            best_gmm_score = score
            best_gmm_params = {'n_components': n_components, 'covariance_type': covariance_type}

classical_best_params['GMM'] = best_gmm_params

# Drop any record left by an earlier run of this cell so that re-running it
# does not add a duplicate GMM row.
tuning_results[:] = [
    record for record in tuning_results if record['model'] != 'GMM'
]
tuning_results.append({
    'model': 'GMM',
    'category': 'classical',
    'best_score': best_gmm_score,
    'best_params': str(best_gmm_params)
})

print(f"  Best F1: {best_gmm_score:.4f}")
print(f"  Best Params: {best_gmm_params}")

## 3. Neural Network Parameters

In [None]:
# Define neural network hyperparameters (to be used in Notebook 7).
# These configurations are fixed by hand; no search is run for them here.
neural_params = {
    'MLP': {
        'hidden_layers': [64, 32],
        'learning_rate': 0.001,
        'epochs': 100,
        'batch_size': 32,
        'dropout': 0.2
    },
    'Autoencoder': {
        'encoder_layers': [32, 16, 8],
        'decoder_layers': [8, 16, 32],
        'learning_rate': 0.001,
        'epochs': 100,
        'batch_size': 32
    },
    'VAE': {
        'encoder_layers': [32, 16],
        'latent_dim': 4,
        'decoder_layers': [16, 32],
        'learning_rate': 0.001,
        'epochs': 100,
        'batch_size': 32,
        'kl_weight': 0.1
    },
    'Deep_MLP': {
        'hidden_layers': [128, 64, 32, 16],
        'learning_rate': 0.0005,
        'epochs': 150,
        'batch_size': 32,
        'dropout': 0.3
    },
    'Deep_Autoencoder': {
        'encoder_layers': [64, 32, 16, 8],
        'decoder_layers': [8, 16, 32, 64],
        'learning_rate': 0.001,
        'epochs': 100,
        'batch_size': 32
    }
}

# Add to tuning results (pre-defined, so there is no score yet).
# Existing 'neural' records are removed first so that re-running this cell
# does not append duplicate rows.
tuning_results[:] = [
    record for record in tuning_results if record['category'] != 'neural'
]
tuning_results.extend(
    {
        'model': name,
        'category': 'neural',
        'best_score': None,  # Will be computed in Notebook 7
        'best_params': str(params)
    }
    for name, params in neural_params.items()
)

print("Neural network parameters defined:")
for name, params in neural_params.items():
    print(f"  {name}: {params}")

## 4. QML Parameters

In [None]:
# Define QML hyperparameters (to be used in Notebook 8).
# These configurations are fixed by hand; no search is run for them here.
qml_params = {
    'VQC': {
        'n_qubits': 4,
        'n_layers': 2,
        'learning_rate': 0.1,
        'epochs': 50,
        'shots': 1024,
        'feature_map': 'ZZFeatureMap',
        'ansatz': 'RealAmplitudes'
    },
    'Hybrid_QNN': {
        'n_qubits': 4,
        'n_layers': 2,
        'classical_layers': [16, 8],
        'learning_rate': 0.01,
        'epochs': 50,
        'shots': 1024
    },
    'QSVM': {
        'n_qubits': 4,
        'feature_map': 'ZZFeatureMap',
        'shots': 1024,
        'C': 1.0
    },
    'Quantum_Autoencoder': {
        'n_qubits': 4,
        'n_latent_qubits': 2,
        'n_layers': 2,
        'learning_rate': 0.1,
        'epochs': 30,
        'shots': 1024
    },
    'QGAN': {
        'n_qubits': 2,
        'n_layers': 1,
        'learning_rate': 0.05,
        'epochs': 20,
        'shots': 512
    }
}

# Add to tuning results (pre-defined, so there is no score yet).
# Existing 'qml' records are removed first so that re-running this cell
# does not append duplicate rows.
tuning_results[:] = [
    record for record in tuning_results if record['category'] != 'qml'
]
tuning_results.extend(
    {
        'model': name,
        'category': 'qml',
        'best_score': None,  # Will be computed in Notebook 8
        'best_params': str(params)
    }
    for name, params in qml_params.items()
)

print("QML parameters defined:")
for name, params in qml_params.items():
    print(f"  {name}: {params}")

## 5. Save Results

In [None]:
# Combine all best parameters into one JSON-serialisable document, together
# with the seed and fold count that produced them.
all_best_params = {
    'random_seed': RANDOM_SEED,
    'cv_folds': CV_FOLDS,
    'classical': classical_best_params,
    'neural': neural_params,
    'qml': qml_params
}

# Create results/ if it is missing: open(..., 'w') does not create parent
# directories and would raise FileNotFoundError on a fresh checkout.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Save best params
params_path = RESULTS_DIR / 'best_params.json'
with open(params_path, 'w', encoding='utf-8') as f:
    json.dump(all_best_params, f, indent=2)

print(f"✅ Saved best parameters to: {params_path}")

In [None]:
# Save tuning results: one row per model with its category, best score
# (missing for the pre-defined neural/QML configs) and stringified parameters.
tuning_df = pd.DataFrame(tuning_results)

# Create results/ if it is missing: to_csv does not create parent directories
# and would raise an OSError on a fresh checkout.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

tuning_path = RESULTS_DIR / 'tuning_results.csv'
tuning_df.to_csv(tuning_path, index=False)

print(f"✅ Saved tuning results to: {tuning_path}")
print(f"\nTuning Results:")
print(tuning_df)

## 6. Summary Visualization

In [None]:
# Visualize classical model tuning results as a horizontal bar chart.
# Only rows in the 'classical' category that actually have a score are plotted.
classical_results = tuning_df[tuning_df['category'] == 'classical'].copy()
classical_results = classical_results.dropna(subset=['best_score'])

if len(classical_results) > 0:
    # Create figures/ if it is missing: savefig does not create parent
    # directories and would raise FileNotFoundError on a fresh checkout.
    figures_dir = BASE_DIR / 'figures'
    figures_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the axis label says "CV", but the Isolation_Forest and GMM
    # rows are F1 scores computed on the same data the models were fit on.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.barh(classical_results['model'], classical_results['best_score'], color='steelblue')
    ax.set_xlabel('F1 Score (CV)')
    ax.set_ylabel('Model')
    ax.set_title('Classical Models - Cross-Validation F1 Scores')
    ax.set_xlim([0, 1])

    # Annotate each bar with its score, just to the right of the bar's end
    for i, v in enumerate(classical_results['best_score']):
        ax.text(v + 0.01, i, f'{v:.4f}', va='center')

    fig.tight_layout()
    fig.savefig(figures_dir / 'tuning_classical_scores.png', dpi=150)
    plt.show()

In [None]:
# Final recap: how many configurations each model family contributed
banner = "="*50
summary_lines = [
    "\n" + banner,
    "HYPERPARAMETER TUNING SUMMARY",
    banner,
    f"Classical models tuned: {len(classical_best_params)}",
    f"Neural network configs: {len(neural_params)}",
    f"QML configs: {len(qml_params)}",
    f"Total configurations: {len(tuning_results)}",
    "\n✅ Notebook 5 Complete!",
]
# Joining with newlines emits exactly what one print() call per line would
print("\n".join(summary_lines))