In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score, make_scorer

# Step 1: Load the dataset
csv_file_path = 'wavelet_metrics_1.csv'  # Path to your CSV file
data = pd.read_csv(csv_file_path)

# Step 2: Separate features and labels
X = data.drop(['Data', 'label'], axis=1)  # Drop non-numeric and target columns
y = data['label']  # Extract labels (0 or 1)

# Step 3: Ensure equal class distribution in train and test sets
class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

# Step 4: Split each class into 80% train and 20% test
train_0, test_0 = train_test_split(class_0, test_size=0.5, random_state=42)
train_1, test_1 = train_test_split(class_1, test_size=0.5, random_state=42)

# Step 5: Concatenate the training and testing data for both classes
train_data = pd.concat([train_0, train_1])
test_data = pd.concat([test_0, test_1])

# Step 6: Extract features and labels from the training and testing sets
X_train = train_data.drop(['Data', 'label'], axis=1)
y_train = train_data['label']
X_test = test_data.drop(['Data', 'label'], axis=1)
y_test = test_data['label']


In [6]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(400, 16) (400,) (400, 16) (400,)


In [7]:

# Step 7: Standardize the features

scaler = StandardScaler()

#scaler = StandardScaler(with_mean=True, with_std=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)




In [8]:
'''
# Step 8: Apply PCA for dimensionality reduction

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train) # How does train and test work in pca
X_test_pca = pca.transform(X_test)

# Step 9: Define the SVM model and hyperparameters for GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 200],      # Regularization parameter
    'gamma': [0.0001, 0.001, 0.01, 0.02, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],  # RBF Kernel coefficient
}


# Use StratifiedKFold for balanced cross-validation splits
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define the SVM with RBF kernel
svm = SVC(kernel='rbf')

# Perform GridSearchCV to find the optimal hyperparameters
grid_search = GridSearchCV(svm, param_grid, cv=cv, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train_pca, y_train)

# Print the best hyperparameters
print(f"Best Parameters: {grid_search.best_params_}")

# Step 10: Train the final SVM model with the best parameters
best_svm = grid_search.best_estimator_
best_svm.fit(X_train_pca, y_train)

# Step 11: Predict on the test set
y_pred = best_svm.predict(X_test_pca)

# Step 12: Calculate the confusion matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Step 13: Calculate Sensitivity and Specificity
sensitivity = tp / (tp + fn)  # True Positive Rate
specificity = tn / (tn + fp)  # True Negative Rate

# Step 14: Print evaluation metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print(f"\nSensitivity: {sensitivity:.2f}")
print(f"Specificity: {specificity:.2f}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}") '''


'\n# Step 8: Apply PCA for dimensionality reduction\n\npca = PCA(n_components=5)\nX_train_pca = pca.fit_transform(X_train) # How does train and test work in pca\nX_test_pca = pca.transform(X_test)\n\n# Step 9: Define the SVM model and hyperparameters for GridSearchCV\nparam_grid = {\n    \'C\': [0.01, 0.1, 1, 2, 3, 5, 10, 20, 30, 40, 50, 100, 200],      # Regularization parameter\n    \'gamma\': [0.0001, 0.001, 0.01, 0.02, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],  # RBF Kernel coefficient\n}\n\n\n# Use StratifiedKFold for balanced cross-validation splits\ncv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)\n\n# Define the SVM with RBF kernel\nsvm = SVC(kernel=\'rbf\')\n\n# Perform GridSearchCV to find the optimal hyperparameters\ngrid_search = GridSearchCV(svm, param_grid, cv=cv, scoring=\'accuracy\', verbose=1, n_jobs=-1)\ngrid_search.fit(X_train_pca, y_train)\n\n# Print the best hyperparameters\nprint(f"Best Parameters: {grid_search.best_params_}")\n\n# Step 10: Train the fin

In [9]:
# Initialize list to store results
results = []

# Define the SVM hyperparameters for GridSearchCV
param_grid = {
    'C': [0.01, 0.1, 1, 2, 3, 5, 10],
    'gamma': [0.0001, 0.001, 0.01, 0.02, 0.05, 0.08, 0.1, 0.2, 0.3, 0.5, 1],
}

# Use StratifiedKFold for balanced cross-validation splits
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Loop over different n_components for PCA
for n in range(3, 17):  # Components from 3 to 16
    print(f"\nEvaluating PCA with n_components={n}")
    
    # Apply PCA transformation
    #pca = PCA(n_components=n)
    #X_train_pca = pca.fit_transform(X_train)
    #X_test_pca = pca.transform(X_test)
    
    # Define the SVM with RBF kernel
    svm = SVC(kernel='rbf')
    
    # Perform GridSearchCV to find the optimal hyperparameters
    grid_search = GridSearchCV(svm, param_grid, cv=cv, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    
    # Get the best model
    best_svm = grid_search.best_estimator_
    best_svm.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = best_svm.predict(X_test)
    
    # Calculate confusion matrix
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    
    # Calculate evaluation metrics
    sensitivity = tp / (tp + fn)  # True Positive Rate
    specificity = tn / (tn + fp)  # True Negative Rate
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print and store the results
    print(f"Best Parameters for n_components={n}: {grid_search.best_params_}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    print(f"Sensitivity: {sensitivity:.2f}, Specificity: {specificity:.2f}, Accuracy: {accuracy:.2f}")
    
    # Append results to list
    results.append({
        'n_components': n,
        'best_params': grid_search.best_params_,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'accuracy': accuracy
    })

# Display all results
for result in results:
    print(f"\nPCA Components: {result['n_components']}, "
          f"Accuracy: {result['accuracy']:.2f}, "
          f"Sensitivity: {result['sensitivity']:.2f}, "
          f"Specificity: {result['specificity']:.2f}, "
          f"Best Params: {result['best_params']}")


Evaluating PCA with n_components=3
Fitting 10 folds for each of 77 candidates, totalling 770 fits
Best Parameters for n_components=3: {'C': 1, 'gamma': 0.08}
Confusion Matrix:
[[199   1]
 [  0 200]]
Sensitivity: 1.00, Specificity: 0.99, Accuracy: 1.00

Evaluating PCA with n_components=4
Fitting 10 folds for each of 77 candidates, totalling 770 fits
Best Parameters for n_components=4: {'C': 1, 'gamma': 0.08}
Confusion Matrix:
[[199   1]
 [  0 200]]
Sensitivity: 1.00, Specificity: 0.99, Accuracy: 1.00

Evaluating PCA with n_components=5
Fitting 10 folds for each of 77 candidates, totalling 770 fits
Best Parameters for n_components=5: {'C': 1, 'gamma': 0.08}
Confusion Matrix:
[[199   1]
 [  0 200]]
Sensitivity: 1.00, Specificity: 0.99, Accuracy: 1.00

Evaluating PCA with n_components=6
Fitting 10 folds for each of 77 candidates, totalling 770 fits
Best Parameters for n_components=6: {'C': 1, 'gamma': 0.08}
Confusion Matrix:
[[199   1]
 [  0 200]]
Sensitivity: 1.00, Specificity: 0.99, Acc