In [1]:
import os
import numpy as np
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
# Dataset names. Each entry doubles as the relative path of the dataset folder
# (process_dataset_pca uses the name directly as the directory to scan).
datasets = ["Dataset1", "Dataset2", "Dataset3"]
# Folder that receives the cached feature files and confusion-matrix images.
output_dir = "output_pca"

# Ensure output directory exists (exist_ok makes re-running this cell a no-op)
os.makedirs(output_dir, exist_ok=True)

In [6]:
# Feature extraction using PCA
def extract_pca_features(image_path, pca_pipeline):
    """Project a single image through an already-fitted PCA pipeline.

    The image is converted to grayscale, resized to 224x224 and flattened to a
    1D pixel vector, then passed to ``pca_pipeline.transform`` as a one-row
    batch.

    Args:
        image_path: Anything accepted by ``PIL.Image.open`` (typically a path).
        pca_pipeline: Fitted object exposing ``transform``. It must have been
            fitted on vectors of the same length (224 * 224 pixels).

    Returns:
        The first (and only) row of the ``transform`` output.
    """
    # The context manager guarantees the underlying file handle is released,
    # including for multi-frame formats where Pillow keeps the file open.
    with Image.open(image_path) as image:
        grayscale = image.convert("L")  # Single channel keeps the vector small
        resized = grayscale.resize((224, 224))  # Fixed size => fixed vector length
    pixel_vector = np.array(resized).flatten()
    # transform expects a 2D batch; wrap the vector and unwrap the single row.
    return pca_pipeline.transform([pixel_vector])[0]

In [7]:
def _load_image_vector(image_path, size=(224, 224)):
    """Load an image as a flattened grayscale pixel vector of fixed size."""
    # Context manager releases the file handle even when Pillow would keep it
    # open (e.g. multi-frame formats).
    with Image.open(image_path) as image:
        return np.array(image.convert("L").resize(size)).flatten()


def _extract_and_cache_features(dataset_path, dataset_name, output_path):
    """Fit scaler+PCA on every readable image of a dataset and cache the result.

    Returns:
        Tuple ``(features, labels, class_names)`` where ``labels[i]`` indexes
        into ``class_names``.

    Raises:
        ValueError: If no image in the dataset could be read.
    """
    # Sorted + directories only: label indices become independent of the
    # arbitrary os.listdir order, and stray files in the dataset root
    # (e.g. Thumbs.db) are not mistaken for classes.
    class_names = sorted(
        entry for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )

    images = []
    labels = []
    for class_idx, class_name in enumerate(class_names):
        class_path = os.path.join(dataset_path, class_name)
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            try:
                images.append(_load_image_vector(img_path))
                labels.append(class_idx)
            except Exception as e:
                # Best effort: unreadable files are reported and skipped.
                print(f"Error processing {img_path}: {e}")

    if not images:
        raise ValueError(f"No readable images found in {dataset_path!r}")

    images = np.array(images)
    labels = np.array(labels)

    # PCA cannot have more components than min(n_samples, n_features).
    max_components = min(len(images), images.shape[1])
    print(f"Setting PCA n_components to {max_components} for {dataset_name}")

    pca_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=max_components)),
    ])
    pca_pipeline.fit(images)
    features = pca_pipeline.transform(images)

    # class_names is stored with the features so that cached label indices can
    # always be mapped back to the names they were created from.
    np.save(output_path, {
        "features": features,
        "labels": labels,
        "pca_pipeline": pca_pipeline,
        "class_names": class_names,
    })
    print(f"PCA features saved for {dataset_name} to {output_path}")
    return features, labels, class_names


def _load_cached_features(dataset_path, output_path):
    """Load ``(features, labels, class_names)`` from a cache file."""
    # SECURITY: allow_pickle=True unpickles arbitrary objects; only load cache
    # files this notebook produced itself.
    data = np.load(output_path, allow_pickle=True).item()
    class_names = data.get("class_names")
    if class_names is None:
        # Cache written before class names were persisted: fall back to the
        # directory listing, which is how those labels were originally derived.
        class_names = os.listdir(dataset_path)
    return data["features"], data["labels"], list(class_names)


def _save_confusion_matrix(conf_matrix, class_names, dataset_name, image_path):
    """Render the confusion matrix as a heatmap and write it to ``image_path``."""
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"Confusion Matrix for {dataset_name}")
    ax.set_xlabel("Predicted Labels")
    ax.set_ylabel("True Labels")
    fig.savefig(image_path)
    plt.close(fig)  # Avoid accumulating open figures across datasets


def process_dataset_pca(dataset_name):
    """Process a single dataset: extract features using PCA, train SVM, and save results.

    ``dataset_name`` is used both as the directory to scan (one sub-folder per
    class) and as the prefix of the files written to ``output_dir``. Features
    are cached in ``<dataset_name>_features.npy`` and reused on later calls.

    Args:
        dataset_name: Name / relative path of the dataset folder.

    Returns:
        None. Metrics are printed and the confusion matrix is saved as a PNG.

    Raises:
        ValueError: If features must be extracted and no image can be read.
    """
    dataset_path = dataset_name
    output_path = os.path.join(output_dir, f"{dataset_name}_features.npy")

    # Check if features already exist
    if not os.path.exists(output_path):
        print(f"Extracting PCA features for {dataset_name}...")
        features, labels, class_names = _extract_and_cache_features(
            dataset_path, dataset_name, output_path
        )
    else:
        print(f"PCA features file already exists for {dataset_name}. Loading...")
        features, labels, class_names = _load_cached_features(dataset_path, output_path)

    # Shuffle the dataset with a fixed seed so reruns are reproducible.
    indices = np.random.default_rng(42).permutation(len(labels))
    features = features[indices]
    labels = labels[indices]

    # Train an SVM classifier on the entire dataset
    svm_clf = SVC(kernel='linear', C=1.0, random_state=42)
    svm_clf.fit(features, labels)

    # NOTE(review): predictions are made on the very samples the classifier
    # (and the PCA) were fitted on, so the scores below are training scores,
    # not an estimate of generalization. Consider a held-out split.
    predictions = svm_clf.predict(features)

    # Evaluate the classifier
    accuracy = accuracy_score(labels, predictions)
    print(f"Accuracy for {dataset_name}: {accuracy:.2f}")

    # Passing explicit label ids keeps rows/columns aligned with class_names
    # even when a class ended up with no readable images.
    class_ids = np.arange(len(class_names))

    # Detailed classification report
    print(f"\nClassification Report for {dataset_name}:")
    print(classification_report(labels, predictions, labels=class_ids,
                                target_names=class_names))

    # Generate and plot confusion matrix
    conf_matrix = confusion_matrix(labels, predictions, labels=class_ids)
    confusion_matrix_path = os.path.join(output_dir, f"{dataset_name}_confusion_matrix.png")
    _save_confusion_matrix(conf_matrix, class_names, dataset_name, confusion_matrix_path)

    print(f"Confusion matrix for {dataset_name} saved to {confusion_matrix_path}")

In [8]:
# Process all datasets: for each one, build (or load the cached) PCA features,
# fit the SVM, print the metrics and save the confusion-matrix image.
for dataset in datasets:
    process_dataset_pca(dataset)

Extracting PCA features for Dataset1...
Setting PCA n_components to 40 for Dataset1
PCA features saved for Dataset1 to output_pca\Dataset1_features.npy
Accuracy for Dataset1: 1.00

Classification Report for Dataset1:
              precision    recall  f1-score   support

     Laptops       1.00      1.00      1.00        20
 Smartphones       1.00      1.00      1.00        20

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40

Confusion matrix for Dataset1 saved to output_pca\Dataset1_confusion_matrix.png
Extracting PCA features for Dataset2...
Setting PCA n_components to 40 for Dataset2
PCA features saved for Dataset2 to output_pca\Dataset2_features.npy
Accuracy for Dataset2: 1.00

Classification Report for Dataset2:
              precision    recall  f1-score   support

        Cars       1.00      1.00      1.00        20
      Planes       1.00      1.00      1.00       