In [3]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm
import time

# --- 1. DATA PREPARATION ---

def _label_from_filename(filename):
    """Return 0 for a 'cat' filename, 1 for a 'dog' filename, else None."""
    # 'cat' is tested first, so a name containing both substrings counts as a cat.
    if 'cat' in filename:
        return 0
    if 'dog' in filename:
        return 1
    return None


def load_and_preprocess_data(data_dir, image_size=(64, 64), sample_size=4000):
    """
    Loads images, preprocesses them, and extracts HOG features.

    Every returned feature vector has exactly one matching label: files whose
    name contains neither 'cat' nor 'dog' are skipped (with a warning) before
    any image is read, so the two returned arrays always have the same length.

    Args:
        data_dir (str): Path to the training data directory.
        image_size (tuple): The size to resize images to.
        sample_size (int): The number of images to process for faster runs.
                           Set to None to process all images.

    Returns:
        tuple: ``(features, labels)`` as numpy arrays. ``features`` holds one
        HOG descriptor per successfully processed image; ``labels`` holds the
        matching class ids (0 = cat, 1 = dog). Both are empty when no usable
        image is found.

    Raises:
        OSError: If ``data_dir`` cannot be listed (propagated from os.listdir).
    """
    features = []
    labels = []

    # Get a list of image files and shuffle it.
    # The shuffle uses NumPy's global RNG, so the selected sample varies from
    # run to run unless the caller seeds np.random beforehand.
    image_files = os.listdir(data_dir)
    np.random.shuffle(image_files)

    # Limit the number of images to process if sample_size is set
    if sample_size:
        image_files = image_files[:sample_size]

    print(f"Processing {len(image_files)} images...")

    # Resolve labels up front: a file without a recognisable label must never
    # contribute a feature vector, otherwise features and labels go out of sync.
    labelled_files = []
    for filename in image_files:
        label = _label_from_filename(filename)
        if label is None:
            print(f"Warning: No 'cat'/'dog' label in filename {filename}. Skipping.")
            continue
        labelled_files.append((filename, label))

    # Nothing usable: return empty arrays without starting the progress bar.
    if not labelled_files:
        return np.array(features), np.array(labels)

    # Use tqdm for a progress bar
    for filename, label in tqdm(labelled_files, desc="Extracting Features"):
        try:
            # Construct full path and read the image in grayscale
            img_path = os.path.join(data_dir, filename)
            image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

            # Skip if image is not loaded correctly
            if image is None:
                print(f"Warning: Could not read image {filename}. Skipping.")
                continue

            # Resize the image
            resized_image = cv2.resize(image, image_size)

            # --- Feature Extraction: Histogram of Oriented Gradients (HOG) ---
            hog_features = hog(resized_image, orientations=9, pixels_per_cell=(8, 8),
                               cells_per_block=(2, 2), block_norm='L2-Hys', visualize=False)

        except Exception as e:
            # Best-effort loading: report the bad file and keep going.
            print(f"Error processing file {filename}: {e}")
            continue

        # Append feature and label together so the lists stay aligned.
        features.append(hog_features)
        labels.append(label)

    # Convert lists to numpy arrays for scikit-learn
    return np.array(features), np.array(labels)


# --- 2. MODEL TRAINING & EVALUATION ---

# Location of the dataset directory
DATASET_PATH = 'train'

# Extract features from a 4000-image sample to keep the run reasonably quick.
# Passing sample_size=None processes every image, which is very slow.
features, labels = load_and_preprocess_data(DATASET_PATH, sample_size=4000)

# --- Diagnostic Check ---
# Count each class once; the same counts feed the report and the guard below.
cat_count = np.count_nonzero(labels == 0)
dog_count = np.count_nonzero(labels == 1)
print(f"\nLabel distribution - Cats (0): {cat_count}, Dogs (1): {dog_count}")

if len(features) == 0 or cat_count == 0 or dog_count == 0:
    # Training needs at least one sample of each class.
    print("\nError: Not enough data or classes to train the model. Please check your 'train' folder.")
else:
    print(f"Data loaded successfully. Total samples: {len(features)}")

    # 80/20 train/test split; stratifying on the labels keeps the cat/dog
    # proportion similar in both partitions.
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        stratify=labels,
    )

    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")

    # Support Vector Classifier with a non-linear RBF kernel;
    # C=10 is the regularization parameter.
    print("\nTraining the SVM classifier...")
    svm_model = SVC(C=10, kernel='rbf', gamma='scale', random_state=42)

    # Time only the fit call.
    start_time = time.time()
    svm_model.fit(X_train, y_train)
    end_time = time.time()

    print(f"Training completed in {end_time - start_time:.2f} seconds.")

    # --- 3. PREDICTION & PERFORMANCE ---
    print("\nEvaluating the model...")
    y_pred = svm_model.predict(X_test)

    # Overall accuracy on the held-out set
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")

    # Per-class precision / recall / f1
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Cat', 'Dog']))

Processing 4000 images...


Extracting Features: 100%|██████████| 4000/4000 [00:10<00:00, 395.26it/s]



Label distribution - Cats (0): 1980, Dogs (1): 2020
Data loaded successfully. Total samples: 4000
Training samples: 3200
Testing samples: 800

Training the SVM classifier...
Training completed in 9.06 seconds.

Evaluating the model...
Accuracy: 75.25%

Classification Report:
              precision    recall  f1-score   support

         Cat       0.74      0.77      0.75       396
         Dog       0.76      0.74      0.75       404

    accuracy                           0.75       800
   macro avg       0.75      0.75      0.75       800
weighted avg       0.75      0.75      0.75       800

