# Confusion Matrix for Similarity Matching Model

This notebook creates a confusion matrix to evaluate the OpenCV similarity matching model performance on test data.

In [7]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import cv2
import json
import pickle
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, which also hides NumPy RuntimeWarnings such as divide-by-zero in the
# normalisation cells further down — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Seed NumPy's legacy global RNG so any np.random-based step is repeatable.
np.random.seed(42)

print("Libraries loaded successfully!")

Libraries loaded successfully!


## 1. Define CV2SimilarityClassifier Class

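This class must be defined in the notebook before the model is loaded: pickle stores only a reference to the class (its module and name), not its code, so unpickling fails unless a matching definition is available.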

In [8]:
class CV2SimilarityClassifier:
    """
    Nearest-template classifier built on OpenCV similarity measures.

    An image is assigned to the class whose template data scores highest under
    the selected similarity method(s): template matching, keypoint feature
    matching, histogram comparison, or the mean of all three.

    This class needs to be defined here so pickle can load saved models.

    Attributes:
        method: Which similarity measure(s) to use (see ``__init__``).
        templates: Mapping of class index -> template data. The scoring code
            reads a ``'mean'`` image and an optional ``'descriptors'`` list
            from each entry. Nothing in this class fills the mapping; it is
            expected to arrive with the unpickled state.
        classes_: Iterable of class indices to score against (``None`` until set).
        detector, matcher: Keypoint detector and descriptor matcher. Only
            created when ``method`` is 'features' or 'multi'.
    """

    def __init__(self, method='multi'):
        """
        Args:
            method: 'template' (cv2.matchTemplate), 'features' (keypoint matching),
                   'histogram' (histogram comparison), or 'multi' (combines all)
        """
        self.method = method
        self.templates = {}
        self.classes_ = None

        # Initialize feature detectors
        if method in ['features', 'multi']:
            try:
                # Try SIFT first (on some OpenCV builds it is only shipped with
                # opencv-contrib-python, in which case this call fails).
                self.detector = cv2.SIFT_create()
                self.matcher = cv2.BFMatcher()
            except Exception:
                # Fall back to ORB (built-in). `except Exception` rather than a
                # bare `except` so KeyboardInterrupt/SystemExit still propagate.
                self.detector = cv2.ORB_create()
                self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

    def _template_match_score(self, img, template):
        """Score ``img`` against ``template`` with cv2.matchTemplate.

        Both inputs are min-max normalised to [0, 1] as float32, matched with
        the normalised correlation-coefficient and normalised cross-correlation
        methods, and the best response of each method is averaged.

        Returns:
            Mean of the two peak responses.
        """
        # Normalize images
        img_norm = cv2.normalize(img.astype(np.float32), None, 0, 1, cv2.NORM_MINMAX)
        template_norm = cv2.normalize(template.astype(np.float32), None, 0, 1, cv2.NORM_MINMAX)

        # Try different matching methods
        methods = [
            cv2.TM_CCOEFF_NORMED,  # Normalized correlation coefficient
            cv2.TM_CCORR_NORMED,   # Normalized cross-correlation
        ]

        scores = []
        for method in methods:
            result = cv2.matchTemplate(img_norm, template_norm, method)
            scores.append(np.max(result))

        return np.mean(scores)

    def _feature_match_score(self, img, template_data):
        """Score ``img`` by keypoint-descriptor matches against a class.

        Args:
            img: Image handed to ``self.detector.detectAndCompute``.
            template_data: Template entry; its optional ``'descriptors'`` list
                holds one descriptor array per stored example (``None`` entries
                are skipped).

        Returns:
            ``matches / keypoints-in-img`` capped at 1.0, or 0.0 when the image
            yields fewer than 4 keypoints or nothing matches.
        """
        kp_img, desc_img = self.detector.detectAndCompute(img, None)

        if desc_img is None or len(kp_img) < 4:
            return 0.0

        best_matches = []
        for desc_template in template_data.get('descriptors', []):
            if desc_template is None:
                continue

            try:
                # Check if matcher has getCrossCheck method (ORB)
                if hasattr(self.matcher, 'getCrossCheck') and self.matcher.getCrossCheck():
                    # ORB with Hamming distance
                    matches = self.matcher.match(desc_img, desc_template)
                    matches = sorted(matches, key=lambda x: x.distance)
                    best_matches.extend(matches[:20])  # Top 20 matches
                else:
                    # SIFT with ratio test
                    matches = self.matcher.knnMatch(desc_img, desc_template, k=2)
                    good_matches = []
                    for match_pair in matches:
                        if len(match_pair) == 2:
                            m, n = match_pair
                            if m.distance < 0.75 * n.distance:  # Lowe's ratio test
                                good_matches.append(m)
                    best_matches.extend(good_matches)
            except Exception:
                # Best effort: a descriptor set that cannot be matched is
                # skipped. Deliberately not a bare `except`, so that Ctrl-C
                # during a long prediction run is not swallowed here.
                continue

        if len(best_matches) == 0:
            return 0.0

        # Score based on number of good matches
        match_score = len(best_matches) / max(len(kp_img), 1)
        return min(match_score, 1.0)

    def _histogram_match_score(self, img, template):
        """Score ``img`` against ``template`` by comparing intensity histograms.

        256-bin histograms of channel 0 are min-max normalised to [0, 1] and
        compared with correlation, intersection and Bhattacharyya distance
        (the latter converted to a similarity as ``1 - min(d, 1)``).

        Returns:
            Mean of the three comparison values.

        NOTE(review): the intersection of min-max-normalised histograms is a sum
        over bins and is not confined to [0, 1], so this mean can exceed 1.
        Left unchanged so scores stay consistent with the saved model.
        """
        # Calculate histograms
        hist_img = cv2.calcHist([img], [0], None, [256], [0, 256])
        hist_template = cv2.calcHist([template], [0], None, [256], [0, 256])

        # Normalize
        cv2.normalize(hist_img, hist_img, 0, 1, cv2.NORM_MINMAX)
        cv2.normalize(hist_template, hist_template, 0, 1, cv2.NORM_MINMAX)

        # Compare using multiple methods
        methods = [
            cv2.HISTCMP_CORREL,      # Correlation
            cv2.HISTCMP_INTERSECT,   # Intersection
            cv2.HISTCMP_BHATTACHARYYA # Bhattacharyya distance
        ]

        scores = []
        for method in methods:
            score = cv2.compareHist(hist_img, hist_template, method)
            if method == cv2.HISTCMP_BHATTACHARYYA:
                # Lower is better for Bhattacharyya, convert to similarity
                score = 1.0 - min(score, 1.0)
            scores.append(score)

        return np.mean(scores)

    def _compute_similarity(self, img, template_data):
        """Compute similarity between ``img`` and one class.

        Runs every scorer enabled by ``self.method`` and averages the results.

        Args:
            img: Preprocessed query image.
            template_data: Template entry for the class; must contain ``'mean'``.

        Returns:
            Mean of the enabled scores, or 0.0 if ``self.method`` enables none.
        """
        template = template_data['mean']
        scores = []

        if self.method in ['template', 'multi']:
            scores.append(self._template_match_score(img, template))

        if self.method in ['features', 'multi']:
            scores.append(self._feature_match_score(img, template_data))

        if self.method in ['histogram', 'multi']:
            scores.append(self._histogram_match_score(img, template))

        return np.mean(scores) if scores else 0.0

    def predict(self, X):
        """Predict class for each image in X.

        Every image is scored against every class in ``self.classes_``; the
        class with the highest similarity wins (first one on ties, in
        ``classes_`` order).

        Returns:
            np.ndarray of predicted class indices, one per image.
        """
        predictions = []
        for img in X:
            similarities = {}
            for cls_idx in self.classes_:
                template_data = self.templates[cls_idx]
                similarity = self._compute_similarity(img, template_data)
                similarities[cls_idx] = similarity

            # Get class with highest similarity
            predicted_class = max(similarities, key=similarities.get)
            predictions.append(predicted_class)

        return np.array(predictions)

print("CV2SimilarityClassifier class defined!")

CV2SimilarityClassifier class defined!


## 2. Load Model and Configuration

In [9]:
# Load model info and label mappings
model_dir = Path('./models/similarity')

# Load model info (explicit UTF-8 so the result does not depend on the OS locale)
with open(model_dir / 'model_info.json', 'r', encoding='utf-8') as f:
    model_info = json.load(f)

# Load label mappings
with open(model_dir / 'label_mappings.json', 'r', encoding='utf-8') as f:
    label_mappings = json.load(f)

# Extract information
# JSON object keys are always strings, so convert the indices back to int.
idx_to_label = {int(k): v for k, v in label_mappings['idx_to_label'].items()}
label_to_idx = label_mappings['label_to_idx']
num_classes = model_info['num_classes']
img_size = model_info.get('image_size', 64)
use_sobel = model_info.get('use_sobel', False)
method = model_info.get('method', 'multi')

print(f"\nModel Configuration:")
print(f"  Image size: {img_size}x{img_size}")
print(f"  Number of classes: {num_classes}")
print(f"  Method: {method}")
print(f"  Use Sobel: {use_sobel}")
print(f"  Test accuracy (from training): {model_info.get('accuracy', 'N/A')}")

# Load the classifier from pickle
classifier_path = model_dir / 'classifier.pkl'
if not classifier_path.exists():
    raise FileNotFoundError(f"Classifier file not found: {classifier_path}")

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load classifier.pkl files that come from a trusted source.
with open(classifier_path, 'rb') as f:
    classifier = pickle.load(f)

# Recreate detector and matcher if needed (they might not be pickleable).
# Both are checked: a classifier with a detector but no matcher would otherwise
# be reported as ready and then fail inside feature matching.
if method in ['features', 'multi']:
    detector_missing = getattr(classifier, 'detector', None) is None
    matcher_missing = getattr(classifier, 'matcher', None) is None
    if detector_missing or matcher_missing:
        # NOTE(review): the detector chosen here should be the same kind that
        # produced the stored template descriptors — confirm against training.
        try:
            classifier.detector = cv2.SIFT_create()
            classifier.matcher = cv2.BFMatcher()
            print("✓ Using SIFT detector")
        except Exception:
            # `except Exception` (not bare) so Ctrl-C is never swallowed here.
            classifier.detector = cv2.ORB_create()
            classifier.matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
            print("✓ Using ORB detector")
    else:
        print("✓ Detector already loaded")

print(f"\n✓ Model loaded successfully from {classifier_path}")


Model Configuration:
  Image size: 64x64
  Number of classes: 340
  Method: multi
  Use Sobel: True
  Test accuracy (from training): 0.162
✓ Using SIFT detector

✓ Model loaded successfully from models/similarity/classifier.pkl


## 3. Define Preprocessing Functions

In [10]:
def apply_sobel(img):
    """Return the Sobel gradient magnitude of ``img`` as an 8-bit edge map.

    Sobel is a classical, hand-designed filter (its kernel weights are fixed,
    NOT learned as in a CNN). The derivative along x responds to intensity
    changes in the horizontal direction and the derivative along y to changes
    in the vertical direction; the two are combined into one edge strength.

    Args:
        img: Single-channel image array.

    Returns:
        uint8 array of the gradient magnitude, min-max scaled to 0-255.
    """
    # First-order derivatives along each axis (3x3 kernel, float64 result)
    grad_x = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=3)

    # Euclidean length of the (grad_x, grad_y) vector at every pixel
    edge_strength = np.sqrt(np.square(grad_x) + np.square(grad_y))

    # Stretch to the full 0-255 range before dropping to 8 bits
    stretched = cv2.normalize(edge_strength, None, 0, 255, cv2.NORM_MINMAX)
    return stretched.astype(np.uint8)

def preprocess_image(img_path, image_size, use_sobel=False):
    """
    Read one image from disk and bring it into the form the matcher expects.

    Steps: load as grayscale, resize to ``image_size`` x ``image_size``, invert
    predominantly bright images so the background ends up dark, and optionally
    replace the image with its Sobel edge map.

    Args:
        img_path: Path of the image file.
        image_size: Side length of the square output.
        use_sobel: Apply ``apply_sobel`` as the last step (use this when the
            model was trained on Sobel-filtered images).

    Returns:
        The preprocessed image array, or ``None`` if the file cannot be read.
    """
    gray = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)
    if gray is None:
        # cv2.imread signals an unreadable file by returning None
        return None

    target_shape = (image_size, image_size)
    gray = cv2.resize(gray, target_shape)

    # Mean above mid-grey => mostly bright picture => flip it
    mostly_bright = gray.mean() > 127
    if mostly_bright:
        gray = 255 - gray

    return apply_sobel(gray) if use_sobel else gray

print("Preprocessing functions defined!")

Preprocessing functions defined!


## 4. Load Test Data

In [11]:
# Setup paths
# Root folder of the doodle dataset, relative to the working directory; the
# loader defined below reads one sub-folder per category from it.
doodle_dir = Path('doodles/doodle')

# Configuration
# Upper bound on images read per category — a category with fewer PNG files
# simply contributes fewer samples.
SAMPLES_PER_CATEGORY = 200  # Number of test samples per category

def load_test_images(categories, samples_per_category, image_size, use_sobel=False):
    """
    Load test images from disk and preprocess them for similarity matching.
    Similarity matching uses grayscale images.

    Reads the notebook-level ``doodle_dir`` (dataset root) and ``label_to_idx``
    (labels known to the model). Categories that the model does not know, or
    that have no folder on disk, are skipped.

    Args:
        categories: Category (folder) names to load.
        samples_per_category: Maximum number of PNG files taken per category.
        image_size: Side length passed to ``preprocess_image``.
        use_sobel: Forwarded to ``preprocess_image``.

    Returns:
        Tuple ``(images, labels, image_paths)``: an array of preprocessed
        images, an array of category names, and the list of file paths, all in
        the same order.

    NOTE(review): nothing here checks that the selected files were held out
    from training — confirm against the training notebook's split.
    """
    images = []
    labels = []
    image_paths = []
    unreadable = 0

    for category in tqdm(categories, desc="Loading test images"):
        if category not in label_to_idx:
            continue

        category_path = doodle_dir / category
        if not category_path.exists():
            continue

        # glob() yields files in arbitrary, filesystem-dependent order. Sort
        # first so the same subset is evaluated on every run and machine.
        image_files = sorted(category_path.glob('*.png'))[:samples_per_category]

        for img_path in image_files:
            try:
                img = preprocess_image(img_path, image_size, use_sobel)
            except Exception as e:
                print(f"Error loading {img_path}: {e}")
                continue

            if img is None:
                # File exists but could not be decoded; count it rather than
                # dropping it silently.
                unreadable += 1
                continue

            images.append(img)
            labels.append(category)
            image_paths.append(str(img_path))

    if unreadable:
        print(f"Skipped {unreadable} unreadable image file(s)")

    return np.array(images), np.array(labels), image_paths

# Get all categories that are in the model
if not doodle_dir.is_dir():
    # Same exception type iterdir() would raise, but with the resolved path so
    # a wrong working directory is obvious.
    raise FileNotFoundError(f"Doodle directory not found: {doodle_dir.resolve()}")

all_categories = sorted(d.name for d in doodle_dir.iterdir() if d.is_dir())
model_categories = [cat for cat in all_categories if cat in label_to_idx]

print(f"Loading test data from {len(model_categories)} categories...")
print(f"Samples per category: {SAMPLES_PER_CATEGORY}")
print(f"Using Sobel preprocessing: {use_sobel}")

# Load test images
X_test_images, y_test_labels, test_paths = load_test_images(
    model_categories, SAMPLES_PER_CATEGORY, img_size, use_sobel
)

# Fail here, with a clear message, instead of inside the metric functions of a
# later cell when nothing could be loaded.
if len(X_test_images) == 0:
    raise ValueError(
        f"No test images were loaded from {doodle_dir} - check the dataset path "
        "and that its category folders match the model's labels"
    )

# Encode labels
y_test_encoded = np.array([label_to_idx[label] for label in y_test_labels])

print(f"\nLoaded {len(X_test_images)} test images")
print(f"Number of classes in test set: {len(np.unique(y_test_encoded))}")

Loading test data from 340 categories...
Samples per category: 200
Using Sobel preprocessing: True


Loading test images:   2%|▏         | 8/340 [00:00<00:39,  8.40it/s]


KeyboardInterrupt: 

## 5. Make Predictions

In [None]:
# Run the classifier over the whole test set
for status_line in (
    "Making predictions on test set...",
    "Note: Similarity matching can be slow as it compares each image to all templates.",
):
    print(status_line)

y_pred_encoded = classifier.predict(X_test_images)

# Overall (top-1) accuracy
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
accuracy_pct = accuracy * 100
print(f"\nTest Accuracy: {accuracy:.4f} ({accuracy_pct:.2f}%)")

# Human-readable label names for the true and predicted class indices
y_test_label_names = list(map(idx_to_label.__getitem__, y_test_encoded))
y_pred_label_names = list(map(idx_to_label.__getitem__, y_pred_encoded))

Making predictions on test set...
Note: Similarity matching can be slow as it compares each image to all templates.


KeyboardInterrupt: 

## 6. Create Confusion Matrix

In [None]:
# Build the confusion matrix over every class index the model knows, so the
# matrix is num_classes x num_classes even if some classes never occur.
all_class_indices = list(range(num_classes))
cm = confusion_matrix(y_test_encoded, y_pred_encoded, labels=all_class_indices)

# The diagonal holds the correctly classified samples
n_test_samples = len(y_test_encoded)
n_correct = np.trace(cm)

print(f"Confusion matrix shape: {cm.shape}")
print(f"Total test samples: {n_test_samples}")
print(f"Correct predictions: {n_correct}")
print(f"Accuracy from confusion matrix: {n_correct / n_test_samples:.4f}")

## 7. Visualize Confusion Matrix (Full)

In [None]:
# For 340 classes, the full confusion matrix is too large to visualize clearly
# We'll create a normalized version and show statistics

# Normalize confusion matrix row-wise (each row then sums to 1).
# Rows of classes without any test sample are left at 0 explicitly instead of
# computing 0/0 and cleaning up the resulting NaNs afterwards.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm, row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals > 0,
)

# Create figure with subplots
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Plot raw confusion matrix (log scale for better visibility)
cm_log = np.log1p(cm)  # log(1+x) to handle zeros
im1 = axes[0].imshow(cm_log, cmap='Blues', aspect='auto')
axes[0].set_title(f'Confusion Matrix (Log Scale)\nAccuracy: {accuracy:.4f}', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Predicted Label', fontsize=12)
axes[0].set_ylabel('True Label', fontsize=12)
fig.colorbar(im1, ax=axes[0])

# Plot normalized confusion matrix
im2 = axes[1].imshow(cm_normalized, cmap='Blues', aspect='auto', vmin=0, vmax=1)
axes[1].set_title('Normalized Confusion Matrix\n(Row-wise percentages)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Predicted Label', fontsize=12)
axes[1].set_ylabel('True Label', fontsize=12)
fig.colorbar(im2, ax=axes[1])

fig.tight_layout()
plt.show()

print("\nNote: Due to the large number of classes (340), individual labels are not shown.")
print("The diagonal represents correct predictions.")

## 8. Visualize Top Confused Classes

In [None]:
# Find classes with most confusion (off-diagonal elements)
off_diagonal = cm.copy()
np.fill_diagonal(off_diagonal, 0)

# Get top N most confused classes
top_n = 20
class_totals = cm.sum(axis=1)
class_correct = np.diag(cm)
# Misclassified samples per true class = sum of the off-diagonal row entries
class_errors = off_diagonal.sum(axis=1)

# Get indices sorted by error count (largest first)
top_error_indices = np.argsort(class_errors)[-top_n:][::-1]

# Create submatrix for top confused classes
cm_subset = cm[np.ix_(top_error_indices, top_error_indices)]
labels_subset = [idx_to_label[idx] for idx in top_error_indices]

# Normalize each row by the class's FULL sample count, not by the sum over the
# selected columns only. Dividing by the subset row sum ignores predictions
# that fall outside the top-N classes and therefore overstates every cell
# (including the diagonal, i.e. the apparent accuracy).
subset_totals = class_totals[top_error_indices][:, np.newaxis]
cm_subset_normalized = np.divide(
    cm_subset, subset_totals,
    out=np.zeros(cm_subset.shape, dtype=float),
    where=subset_totals > 0,
)

# Plot subset confusion matrix
plt.figure(figsize=(16, 14))

sns.heatmap(cm_subset_normalized,
            annot=True,
            fmt='.2f',
            cmap='Blues',
            xticklabels=labels_subset,
            yticklabels=labels_subset,
            cbar_kws={'label': 'Normalized Count'})

plt.title(f'Confusion Matrix - Top {top_n} Most Confused Classes\n(Normalized by row)',
          fontsize=16, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Print statistics for top confused classes
print(f"\nTop {top_n} classes with most errors:")
print("=" * 80)
for idx in top_error_indices:
    label = idx_to_label[idx]
    total = class_totals[idx]
    correct = class_correct[idx]
    errors = class_errors[idx]
    accuracy_class = correct / total if total > 0 else 0
    print(f"{label:30s} | Total: {total:4d} | Correct: {correct:4d} | Errors: {errors:4d} | Acc: {accuracy_class:.2%}")

## 9. Per-Class Accuracy Statistics

In [None]:
# Calculate per-class accuracy for every class that has at least one test
# sample. The row sums and the diagonal are computed once, instead of
# re-summing the whole matrix on every loop iteration.
samples_per_class = cm.sum(axis=1)
correct_per_class = np.diag(cm)
evaluated_classes = np.flatnonzero(samples_per_class > 0)

# Convert to DataFrame (columns exist even when no class was evaluated)
df_acc = pd.DataFrame({
    'class_idx': evaluated_classes,
    'label': [idx_to_label[int(idx)] for idx in evaluated_classes],
    'total_samples': samples_per_class[evaluated_classes],
    'correct': correct_per_class[evaluated_classes],
    'accuracy': correct_per_class[evaluated_classes] / samples_per_class[evaluated_classes],
})
df_acc = df_acc.sort_values('accuracy')

mean_class_acc = df_acc['accuracy'].mean()
median_class_acc = df_acc['accuracy'].median()

# Display statistics
print("Per-Class Accuracy Statistics:")
print("=" * 80)
print(f"Mean accuracy: {mean_class_acc:.4f}")
print(f"Median accuracy: {median_class_acc:.4f}")
print(f"Std deviation: {df_acc['accuracy'].std():.4f}")
# df_acc is sorted ascending, so the tail holds the best classes
print(f"\nBest performing classes:")
print(df_acc.tail(10)[['label', 'total_samples', 'correct', 'accuracy']].to_string(index=False))
print(f"\nWorst performing classes:")
print(df_acc.head(10)[['label', 'total_samples', 'correct', 'accuracy']].to_string(index=False))

# Plot distribution of accuracies
plt.figure(figsize=(12, 6))
plt.hist(df_acc['accuracy'], bins=50, edgecolor='black', alpha=0.7)
plt.xlabel('Per-Class Accuracy', fontsize=12)
plt.ylabel('Number of Classes', fontsize=12)
plt.title('Distribution of Per-Class Accuracies', fontsize=14, fontweight='bold')
plt.axvline(mean_class_acc, color='red', linestyle='--', label=f'Mean: {mean_class_acc:.3f}')
plt.axvline(median_class_acc, color='green', linestyle='--', label=f'Median: {median_class_acc:.3f}')
plt.legend()
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 10. Classification Report

In [None]:
# Generate classification report
class_names = [idx_to_label[i] for i in range(num_classes)]

# Restrict the report to the class indices that actually occur in the true or
# predicted labels and pass the matching names. Handing over names for all
# num_classes classes without `labels=` fails as soon as one class is absent
# from both arrays, because the number of names no longer matches.
report_labels = np.union1d(y_test_encoded, y_pred_encoded)
report_names = [idx_to_label[int(idx)] for idx in report_labels]

report = classification_report(y_test_encoded, y_pred_encoded,
                               labels=report_labels,
                               target_names=report_names,
                               output_dict=True,
                               zero_division=0)

print("Classification Report:")
print("=" * 80)
print(f"Overall Accuracy: {report['accuracy']:.4f}")
print(f"\nMacro Average:")
print(f"  Precision: {report['macro avg']['precision']:.4f}")
print(f"  Recall: {report['macro avg']['recall']:.4f}")
print(f"  F1-Score: {report['macro avg']['f1-score']:.4f}")
print(f"\nWeighted Average:")
print(f"  Precision: {report['weighted avg']['precision']:.4f}")
print(f"  Recall: {report['weighted avg']['recall']:.4f}")
print(f"  F1-Score: {report['weighted avg']['f1-score']:.4f}")

## 11. Top-3 Accuracy Analysis

In [None]:
# Top-3 accuracy: a sample counts as correct when its true class is among the
# three most similar classes.
print("Calculating top-3 accuracy...")
print("Note: This requires computing similarities for all classes per image.")

top3_correct = 0

for i in tqdm(range(len(X_test_images)), desc="Top-3 predictions"):
    img = X_test_images[i]
    true_label = y_test_encoded[i]

    # Similarity of this image to every class template
    similarities = {
        cls_idx: classifier._compute_similarity(img, classifier.templates[cls_idx])
        for cls_idx in classifier.classes_
    }

    # Class indices ordered from most to least similar, cut to the best three
    top3_classes = sorted(similarities, key=similarities.get, reverse=True)[:3]

    top3_correct += int(true_label in top3_classes)

top3_accuracy = top3_correct / len(y_test_encoded)
top3_gain = top3_accuracy - accuracy
print(f"\nTop-3 Accuracy: {top3_accuracy:.4f} ({top3_accuracy*100:.2f}%)")
print(f"Top-1 Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Improvement: {top3_gain:.4f} ({top3_gain*100:.2f}%)")

## 12. Save Results

In [None]:
# Persist the evaluation artefacts under results/similarity
output_dir = Path('results/similarity')
output_dir.mkdir(parents=True, exist_ok=True)

# Raw and row-normalized confusion matrices as .npy
np.save(output_dir / 'confusion_matrix.npy', cm)
np.save(output_dir / 'confusion_matrix_normalized.npy', cm_normalized)

# Per-class accuracy table
df_acc.to_csv(output_dir / 'per_class_accuracy.csv', index=False)

# Summary statistics; everything is cast to plain Python types so it is JSON
# serializable
per_class_acc = df_acc['accuracy']
summary = {
    'test_accuracy': float(accuracy),
    'test_top3_accuracy': float(top3_accuracy),
    'num_test_samples': int(len(y_test_encoded)),
    'num_classes': int(num_classes),
    'method': method,
    'use_sobel': use_sobel,
    'mean_per_class_accuracy': float(per_class_acc.mean()),
    'median_per_class_accuracy': float(per_class_acc.median()),
    'std_per_class_accuracy': float(per_class_acc.std()),
}
# Averaged precision / recall / F1 from the classification report, appended in
# the order macro -> weighted
for report_key, key_prefix in (('macro avg', 'macro_avg'), ('weighted avg', 'weighted_avg')):
    for metric_name, key_suffix in (('precision', 'precision'), ('recall', 'recall'), ('f1-score', 'f1')):
        summary[f'{key_prefix}_{key_suffix}'] = float(report[report_key][metric_name])

with open(output_dir / 'evaluation_summary.json', 'w') as f:
    json.dump(summary, f, indent=2)

print(f"\nResults saved to {output_dir}/")
for saved_name in (
    'confusion_matrix.npy',
    'confusion_matrix_normalized.npy',
    'per_class_accuracy.csv',
    'evaluation_summary.json',
):
    print(f"  - {saved_name}")