In [None]:
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import torch
from torchvision import transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_recall_curve, average_precision_score
from pathlib import Path
import json
from PIL import Image
from collections import defaultdict

In [None]:
# Evaluation settings: paths, device, class count and metric thresholds
class EvaluationConfig:
    """Bundle of the paths, device and thresholds read by the evaluation cells."""

    def __init__(self):
        # --- Metric settings ---
        # NOTE(review): np.arange with a float step and stop=1.01 appears to
        # overshoot 1.0 (last value ~1.005); the plotting cells rely on this
        # exact grid length, so it is left unchanged here.
        self.CONFIDENCE_THRESHOLDS = np.arange(0.0, 1.01, 0.005)
        # Minimum IoU for a prediction to count as matching a ground-truth box
        self.IOU_THRESHOLD = 0.3

        # --- Model settings ---
        use_gpu = torch.cuda.is_available()
        self.DEVICE = torch.device('cuda') if use_gpu else torch.device('cpu')
        self.NUM_CLASSES = 13  # 12 classes + background

        # --- Filesystem locations (relative to the working directory) ---
        self.DATA_ROOT = Path("./data")
        self.DATA_YAML = "./data/data.yaml"
        self.CHECKPOINT_DIR = Path(".")
        self.RESULTS_DIR = Path("./results/faster_rcnn_evaluation")


config = EvaluationConfig()

# Index 0 is the detector's background class; indices 1..12 are the pest classes
class_names = [
    'background',
    'Ants', 'Bees', 'Beetles', 'Caterpillars',
    'Earthworms', 'Earwigs', 'Grasshoppers', 'Moths',
    'Slugs', 'Snails', 'Wasps', 'Weevils',
]

# Echo the resolved locations so a misconfigured working directory is obvious
print(f"Current directory: {Path.cwd()}")
print(f"Data root: {config.DATA_ROOT.resolve()}")
print(f"Results will be saved to: {config.RESULTS_DIR.resolve()}")
print(f"Checkpoint dir: {config.CHECKPOINT_DIR.resolve()}")
print(f"Classes ({len(class_names)-1}): {class_names[1:]}")

In [None]:
# Function to load the trained Faster R-CNN model
def load_trained_model():
    """Build the Faster R-CNN architecture and restore trained weights from disk.

    The first existing file from a fixed, ordered list of candidate checkpoints
    inside ``config.CHECKPOINT_DIR`` is loaded.

    Returns:
        The model, moved to ``config.DEVICE`` and switched to eval mode.

    Raises:
        FileNotFoundError: if none of the candidate checkpoint files exists.
    """
    def _format_metric(value):
        # Losses may be absent (the 'Unknown' placeholder) or non-numeric;
        # applying ':.4f' to such a value raises, so fall back to str().
        try:
            return f"{value:.4f}"
        except (TypeError, ValueError):
            return str(value)

    # Create model architecture with a box head sized for our classes
    model = fasterrcnn_resnet50_fpn(weights=None)
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, config.NUM_CLASSES)

    # Candidate checkpoints, in order of preference
    checkpoint_paths = [
        config.CHECKPOINT_DIR / 'faster_rcnn_checkpoint_epoch_12.pth',
        config.CHECKPOINT_DIR / 'best_faster_rcnn_model.pth',
        config.CHECKPOINT_DIR / 'faster_rcnn_checkpoint_epoch_10.pth',
        config.CHECKPOINT_DIR / 'faster_rcnn_checkpoint_epoch_8.pth',
    ]

    # Use the first candidate that exists on disk
    checkpoint_path = next((path for path in checkpoint_paths if path.exists()), None)

    # If no checkpoint found, raise an error listing available .pth files
    if checkpoint_path is None:
        available = list(config.CHECKPOINT_DIR.glob('*.pth'))
        raise FileNotFoundError(
            f"Could not find checkpoint. Available .pth files:\n" +
            "\n".join([f"  - {p}" for p in available])
        )
    print(f"Found checkpoint: {checkpoint_path}")

    # SECURITY: weights_only=False unpickles arbitrary Python objects; only
    # load checkpoint files that come from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location=config.DEVICE, weights_only=False)

    # Training checkpoints wrap the weights under 'model_state_dict'; accept a
    # bare state_dict as well so a weights-only file does not raise KeyError.
    if 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint
    model.load_state_dict(state_dict)
    model.to(config.DEVICE)
    model.eval()

    print(f"Model loaded from: {checkpoint_path}")
    print(f"Epoch: {checkpoint.get('epoch', 'Unknown')}")
    print(f"Training loss: {_format_metric(checkpoint.get('train_loss', 'Unknown'))}")
    print(f"Validation loss: {_format_metric(checkpoint.get('valid_loss', 'Unknown'))}")

    return model

# Load the model
trained_model = load_trained_model()

In [None]:
# Helper: parse one YOLO label file into pixel-space boxes
def _parse_yolo_label_file(label_file, img_width, img_height):
    """Convert a YOLO label file into corner-format boxes and 1-indexed labels.

    Each usable line is ``class cx cy w h`` with coordinates normalised to the
    image size; lines with fewer than five fields are ignored.

    Returns:
        (boxes, labels): ``boxes`` is a list of ``[x1, y1, x2, y2]`` in pixels,
        ``labels`` the matching class ids shifted by +1 (0 is background).

    Raises:
        OSError: if the file cannot be read.
        ValueError: if a field cannot be parsed as a number.
    """
    boxes = []
    labels = []
    with open(label_file, 'r') as f:
        for raw_line in f:
            parts = raw_line.split()
            if len(parts) < 5:
                continue

            class_id = int(parts[0]) + 1  # Convert to 1-indexed
            center_x = float(parts[1]) * img_width
            center_y = float(parts[2]) * img_height
            width = float(parts[3]) * img_width
            height = float(parts[4]) * img_height

            boxes.append([
                center_x - width / 2,
                center_y - height / 2,
                center_x + width / 2,
                center_y + height / 2,
            ])
            labels.append(class_id)
    return boxes, labels


# Function to load ground truth annotations from test set
def load_ground_truth_annotations():
    """Load YOLO-format test labels and convert them to pixel-space boxes.

    Returns:
        dict mapping ``str(image_path)`` to ``{'boxes': ndarray, 'labels': ndarray}``.
        Images whose label file yields no boxes are not included, so
        background-only images do not take part in the evaluation.
    """
    # Directories for test labels and images
    test_labels_dir = config.DATA_ROOT / 'test' / 'labels'
    test_images_dir = config.DATA_ROOT / 'test' / 'images'

    # Sorted so the processing order (and printed diagnostics) is reproducible
    label_files = sorted(test_labels_dir.glob('*.txt'))

    annotations = {}
    images_not_found = 0
    labels_skipped = 0

    # Process each label file
    for label_file in label_files:
        img_name = label_file.stem

        # Find corresponding image
        img_path = None
        for ext in ['.jpg', '.jpeg', '.png']:
            potential_path = test_images_dir / f"{img_name}{ext}"
            if potential_path.exists():
                img_path = potential_path
                break

        # If no image found, report the first few misses and continue
        if img_path is None:
            images_not_found += 1
            if images_not_found <= 3:
                print(f"   No matching image found for label file: {label_file.name}")
            continue

        # Only the dimensions are needed; close the file handle right away
        with Image.open(img_path) as img:
            img_width, img_height = img.size

        # Parse YOLO format annotations; a malformed file skips the whole image
        try:
            boxes, labels = _parse_yolo_label_file(label_file, img_width, img_height)
        except (OSError, ValueError) as e:
            labels_skipped += 1
            print(f"   Skipping unreadable label file {label_file.name}: {e}")
            continue

        # Store annotations if any boxes found
        if len(boxes) > 0:
            annotations[str(img_path)] = {
                'boxes': np.array(boxes),
                'labels': np.array(labels)
            }

    print(f"\nSummary:")
    print(f"   Total label files: {len(label_files)}")
    print(f"   Images not found: {images_not_found}")
    if labels_skipped:
        print(f"   Label files skipped (unreadable): {labels_skipped}")
    print(f"   Loaded {len(annotations)} test images with annotations")
    return annotations

# Load ground truth
gt_annotations = load_ground_truth_annotations()

In [None]:
# Function to run the model on test set and collect predictions
def evaluate_model():
    """Run ``trained_model`` on every annotated test image and gather detections.

    Returns:
        list of dicts with keys 'image', 'box', 'score' and 'label'; boxes are
        rescaled from the 640x640 network input back to original image pixels.
    """
    side = 640  # square input resolution fed to the network

    # NOTE(review): torchvision detection models also resize/normalise inputs
    # internally; confirm this explicit Normalize matches the training pipeline.
    transform = transforms.Compose([
        transforms.Resize((side, side)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    all_predictions = []

    # One forward pass per annotated image
    for img_path in gt_annotations:
        try:
            image = Image.open(img_path).convert('RGB')
            orig_width, orig_height = image.size
            batch = transform(image).unsqueeze(0).to(config.DEVICE)

            # Inference only: no gradients needed
            with torch.no_grad():
                detections = trained_model(batch)[0]

            det_boxes = detections['boxes'].cpu().numpy()
            det_scores = detections['scores'].cpu().numpy()
            det_labels = detections['labels'].cpu().numpy()

            # Map x/y coordinates from the resized input back to the original image
            if len(det_boxes) > 0:
                det_boxes[:, [0, 2]] *= orig_width / side
                det_boxes[:, [1, 3]] *= orig_height / side

            # One record per detection
            all_predictions.extend(
                {'image': img_path, 'box': box, 'score': score, 'label': label}
                for box, score, label in zip(det_boxes, det_scores, det_labels)
            )

        except Exception as e:
            # Keep going: a single unreadable image should not abort the run
            print(f"Error processing {img_path}: {e}")
            continue

    print(f"Evaluation complete: {len(all_predictions)} total predictions")
    return all_predictions

# Run evaluation
predictions = evaluate_model()

In [None]:
# Intersection-over-Union of two axis-aligned boxes
# Args:
#   - box1, box2: Lists or arrays of [x1, y1, x2, y2]
# Returns:
#   - IoU value (float)
def calculate_iou(box1, box2):
    """Return the IoU of two ``[x1, y1, x2, y2]`` boxes, or 0.0 if they do not overlap."""
    # Corners of the overlapping rectangle
    left, top = max(box1[0], box2[0]), max(box1[1], box2[1])
    right, bottom = min(box1[2], box2[2]), min(box1[3], box2[3])

    # Disjoint or merely touching boxes share no area
    if right <= left or bottom <= top:
        return 0.0

    overlap = (right - left) * (bottom - top)
    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    combined = first_area + second_area - overlap

    # Guard against a non-positive union from degenerate boxes
    if combined > 0:
        return overlap / combined
    return 0.0

# Helper: greedily match one image's predictions (single class) to its ground truth
def _count_matches(pred_boxes, gt_boxes, iou_threshold):
    """Count TP / FP / FN for one image and one class.

    Predictions are visited in the given order; each claims the still-unmatched
    ground-truth box with the highest IoU, provided that IoU reaches
    ``iou_threshold``. Each ground-truth box can be matched at most once.

    Returns:
        (tp, fp, fn) as ints.
    """
    matched_gt = set()
    tp = 0
    fp = 0

    for pred_box in pred_boxes:
        best_iou = 0.0
        best_gt_idx = -1

        # Find best matching ground truth box that is still free
        for gt_idx, gt_box in enumerate(gt_boxes):
            if gt_idx in matched_gt:
                continue

            iou = calculate_iou(pred_box, gt_box)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = gt_idx

        # best_gt_idx check keeps index -1 out of matched_gt when nothing overlaps
        if best_gt_idx >= 0 and best_iou >= iou_threshold:
            tp += 1
            matched_gt.add(best_gt_idx)
        else:
            fp += 1

    # Every ground-truth box left unmatched is a miss
    fn = len(gt_boxes) - len(matched_gt)
    return tp, fp, fn


# Function to calculate precision, recall, F1 at a given confidence threshold
# Args:
#   - conf_threshold: Confidence threshold to filter predictions
# Returns:
#   - metrics: Dictionary of precision, recall, F1 for each class
def calculate_metrics_at_confidence(conf_threshold=0.5):
    """Compute per-class precision, recall, F1 and TP/FP/FN counts.

    Reads the module-level ``predictions`` and ``gt_annotations``. Only
    predictions with ``score >= conf_threshold`` are considered, and a match
    requires IoU of at least ``config.IOU_THRESHOLD``.

    Returns:
        dict mapping class name to
        ``{'precision', 'recall', 'f1', 'tp', 'fp', 'fn'}``. A metric whose
        denominator is zero is reported as 0.0.
    """
    # Group predictions and ground truth once by class and image, instead of
    # re-scanning both collections for every class.
    preds_by_class = defaultdict(lambda: defaultdict(list))
    for p in predictions:
        if p['score'] >= conf_threshold:
            preds_by_class[int(p['label'])][p['image']].append(p)

    gt_by_class = defaultdict(lambda: defaultdict(list))
    for img_path, gt_data in gt_annotations.items():
        for box, label in zip(gt_data['boxes'], gt_data['labels']):
            gt_by_class[int(label)][img_path].append(box)

    metrics = {}

    # Process each class except background
    for class_idx in range(1, len(class_names)):
        class_name = class_names[class_idx]
        class_preds = preds_by_class.get(class_idx, {})
        class_gt = gt_by_class.get(class_idx, {})

        tp = 0
        fp = 0
        fn = 0

        # Visit every image that has a prediction or a ground-truth box for this class
        for img_path in set(class_preds) | set(class_gt):
            # Highest-confidence predictions claim ground truth first; the sort
            # is stable, so equal scores keep their original order.
            img_preds = sorted(class_preds.get(img_path, []),
                               key=lambda p: p['score'], reverse=True)
            img_tp, img_fp, img_fn = _count_matches(
                [p['box'] for p in img_preds],
                class_gt.get(img_path, []),
                config.IOU_THRESHOLD,
            )
            tp += img_tp
            fp += img_fp
            fn += img_fn

        # Calculate metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

        metrics[class_name] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'tp': tp,
            'fp': fp,
            'fn': fn
        }

    return metrics

# NOTE: despite the "_05" suffix these are the metrics at confidence 0.05, not 0.5
metrics_05 = calculate_metrics_at_confidence(0.05)

In [None]:
# Create results directory
config.RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Helper: label every prediction of one class as true or false positive
def _detection_outcomes_for_class(class_idx):
    """Match one class's predictions to ground truth using IoU.

    Within each image, predictions are visited from highest to lowest score and
    each may claim one still-unmatched ground-truth box of the same class whose
    IoU is at least ``config.IOU_THRESHOLD``.

    Returns:
        (scores, hits, n_gt): ``scores`` and ``hits`` are float arrays with one
        entry per prediction (``hits`` is 1.0 for a true positive, 0.0 for a
        false positive); ``n_gt`` is the number of ground-truth boxes of the class.
    """
    # Group this class's predictions by image once
    preds_by_image = defaultdict(list)
    for p in predictions:
        if p['label'] == class_idx:
            preds_by_image[p['image']].append(p)

    scores = []
    hits = []
    n_gt = 0

    for img_path, gt_data in gt_annotations.items():
        gt_boxes = [box for box, label in zip(gt_data['boxes'], gt_data['labels'])
                    if label == class_idx]
        n_gt += len(gt_boxes)

        img_preds = sorted(preds_by_image.get(img_path, []),
                           key=lambda p: p['score'], reverse=True)
        matched_gt = set()

        for pred in img_preds:
            best_iou = 0.0
            best_gt_idx = -1
            for gt_idx, gt_box in enumerate(gt_boxes):
                if gt_idx in matched_gt:
                    continue
                iou = calculate_iou(pred['box'], gt_box)
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = gt_idx

            is_match = best_gt_idx >= 0 and best_iou >= config.IOU_THRESHOLD
            if is_match:
                matched_gt.add(best_gt_idx)

            scores.append(float(pred['score']))
            hits.append(1.0 if is_match else 0.0)

    return np.asarray(scores, dtype=float), np.asarray(hits, dtype=float), n_gt


# Function to plot Precision-Recall curves for each class
def plot_precision_recall_curves():
    """Plot one precision-recall curve per class and save it as ``BoxPR_curve.png``.

    A prediction counts as a true positive only when it overlaps an unmatched
    ground-truth box of its class with sufficient IoU, and recall is measured
    against all ground-truth boxes of the class, so missed objects lower it.
    The legend shows the average precision, computed as the sum of
    ``(R_n - R_{n-1}) * P_n`` over the score-ranked predictions. Classes with
    no predictions are not drawn.
    """
    plt.figure(figsize=(12, 8))

    for class_idx in range(1, len(class_names)):
        class_name = class_names[class_idx]

        scores, hits, n_gt = _detection_outcomes_for_class(class_idx)
        if scores.size == 0:
            continue

        # Rank all predictions of the class by descending confidence
        order = np.argsort(-scores, kind='stable')
        ranked_hits = hits[order]

        cum_tp = np.cumsum(ranked_hits)
        cum_fp = np.cumsum(1.0 - ranked_hits)

        precision = cum_tp / (cum_tp + cum_fp)
        # With no ground truth for the class, recall is reported as zero
        recall = cum_tp / n_gt if n_gt > 0 else np.zeros_like(cum_tp)

        # Average precision: precision weighted by each step's recall increase
        recall_steps = np.diff(np.concatenate(([0.0], recall)))
        ap = float(np.sum(recall_steps * precision))

        plt.plot(recall, precision, label=f'{class_name} {ap:.3f}', linewidth=2)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)

    save_path = config.RESULTS_DIR / 'BoxPR_curve.png'
    plt.tight_layout()
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.show()

    print(f"PR curve saved: {save_path}")

# Generate plot
plot_precision_recall_curves()

In [None]:
# Function to plot F1-confidence curves
def plot_f1_confidence_curves():
    """Plot the class-averaged F1 score against the confidence threshold.

    The thresholds come from ``config.CONFIDENCE_THRESHOLDS`` for both the
    metric computation and the x-axis, so the two can never differ in length.
    The figure is saved as ``BoxF1_curve.png`` in ``config.RESULTS_DIR``.
    """
    plt.figure(figsize=(12, 8))

    thresholds = config.CONFIDENCE_THRESHOLDS
    all_f1_scores = []

    # Calculate F1 for each confidence threshold
    for conf_threshold in thresholds:
        metrics = calculate_metrics_at_confidence(conf_threshold)
        avg_f1 = np.mean([m['f1'] for m in metrics.values()]) if metrics else 0
        all_f1_scores.append(avg_f1)

    # Plot average F1; the legend shows the best value reached
    plt.plot(thresholds, all_f1_scores,
             linewidth=3, label=f'all classes {max(all_f1_scores):.2f}')

    plt.xlabel('Confidence')
    plt.ylabel('F1')
    plt.title('F1-Confidence Curve')
    plt.legend()
    plt.grid(True, alpha=0.3)

    save_path = config.RESULTS_DIR / 'BoxF1_curve.png'
    plt.tight_layout()
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.show()

    print(f"F1 curve saved: {save_path}")

# Generate plot
plot_f1_confidence_curves()

In [None]:
# Helper: draw one confusion-matrix heatmap and save it to disk
def _save_confusion_heatmap(matrix, fmt, title, save_path):
    """Render ``matrix`` as an annotated heatmap, save it to ``save_path`` and show it."""
    plt.figure(figsize=(12, 10))
    sns.heatmap(matrix, annot=True, fmt=fmt, cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)
    plt.title(title, fontsize=16)
    plt.ylabel('True Label', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)

    plt.tight_layout()
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.show()


# Function to generate confusion matrix
def generate_confusion_matrix(conf_threshold=0.05):
    """Build, plot and save the detection confusion matrix.

    Each prediction with ``score >= conf_threshold`` is paired with the
    still-unmatched ground-truth box of highest IoU in its image, whatever
    its class. Pairs reaching ``config.IOU_THRESHOLD`` contribute
    (true class, predicted class); other predictions are recorded against the
    background row, and unmatched ground truth against the background column.

    Args:
        conf_threshold: minimum score for a prediction to be counted.

    Returns:
        The raw count matrix (rows = true label, columns = predicted label),
        or None when there is nothing to tabulate.
    """
    y_true = []
    y_pred = []

    # Group the confident predictions by image once, instead of scanning the
    # full prediction list again for every image.
    preds_by_image = defaultdict(list)
    for p in predictions:
        if p['score'] >= conf_threshold:
            preds_by_image[p['image']].append(p)

    # Process each image
    for img_path, gt_data in gt_annotations.items():
        img_gt_boxes = gt_data['boxes']
        img_gt_labels = gt_data['labels']

        # Highest-confidence predictions claim ground truth first (stable sort)
        img_preds = sorted(preds_by_image.get(img_path, []),
                           key=lambda p: p['score'], reverse=True)

        matched_gt = set()

        for pred in img_preds:
            pred_label = int(pred['label'])

            best_iou = 0.0
            best_gt_idx = -1

            # Find best matching ground truth that is still free
            for gt_idx, gt_box in enumerate(img_gt_boxes):
                if gt_idx in matched_gt:
                    continue

                iou = calculate_iou(pred['box'], gt_box)
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = gt_idx

            if best_gt_idx >= 0 and best_iou >= config.IOU_THRESHOLD:
                # Localised an object: record true class vs predicted class
                y_true.append(int(img_gt_labels[best_gt_idx]))
                y_pred.append(pred_label)
                matched_gt.add(best_gt_idx)
            else:
                # No object under this prediction: background was the truth
                y_true.append(0)
                y_pred.append(pred_label)

        # Add false negatives (missed ground truth)
        for gt_idx, gt_label in enumerate(img_gt_labels):
            if gt_idx not in matched_gt:
                y_true.append(int(gt_label))
                y_pred.append(0)  # Predicted as background

    if len(y_true) == 0:
        print("WARNING: No data for confusion matrix!")
        return

    # Create confusion matrix over all classes, background included
    labels = list(range(len(class_names)))
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Raw counts
    _save_confusion_heatmap(cm, 'd', 'Confusion Matrix',
                            config.RESULTS_DIR / 'confusion_matrix.png')

    # Row-normalised; the small epsilon avoids division by zero for empty rows
    cm_normalized = cm.astype('float') / (cm.sum(axis=1)[:, np.newaxis] + 1e-8)
    _save_confusion_heatmap(cm_normalized, '.2f', 'Normalized Confusion Matrix',
                            config.RESULTS_DIR / 'confusion_matrix_normalized.png')

    print(f"Confusion matrices saved")
    return cm

# Generate confusion matrix
confusion_matrix_result = generate_confusion_matrix()

In [None]:
# Function to plot Recall-Confidence curves
def plot_recall_confidence_curves():
    """Plot class-averaged and per-class recall against the confidence threshold.

    Metrics are computed once per threshold in ``config.CONFIDENCE_THRESHOLDS``
    and reused for the average curve and every per-class curve. The figure is
    saved as ``BoxR_curve.png`` in ``config.RESULTS_DIR``.
    """
    plt.figure(figsize=(12, 8))

    thresholds = config.CONFIDENCE_THRESHOLDS

    # One metrics pass per threshold, shared by all curves below
    metrics_per_threshold = [calculate_metrics_at_confidence(t) for t in thresholds]

    all_recall_scores = [
        np.mean([m['recall'] for m in metrics.values()]) if metrics else 0
        for metrics in metrics_per_threshold
    ]

    # Plot average recall; the legend reports the best value and where it occurs
    plt.plot(thresholds, all_recall_scores,
             color='blue', linewidth=3,
             label=f'all classes {max(all_recall_scores):.2f} at {thresholds[np.argmax(all_recall_scores)]:.3f}')

    # tab20 rather than tab10: tab10 holds only 10 colours, so sampling it at
    # 12 points gives some classes the same colour.
    colors = plt.cm.tab20(np.linspace(0, 1, len(class_names)-1))

    # Plot recall for each class
    for class_idx in range(1, len(class_names)):
        class_name = class_names[class_idx]
        class_recall_scores = [
            metrics.get(class_name, {}).get('recall', 0)
            for metrics in metrics_per_threshold
        ]

        plt.plot(thresholds, class_recall_scores,
                color=colors[class_idx-1], label=class_name, alpha=0.7)

    plt.xlabel('Confidence', fontsize=12)
    plt.ylabel('Recall', fontsize=12)
    plt.title('Recall-Confidence Curve', fontsize=14)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.xlim([0, 1])
    plt.ylim([0, 1])

    save_path = config.RESULTS_DIR / 'BoxR_curve.png'
    plt.tight_layout()
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.show()

    print(f"Recall curve saved: {save_path}")

# Generate plot
plot_recall_confidence_curves()

In [None]:
# Function to plot Precision-Confidence curves
def plot_precision_confidence_curves():
    """Plot class-averaged and per-class precision against the confidence threshold.

    Metrics are computed once per threshold in ``config.CONFIDENCE_THRESHOLDS``
    and reused for the average curve and every per-class curve. The figure is
    saved as ``BoxP_curve.png`` in ``config.RESULTS_DIR``.
    """
    plt.figure(figsize=(12, 8))

    thresholds = config.CONFIDENCE_THRESHOLDS

    # One metrics pass per threshold, shared by all curves below
    metrics_per_threshold = [calculate_metrics_at_confidence(t) for t in thresholds]

    all_precision_scores = [
        np.mean([m['precision'] for m in metrics.values()]) if metrics else 0
        for metrics in metrics_per_threshold
    ]

    # Plot average precision; the legend reports the best value and where it occurs
    plt.plot(thresholds, all_precision_scores,
             color='blue', linewidth=3,
             label=f'all classes {max(all_precision_scores):.2f} at {thresholds[np.argmax(all_precision_scores)]:.3f}')

    # tab20 rather than tab10: tab10 holds only 10 colours, so sampling it at
    # 12 points gives some classes the same colour.
    colors = plt.cm.tab20(np.linspace(0, 1, len(class_names)-1))

    # Plot precision for each class
    for class_idx in range(1, len(class_names)):
        class_name = class_names[class_idx]
        class_precision_scores = [
            metrics.get(class_name, {}).get('precision', 0)
            for metrics in metrics_per_threshold
        ]

        plt.plot(thresholds, class_precision_scores,
                color=colors[class_idx-1], label=class_name, alpha=0.7)

    plt.xlabel('Confidence', fontsize=12)
    plt.ylabel('Precision', fontsize=12)
    plt.title('Precision-Confidence Curve', fontsize=14)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)
    plt.xlim([0, 1])
    plt.ylim([0, 1])

    save_path = config.RESULTS_DIR / 'BoxP_curve.png'
    plt.tight_layout()
    plt.savefig(save_path, dpi=100, bbox_inches='tight')
    plt.show()

    print(f"Precision curve saved: {save_path}")

# Generate plot
plot_precision_confidence_curves()

In [None]:
# Function to create complete evaluation summary matching YOLO output
def create_comprehensive_summary():
    """Assemble the evaluation summary, write it to JSON and return it.

    Overall precision/recall/F1 are unweighted means over the per-class entries
    of ``metrics_05``. The summary is written to ``evaluation_summary.json``
    inside ``config.RESULTS_DIR``.

    Returns:
        The summary dict that was serialised.
    """
    def macro_average(field):
        # Unweighted mean of one metric across all classes
        return float(np.mean([entry[field] for entry in metrics_05.values()]))

    # Every per-class value (including the tp/fp/fn counts) is stored as a float
    per_class = {}
    for name, values in metrics_05.items():
        per_class[name] = {field: float(number) for field, number in values.items()}

    summary = {
        'model': 'Faster R-CNN',
        'dataset': 'AgroPest-12',
        'overall_metrics': {
            'precision': macro_average('precision'),
            'recall': macro_average('recall'),
            'f1_score': macro_average('f1')
        },
        'class_metrics': per_class,
        'evaluation_settings': {
            'iou_threshold': config.IOU_THRESHOLD,
            # Same threshold that metrics_05 was computed with
            'confidence_threshold': 0.05,
            'num_test_images': len(gt_annotations)
        }
    }

    # Persist next to the generated plots
    summary_path = config.RESULTS_DIR / 'evaluation_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2)

    print(f"\nAll results saved to: {config.RESULTS_DIR}/")

    return summary

# Generate final summary
final_results = create_comprehensive_summary()