# 🔍 QR Code Model Evaluation - Simple & Fast

**Quick evaluation of your trained model**

## 📋 Prerequisites on Kaggle:
1. Add your model dataset: `/kaggle/input/qr-fishing/pytorch/default/1/best_model.pth`
2. Add QR codes dataset: `benign-and-malicious-qr-codes`
3. Enable GPU (optional, but faster)
4. **Run all cells** (Ctrl+A, then Shift+Enter)

---

In [None]:
# ============================================================================
# STEP 1: Import & Seed (Run this first!)
# ============================================================================

import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image, ImageFile
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, classification_report, 
    precision_recall_fscore_support, roc_auc_score,
    roc_curve, precision_recall_curve, average_precision_score
)
from tqdm.auto import tqdm
from pathlib import Path
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Allow Pillow to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Seed the Python, NumPy and PyTorch random generators with one shared value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

print('‚úÖ Imports successful!')

In [None]:
# ============================================================================
# STEP 2: Configuration - UPDATE THESE PATHS IF NEEDED!
# ============================================================================

# Input/output locations -- edit these if your Kaggle datasets are mounted elsewhere.
MODEL_PATH = '/kaggle/input/qr-fishing/pytorch/default/1/best_model.pth'
DATA_DIR = '/kaggle/input/benign-and-malicious-qr-codes/QR codes'
OUTPUT_DIR = '/kaggle/working'

# Inference settings; these have to agree with the values used for training.
IMG_SIZE = 256
MODEL_NAME = 'efficientnet_b3'
BATCH_SIZE = 64
NUM_WORKERS = 2

# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'üî• Device: {device}')
if not torch.cuda.is_available():
    print('   ‚ö†Ô∏è No GPU - evaluation will be slower')
else:
    print(f'   GPU: {torch.cuda.get_device_name(0)}')

# Report whether each required input exists; a missing path is only reported
# here, execution continues either way.
print(f'\nüìÇ Checking paths...')
if not os.path.exists(MODEL_PATH):
    print(f'   ‚ùå Model NOT found: {MODEL_PATH}')
    print(f'   üí° Add your model dataset in Kaggle!')
else:
    print(f'   ‚úÖ Model found: {MODEL_PATH}')

if not os.path.exists(DATA_DIR):
    print(f'   ‚ùå Data NOT found: {DATA_DIR}')
    print(f'   üí° Add the QR codes dataset in Kaggle!')
else:
    print(f'   ‚úÖ Data found: {DATA_DIR}')

# Make sure the directory that receives plots and CSV output exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f'\n‚úÖ Configuration complete!')

In [None]:
# ============================================================================
# STEP 3: Define Model Architecture (Must match training!)
# ============================================================================

class QRClassifier(nn.Module):
    """EfficientNet backbone with a custom single-output classification head.

    The stock torchvision classifier is replaced by
    Dropout -> Linear -> BatchNorm1d -> ReLU -> Dropout -> Linear(hidden, 1),
    so the forward pass produces one raw (pre-sigmoid) value per image.

    Args:
        model_name: 'efficientnet_b0', 'efficientnet_b2' or 'efficientnet_b3'.
        dropout_rate: Dropout probability in front of the first linear layer;
            the second dropout layer uses half of this value.
        hidden_units: Width of the hidden linear layer.

    Raises:
        ValueError: If ``model_name`` is not one of the supported backbones.
    """

    # Backbones this head can be attached to (names of torchvision builders).
    _SUPPORTED_BACKBONES = ('efficientnet_b0', 'efficientnet_b2', 'efficientnet_b3')

    def __init__(self, model_name='efficientnet_b3', dropout_rate=0.3, hidden_units=256):
        super().__init__()

        if model_name not in self._SUPPORTED_BACKBONES:
            raise ValueError(f'Unsupported model: {model_name}')

        # `weights=None` builds an untrained backbone; it replaces the
        # deprecated `pretrained=False` spelling. The trained parameters are
        # expected to come from a checkpoint afterwards.
        self.backbone = getattr(models, model_name)(weights=None)

        # Index 1 of the stock classifier is its Linear layer; reuse its input
        # width for the replacement head.
        in_features = self.backbone.classifier[1].in_features
        self.backbone.classifier = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(in_features, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(),
            nn.Dropout(p=dropout_rate/2),
            nn.Linear(hidden_units, 1)
        )

    def forward(self, x):
        """Run ``x`` through the backbone and the replacement head."""
        return self.backbone(x)

print('‚úÖ Model architecture defined!')

In [None]:
# ============================================================================
# STEP 4: Load Pre-trained Model
# ============================================================================

print(f'üì¶ Loading model from: {MODEL_PATH}\n')

try:
    # Build the network first, then move it onto the selected device.
    model = QRClassifier(model_name=MODEL_NAME, dropout_rate=0.3, hidden_units=256)
    model = model.to(device)

    # Restore the saved parameters and switch the network to inference mode.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    checkpoint = torch.load(MODEL_PATH, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()

    # Training metadata is optional in the checkpoint; fall back to defaults.
    print(f'‚úÖ Model loaded successfully!')
    print(f'   Trained epochs: {checkpoint.get("epoch", "Unknown")}')
    print(f'   Best val accuracy: {checkpoint.get("val_acc", 0):.4f}')

except Exception as load_error:
    # Print troubleshooting hints, then re-raise so the notebook stops here.
    print(f'‚ùå Error loading model: {load_error}')
    for hint in (
        f'\nüí° Troubleshooting:',
        f'   1. Check if MODEL_PATH is correct in Step 2',
        f'   2. Make sure you added the model dataset in Kaggle',
        f'   3. Verify MODEL_NAME matches your trained model',
    ):
        print(hint)
    raise

In [None]:
# ============================================================================
# STEP 5: Prepare Test Data
# ============================================================================

class QRDataset(Dataset):
    """Dataset over (image path, label) pairs that opens images on demand.

    Args:
        file_label_pairs: Iterable of ``(path, label)`` tuples. It is
            materialised once, so one-shot iterables (generators) are fine.
        transform: Optional callable applied to every loaded PIL image.
    """

    def __init__(self, file_label_pairs, transform=None):
        # Materialise first: iterating a generator twice would leave the
        # second list (labels) empty.
        pairs = list(file_label_pairs)
        self.files = [path for path, _ in pairs]
        self.labels = [label for _, label in pairs]
        self.transform = transform

    def __len__(self):
        """Return the number of (path, label) pairs."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        If the image cannot be loaded or transformed, an all-black
        IMG_SIZE x IMG_SIZE image is returned in its place with the original
        label.
        """
        label = self.labels[idx]
        try:
            # The context manager closes the underlying file handle right
            # away; convert() returns an independent in-memory copy.
            with Image.open(self.files[idx]) as raw_image:
                image = raw_image.convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, label
        except Exception:
            # Deliberate best effort: one unreadable file must not abort the
            # whole run. NOTE(review): such samples still count in the metrics
            # as black images -- consider logging the path.
            if self.transform:
                black_img = Image.new('RGB', (IMG_SIZE, IMG_SIZE), (0, 0, 0))
                return self.transform(black_img), label
            return torch.zeros(3, IMG_SIZE, IMG_SIZE), label

# Deterministic preprocessing: resize to the configured input size, convert to
# a tensor and normalise with the standard ImageNet channel statistics.
test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Collect images
print('üìÇ Loading test images...')
benign_dir = os.path.join(DATA_DIR, 'benign', 'benign')
malicious_dir = os.path.join(DATA_DIR, 'malicious', 'malicious')

image_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"}


def _collect_images(directory, label, extensions):
    """Return sorted ``(path, label)`` pairs for all image files under ``directory``.

    Extensions are matched case-insensitively. Sorting makes the result
    independent of filesystem listing order, so the seeded shuffle below
    selects the same test set on every run.
    """
    return sorted(
        (str(path), label)
        for path in Path(directory).rglob('*')
        if path.is_file() and path.suffix.lower() in extensions
    )


benign_files = _collect_images(benign_dir, 0, image_extensions)
malicious_files = _collect_images(malicious_dir, 1, image_extensions)

all_files = benign_files + malicious_files
random.shuffle(all_files)

# Use 10% as test set
# NOTE(review): the sample is drawn from the whole dataset; unless it matches
# the split used during training, some of these images may have been seen by
# the model -- confirm before reporting the numbers as held-out performance.
test_size = int(len(all_files) * 0.10)
test_pairs = all_files[:test_size]

if not test_pairs:
    # Fail here with a clear message rather than later inside the metrics.
    raise RuntimeError(
        f'Found only {len(all_files)} image(s) under {benign_dir!r} and '
        f'{malicious_dir!r}; at least 10 are needed to draw a 10% test set.'
    )

# Class counts inside the sampled test set (not the whole dataset).
test_benign_count = sum(1 for _, label in test_pairs if label == 0)
test_malicious_count = len(test_pairs) - test_benign_count

print(f'‚úÖ Test set: {len(test_pairs):,} images')
print(f'   Benign: {test_benign_count:,}')
print(f'   Malicious: {test_malicious_count:,}')
print(f'   Sampled from {len(all_files):,} images '
      f'({len(benign_files):,} benign, {len(malicious_files):,} malicious)')

# Create dataloader. shuffle=False keeps batch order equal to test_pairs order;
# pinned memory is only useful when batches are copied to a CUDA device.
test_dataset = QRDataset(test_pairs, transform=test_transform)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=NUM_WORKERS, pin_memory=torch.cuda.is_available()
)

print(f'‚úÖ DataLoader ready!')

In [None]:
# ============================================================================
# STEP 6: Run Evaluation (This takes 2-5 minutes)
# ============================================================================

print('üîç Evaluating model on test set...\n')

# Inference mode for dropout / batch-norm layers.
model.eval()
all_preds, all_labels, all_probs = [], [], []

# No gradients are needed for evaluation.
with torch.no_grad():
    for images, labels in tqdm(test_loader, desc='Testing'):
        images = images.to(device, non_blocking=True)
        labels = labels.float().unsqueeze(1).to(device, non_blocking=True)

        # Sigmoid turns the raw model output into a probability; 0.5 is the
        # decision threshold.
        outputs = model(images)
        probs = torch.sigmoid(outputs)
        predicted = (probs >= 0.5).float()

        # Accumulate per-batch results on the CPU as NumPy rows.
        for store, batch in (
            (all_preds, predicted),
            (all_labels, labels),
            (all_probs, probs),
        ):
            store.extend(batch.cpu().numpy())

# Flatten the accumulated rows into 1-D arrays.
test_preds = np.array(all_preds).ravel()
test_labels = np.array(all_labels).ravel()
test_probs = np.array(all_probs).ravel()

# Integer class ids derived from the probabilities and the float labels.
test_preds_binary = (test_probs >= 0.5).astype(int)
test_labels_binary = test_labels.astype(int)

print('\n‚úÖ Evaluation complete!')

In [None]:
# ============================================================================
# STEP 7: Calculate Metrics & Show Results
# ============================================================================

# Overall metrics; precision/recall/F1 are computed for the positive class.
accuracy = np.mean(test_preds_binary == test_labels_binary)
precision, recall, f1 = precision_recall_fscore_support(
    test_labels_binary, test_preds_binary, average='binary'
)[:3]
roc_auc = roc_auc_score(test_labels_binary, test_probs)
avg_precision = average_precision_score(test_labels_binary, test_probs)

# Per-class hit counts: how many samples of each true class were predicted
# as that class.
is_benign = test_labels_binary == 0
is_malicious = test_labels_binary == 1
benign_total = is_benign.sum()
benign_correct = (is_benign & (test_preds_binary == 0)).sum()
malicious_total = is_malicious.sum()
malicious_correct = (is_malicious & (test_preds_binary == 1)).sum()

# Report
divider = "=" * 70
print(f'\n{divider}')
print(f'üéØ TEST SET EVALUATION RESULTS')
print(divider)
print(f'Test Size:          {len(test_labels):,} images')
print(divider)
print(f'üìä Overall Metrics:')
print(f'  Accuracy:         {accuracy:.4f} ({accuracy*100:.2f}%)')
print(f'  Precision:        {precision:.4f}')
print(f'  Recall:           {recall:.4f}')
print(f'  F1-Score:         {f1:.4f}')
print(f'  ROC-AUC:          {roc_auc:.4f}')
print(f'  Avg Precision:    {avg_precision:.4f}')
print(divider)
print(f'üìà Per-Class Performance:')
print(f'  Benign:           {benign_correct}/{benign_total} ({benign_correct/benign_total*100:.1f}%)')
print(f'  Malicious:        {malicious_correct}/{malicious_total} ({malicious_correct/malicious_total*100:.1f}%)')
print(f'{divider}\n')

# Per-class precision / recall / F1 table from scikit-learn.
print('üìã Detailed Classification Report:')
report_text = classification_report(
    test_labels_binary, test_preds_binary,
    target_names=['Benign', 'Malicious'], digits=4,
)
print(report_text)

In [None]:
# ============================================================================
# STEP 8: Confusion Matrix Visualization
# ============================================================================

# Raw counts, plus a copy normalised so that every true-label row sums to 1.
cm = confusion_matrix(test_labels_binary, test_preds_binary)
cm_norm = cm.astype('float') / cm.sum(axis=1, keepdims=True)

class_names = ['Benign', 'Malicious']
# One entry per subplot: (matrix, annotation format, colour-bar label, title).
panels = (
    (cm, 'd', 'Count', 'Confusion Matrix (Counts)'),
    (cm_norm, '.2%', 'Percentage', 'Confusion Matrix (Normalized)'),
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (matrix, number_format, bar_label, panel_title) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, fmt=number_format, cmap='Blues', ax=ax,
                xticklabels=class_names,
                yticklabels=class_names,
                cbar_kws={'label': bar_label})
    ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')

plt.tight_layout()
cm_path = os.path.join(OUTPUT_DIR, 'confusion_matrix.png')
plt.savefig(cm_path, dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Saved: confusion_matrix.png')

In [None]:
# ============================================================================
# STEP 9: ROC & Precision-Recall Curves
# ============================================================================

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

# Left panel: ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(test_labels_binary, test_probs)
roc_ax.plot(fpr, tpr, color='darkorange', lw=2.5,
            label=f'ROC curve (AUC = {roc_auc:.4f})')
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--',
            label='Random classifier (AUC = 0.5000)')
roc_ax.fill_between(fpr, tpr, alpha=0.2, color='darkorange')

# Right panel: precision-recall curve.
precision_curve, recall_curve, _ = precision_recall_curve(test_labels_binary, test_probs)
pr_ax.plot(recall_curve, precision_curve, color='green', lw=2.5,
           label=f'PR curve (AP = {avg_precision:.4f})')
pr_ax.fill_between(recall_curve, precision_curve, alpha=0.2, color='green')

# Axis limits, labels, legend and grid are styled the same way on both panels.
panel_styles = (
    (roc_ax, 'False Positive Rate', 'True Positive Rate', 'ROC Curve', "lower right"),
    (pr_ax, 'Recall', 'Precision', 'Precision-Recall Curve', "lower left"),
)
for ax, x_name, y_name, panel_title, legend_loc in panel_styles:
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel(x_name, fontsize=12, fontweight='bold')
    ax.set_ylabel(y_name, fontsize=12, fontweight='bold')
    ax.set_title(panel_title, fontsize=14, fontweight='bold')
    ax.legend(loc=legend_loc, fontsize=10)
    ax.grid(alpha=0.3)

plt.tight_layout()
curves_path = os.path.join(OUTPUT_DIR, 'roc_pr_curves.png')
plt.savefig(curves_path, dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Saved: roc_pr_curves.png')

In [None]:
# ============================================================================
# STEP 10: Prediction Distribution Analysis
# ============================================================================

# Predicted malicious-probabilities, split by the true class.
benign_probs = test_probs[test_labels_binary == 0]
malicious_probs = test_probs[test_labels_binary == 1]

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram. Both classes use the same 50 bin edges over [0, 1]; with
# per-class automatic binning the bar widths differ and the heights of the
# two histograms cannot be compared.
shared_bins = np.linspace(0.0, 1.0, 51)
axes[0].hist(benign_probs, bins=shared_bins, alpha=0.7, label='Benign (True)',
            color='blue', edgecolor='black')
axes[0].hist(malicious_probs, bins=shared_bins, alpha=0.7, label='Malicious (True)',
            color='red', edgecolor='black')
axes[0].axvline(x=0.5, color='green', linestyle='--', linewidth=2.5,
               label='Decision Threshold (0.5)')
axes[0].set_xlabel('Predicted Probability (Malicious)', fontsize=12, fontweight='bold')
axes[0].set_ylabel('Frequency', fontsize=12, fontweight='bold')
axes[0].set_title('Distribution of Predicted Probabilities', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=10)
axes[0].grid(alpha=0.3)

# Box plot. Tick labels are set on the axis afterwards instead of through
# boxplot(labels=...), which Matplotlib 3.9 deprecated (renamed to
# `tick_labels`); set_xticklabels works on old and new versions alike.
data_to_plot = [benign_probs, malicious_probs]
bp = axes[1].boxplot(data_to_plot,
                     patch_artist=True,
                     boxprops=dict(facecolor='lightblue', edgecolor='black', linewidth=1.5),
                     medianprops=dict(color='red', linewidth=2.5),
                     whiskerprops=dict(linewidth=1.5),
                     capprops=dict(linewidth=1.5))
axes[1].set_xticklabels(['Benign\n(True)', 'Malicious\n(True)'])
axes[1].axhline(y=0.5, color='green', linestyle='--', linewidth=2,
               label='Threshold (0.5)')
axes[1].set_ylabel('Predicted Probability (Malicious)', fontsize=12, fontweight='bold')
axes[1].set_title('Probability Distribution by True Class', fontsize=14, fontweight='bold')
axes[1].legend(fontsize=10)
axes[1].grid(alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, 'prediction_distribution.png'), dpi=150, bbox_inches='tight')
plt.show()

print('‚úÖ Saved: prediction_distribution.png')

# Summary statistics of the predicted probabilities per true class.
print(f'\nüìä Probability Statistics:')
print(f'Benign (True):')
print(f'  Mean: {benign_probs.mean():.4f}, Median: {np.median(benign_probs):.4f}, Std: {benign_probs.std():.4f}')
print(f'Malicious (True):')
print(f'  Mean: {malicious_probs.mean():.4f}, Median: {np.median(malicious_probs):.4f}, Std: {malicious_probs.std():.4f}')

In [None]:
# ============================================================================
# STEP 11: Save Predictions to CSV
# ============================================================================

# One row per test image, in the same order as test_pairs.
predictions_df = pd.DataFrame({
    'true_label': test_labels_binary,
    'true_label_name': ['Benign' if l == 0 else 'Malicious' for l in test_labels_binary],
    'predicted_label': test_preds_binary,
    'predicted_label_name': ['Benign' if p == 0 else 'Malicious' for p in test_preds_binary],
    'probability_malicious': test_probs,
    'probability_benign': 1 - test_probs,
    'correct': test_labels_binary == test_preds_binary,
    # Confidence is the probability assigned to the predicted class.
    'confidence': np.maximum(test_probs, 1 - test_probs)
})

# Add error analysis: every row starts as 'Correct' and the two kinds of
# mistake overwrite that.
predictions_df['prediction_type'] = 'Correct'
predictions_df.loc[(predictions_df['true_label'] == 0) & (predictions_df['predicted_label'] == 1), 'prediction_type'] = 'False Positive'
predictions_df.loc[(predictions_df['true_label'] == 1) & (predictions_df['predicted_label'] == 0), 'prediction_type'] = 'False Negative'

# Record which image each row belongs to so mistakes can be traced back to a
# file. Added as the last column so existing column positions are unchanged.
# Predictions were produced in test_pairs order (the loader does not shuffle);
# pandas raises if the lengths do not match.
predictions_df['file_path'] = [path for path, _ in test_pairs]

# Save
csv_path = os.path.join(OUTPUT_DIR, 'test_predictions.csv')
predictions_df.to_csv(csv_path, index=False)

print(f'‚úÖ Saved: test_predictions.csv')
print(f'\nüìä Prediction Summary:')
print(predictions_df['prediction_type'].value_counts())
print(f'\nüìÑ First 10 rows:')
print(predictions_df.head(10))

In [None]:
# ============================================================================
# STEP 12: Sample Predictions (Visual Inspection)
# ============================================================================

print('\nüì∑ Sample Predictions (Random 15):')
print('='*80)

# Draw up to 15 distinct positions from the test set.
n_available = len(test_pairs)
sample_indices = random.sample(range(n_available), min(15, n_available))

for idx in sample_indices:
    img_path, true_label = test_pairs[idx]

    # Human-readable class names for the ground truth and the prediction.
    true_label_str = "Benign" if true_label != 1 else "Malicious"
    pred_label = "Benign" if test_preds_binary[idx] != 1 else "Malicious"

    # Confidence is the probability of whichever class was predicted.
    prob = test_probs[idx]
    confidence = max(prob, 1 - prob)
    correct = "‚ùå" if pred_label != true_label_str else "‚úÖ"

    # File name is cut to 55 characters and padded to a fixed width.
    file_name = os.path.basename(img_path)[:55]
    print(f'{file_name:55s}')
    print(f'  True: {true_label_str:10s} | Pred: {pred_label:10s} | Prob: {prob:.3f} | Conf: {confidence:.1%} {correct}')
    print()

print('='*80)

In [None]:
# ============================================================================
# STEP 13: Error Analysis - Show Worst Predictions
# ============================================================================

print('\nüîç Error Analysis: Most Confident Mistakes\n')

# Misclassified rows, most confident first.
errors_df = predictions_df[~predictions_df['correct']].sort_values('confidence', ascending=False)

n_predictions = len(predictions_df)
n_errors = len(errors_df)
# Guard the percentage against an empty predictions table.
error_pct = n_errors / n_predictions * 100 if n_predictions else 0.0
false_positive_count = (predictions_df['prediction_type'] == 'False Positive').sum()
false_negative_count = (predictions_df['prediction_type'] == 'False Negative').sum()

print(f'Total errors: {n_errors} ({error_pct:.2f}%)')
print(f'False Positives: {false_positive_count}')
print(f'False Negatives: {false_negative_count}')

if n_errors > 0:
    print(f'\n‚ùå Top 10 Most Confident Errors:')
    print('='*80)
    for idx, row in errors_df.head(10).iterrows():
        # The row index is the position in test_pairs (predictions_df is built
        # in that order with a default integer index), so it identifies the
        # misclassified image.
        error_file = os.path.basename(test_pairs[idx][0])
        print(f'True: {row["true_label_name"]:10s} | '
              f'Pred: {row["predicted_label_name"]:10s} | '
              f'Confidence: {row["confidence"]:.1%} | '
              f'Type: {row["prediction_type"]} | '
              f'File: {error_file}')
else:
    print('\nüéâ Perfect predictions! No errors found.')

In [None]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================

# Closing report: saved artifacts, headline metrics and suggested follow-ups,
# collected first and then printed line by line.
banner = "=" * 80
summary_lines = [
    f'\n{banner}',
    f'‚úÖ EVALUATION COMPLETE!',
    f'{banner}',
    f'\nüìÅ Artifacts saved to: {OUTPUT_DIR}',
    f'   ‚úÖ confusion_matrix.png',
    f'   ‚úÖ roc_pr_curves.png',
    f'   ‚úÖ prediction_distribution.png',
    f'   ‚úÖ test_predictions.csv',
    f'\nüìä Key Results:',
    f'   Accuracy:  {accuracy*100:.2f}%',
    f'   Precision: {precision:.4f}',
    f'   Recall:    {recall:.4f}',
    f'   F1-Score:  {f1:.4f}',
    f'   ROC-AUC:   {roc_auc:.4f}',
    f'\nüí° Next Steps:',
    f'   1. Download the visualizations from the Output section',
    f'   2. Review test_predictions.csv for detailed analysis',
    f'   3. Check the error analysis above for model weaknesses',
    f'{banner}',
    f'\nüéâ Well done! Your model evaluation is complete.',
]
for summary_line in summary_lines:
    print(summary_line)