# 🧪 FaceNet Model Evaluation & Inference

Notebook ini berisi program inferensi untuk mengevaluasi performa model FaceNet.

**Metrik yang dihitung:**
- Accuracy
- Precision
- Recall
- F1-Score
- Confusion Matrix

**Model:** FaceNet (InceptionResnetV1) dengan transfer learning dari VGGFace2

## 1. Import Libraries

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pickle
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from tqdm import tqdm

# Report the environment once so results can be tied to a PyTorch build
# and to whether a CUDA device was visible.
print(
    "‚úÖ Libraries imported successfully",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## 2. Define Model Architecture

Arsitektur classifier yang sama dengan training

In [None]:
class EmbeddingDataset(Dataset):
    """Map-style dataset pairing embedding vectors with integer class indices.

    Args:
        embeddings: Array-like of per-sample embedding vectors; stored as a
            float32 tensor (always copied, never aliased to the input).
        labels: Iterable of raw labels, one per embedding.
        label_to_idx: Mapping from raw label to integer class index.

    Raises:
        KeyError: If a label is not present in ``label_to_idx``.
        ValueError: If the number of labels differs from the number of
            embeddings.
    """
    def __init__(self, embeddings, labels, label_to_idx):
        # torch.FloatTensor / torch.LongTensor are legacy constructors;
        # torch.tensor with an explicit dtype is the supported spelling.
        # np.asarray first so a list of arrays is converted in one step.
        self.embeddings = torch.tensor(np.asarray(embeddings), dtype=torch.float32)
        self.labels = torch.tensor(
            [label_to_idx[label] for label in labels], dtype=torch.long
        )
        # A silent mismatch would only surface later as an IndexError
        # in the middle of a DataLoader pass.
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"Got {len(self.embeddings)} embeddings but {len(self.labels)} labels"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return the ``(embedding, class_index)`` pair at position ``idx``."""
        return self.embeddings[idx], self.labels[idx]


class EmbeddingClassifier(nn.Module):
    """MLP classification head applied to precomputed embeddings.

    Layout: Linear(embedding_dim, 256) -> BatchNorm1d -> ReLU -> Dropout ->
    Linear(256, 128) -> BatchNorm1d -> ReLU -> Dropout ->
    Linear(128, num_classes).

    The attribute names (fc1, bn1, fc2, bn2, fc3) become the state_dict
    keys, so they must stay as they are for saved weights to load.

    Args:
        embedding_dim: Size of each input embedding vector.
        num_classes: Number of output logits; must be supplied by the caller.
        dropout_rate: Dropout probability used after both hidden layers.
    """
    def __init__(self, embedding_dim=512, num_classes=None, dropout_rate=0.5):
        super().__init__()
        wide, narrow = 256, 128

        # First hidden stage
        self.fc1 = nn.Linear(embedding_dim, wide)
        self.bn1 = nn.BatchNorm1d(wide)
        self.dropout1 = nn.Dropout(dropout_rate)

        # Second hidden stage
        self.fc2 = nn.Linear(wide, narrow)
        self.bn2 = nn.BatchNorm1d(narrow)
        self.dropout2 = nn.Dropout(dropout_rate)

        # Output projection to class logits
        self.fc3 = nn.Linear(narrow, num_classes)

    def forward(self, x):
        """Return raw (pre-softmax) logits for a batch of embeddings."""
        stage_one = self.dropout1(F.relu(self.bn1(self.fc1(x))))
        stage_two = self.dropout2(F.relu(self.bn2(self.fc2(stage_one))))
        return self.fc3(stage_two)

# Confirmation printed once the dataset and classifier classes exist in the session
print("‚úÖ Model architecture defined")

## 3. Load Saved Model

Memuat model FaceNet yang sudah dilatih

In [None]:
# Configuration: artifact locations and the compute device for this run
MODEL_PKL_PATH = './models/facenet_model_20251201_225633.pkl'
MODEL_PTH_PATH = './models/facenet_classifier_20251201_225633.pth'

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(
    f"üñ•Ô∏è  Device: {DEVICE}",
    f"üìÅ Model PKL: {MODEL_PKL_PATH}",
    f"üìÅ Model PTH: {MODEL_PTH_PATH}",
    sep="\n",
)

In [None]:
# Load model data (embeddings, labels, mappings)
print("üì¶ Loading model data...")

# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only open model files that come from a trusted source.
with open(MODEL_PKL_PATH, 'rb') as f:
    model_data = pickle.load(f)

# Fail early, naming every missing entry, instead of stopping at the first lookup
_required_keys = ('embeddings', 'labels', 'label_to_idx', 'idx_to_label')
_missing_keys = [key for key in _required_keys if key not in model_data]
if _missing_keys:
    raise KeyError(f"{MODEL_PKL_PATH} is missing required entries: {_missing_keys}")

# Extract data
# np.asarray is a no-op for an ndarray and makes `.shape` available when the
# embeddings were stored as a list of vectors.
embeddings = np.asarray(model_data['embeddings'])
labels = model_data['labels']
label_to_idx = model_data['label_to_idx']
idx_to_label = model_data['idx_to_label']

if embeddings.ndim != 2:
    raise ValueError(
        f"Expected a 2-D (samples, features) embedding array, got shape {embeddings.shape}"
    )
if len(labels) != len(embeddings):
    raise ValueError(
        f"Got {len(embeddings)} embeddings but {len(labels)} labels"
    )

num_classes = len(label_to_idx)
embedding_dim = embeddings.shape[1]

print(f"\n‚úÖ Model data loaded!")
print(f"   Total embeddings: {len(embeddings)}")
print(f"   Embedding dimension: {embedding_dim}")
print(f"   Number of classes: {num_classes}")
print(f"   Model type: {model_data.get('model_type', 'FaceNet')}")

# Optional entry: only reported when the saved file carries it
if 'best_val_acc' in model_data:
    print(f"   Training Val Accuracy: {model_data['best_val_acc']:.2f}%")

In [None]:
# Load classifier weights
print("üì¶ Loading classifier weights...")

# Rebuild the head with the dimensions read from the model data so the
# parameter shapes match the saved state_dict.
classifier = EmbeddingClassifier(
    embedding_dim=embedding_dim,
    num_classes=num_classes,
    dropout_rate=0.5
).to(DEVICE)

# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint cannot run arbitrary code while loading.
_state_dict = torch.load(MODEL_PTH_PATH, map_location=DEVICE, weights_only=True)
classifier.load_state_dict(_state_dict)
# Inference mode: dropout disabled, batch norm uses its running statistics
classifier.eval()

print(f"‚úÖ Classifier loaded successfully!")
# Report the actual input width instead of a hard-coded 512
print(f"   Architecture: {embedding_dim} ‚Üí 256 ‚Üí 128 ‚Üí {num_classes}")

## 4. Prepare Validation Dataset

Menggunakan split yang sama dengan training (20% validation)

In [None]:
# Re-create the train/validation partition with a fixed seed and stratification.
# NOTE(review): this is meant to reproduce the split used during training;
# confirm that the training code used the same ratio, seed and sample order.
RANDOM_STATE = 42
VALIDATION_SPLIT = 0.2

X_train, X_val, y_train, y_val = train_test_split(
    embeddings,
    labels,
    stratify=labels,
    random_state=RANDOM_STATE,
    test_size=VALIDATION_SPLIT,
)

print(
    f"üìä Data Split:",
    f"   Training set: {len(X_train)} samples",
    f"   Validation set: {len(X_val)} samples",
    f"   Split ratio: {VALIDATION_SPLIT*100:.0f}% validation",
    sep="\n",
)

In [None]:
# Wrap the held-out split in a Dataset and iterate it in fixed order
# (shuffle=False), 32 samples per batch.
val_dataset = EmbeddingDataset(X_val, y_val, label_to_idx)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

print(
    "‚úÖ Validation DataLoader created",
    f"   Batches: {len(val_loader)}",
    sep="\n",
)

## 5. Run Inference on Validation Set

Menjalankan inferensi untuk mendapatkan prediksi

In [None]:
# Run the classifier over the validation loader and collect, per sample,
# the predicted class index, the true class index and the softmax probabilities.
print("üîç Running inference on validation set...")

_pred_chunks, _label_chunks, _prob_chunks = [], [], []

classifier.eval()
with torch.no_grad():
    for embeddings_batch, labels_batch in tqdm(val_loader, desc="Inference"):
        logits = classifier(embeddings_batch.to(DEVICE))

        # One array per batch; they are stitched together after the loop
        _prob_chunks.append(F.softmax(logits, dim=1).cpu().numpy())
        _pred_chunks.append(logits.max(dim=1).indices.cpu().numpy())
        _label_chunks.append(labels_batch.numpy())

# An empty loader yields empty arrays, as a list-based accumulation would
all_predictions = np.concatenate(_pred_chunks) if _pred_chunks else np.array([])
all_labels = np.concatenate(_label_chunks) if _label_chunks else np.array([])
all_probabilities = np.concatenate(_prob_chunks) if _prob_chunks else np.array([])

print(f"\n‚úÖ Inference completed!")
print(f"   Total predictions: {len(all_predictions)}")

## 6. Calculate Evaluation Metrics

Menghitung Accuracy, Precision, Recall, dan F1-Score

In [None]:
# Compute accuracy plus macro- and weighted-averaged precision/recall/F1
# on the validation predictions, then print them as percentages.
print("üìä Calculating Evaluation Metrics...")
print("=" * 70)

accuracy = accuracy_score(all_labels, all_predictions)

# Same three scorers, evaluated once per averaging mode
precision_macro, recall_macro, f1_macro = (
    scorer(all_labels, all_predictions, average='macro', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)
precision_weighted, recall_weighted, f1_weighted = (
    scorer(all_labels, all_predictions, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"\nüéØ OVERALL METRICS (Validation Set)")
print("=" * 70)
print(f"\nüìà Accuracy: {accuracy * 100:.2f}%")

_sections = (
    (f"\n--- Macro Average (unweighted mean across all classes) ---",
     (precision_macro, recall_macro, f1_macro)),
    (f"\n--- Weighted Average (weighted by support/samples per class) ---",
     (precision_weighted, recall_weighted, f1_weighted)),
)
for _heading, (_prec, _rec, _f1) in _sections:
    print(_heading)
    print(f"   Precision: {_prec * 100:.2f}%")
    print(f"   Recall:    {_rec * 100:.2f}%")
    print(f"   F1-Score:  {_f1 * 100:.2f}%")

In [None]:
# Collect the overall metrics into one table: name, raw score and a
# two-decimal percentage string.
_summary_rows = [
    ('Accuracy', accuracy),
    ('Precision (Macro)', precision_macro),
    ('Recall (Macro)', recall_macro),
    ('F1-Score (Macro)', f1_macro),
    ('Precision (Weighted)', precision_weighted),
    ('Recall (Weighted)', recall_weighted),
    ('F1-Score (Weighted)', f1_weighted),
]

metrics_summary = pd.DataFrame({
    'Metric': [name for name, _ in _summary_rows],
    'Score': [score for _, score in _summary_rows],
    'Percentage': [f"{score*100:.2f}%" for _, score in _summary_rows],
})

print("\nüìã METRICS SUMMARY TABLE")
print("=" * 70)
print(metrics_summary.to_string(index=False))

## 7. Per-Class Metrics

Menampilkan metrik untuk setiap kelas/person

In [None]:
# Get class names, ordered by class index
class_names = [idx_to_label[i] for i in range(num_classes)]

# Calculate per-class metrics.
# With average=None sklearn returns one entry per label it actually sees in
# the true or predicted labels. Pinning `labels` to the full index range
# keeps every array at length num_classes, aligned with class_names, even
# when a class is absent from both (it then scores 0 via zero_division).
_class_ids = np.arange(num_classes)
precision_per_class = precision_score(
    all_labels, all_predictions, labels=_class_ids, average=None, zero_division=0)
recall_per_class = recall_score(
    all_labels, all_predictions, labels=_class_ids, average=None, zero_division=0)
f1_per_class = f1_score(
    all_labels, all_predictions, labels=_class_ids, average=None, zero_division=0)

# Create per-class dataframe (one row per class, in class-index order)
per_class_metrics = pd.DataFrame({
    'Class': class_names,
    'Precision': precision_per_class,
    'Recall': recall_per_class,
    'F1-Score': f1_per_class
})

# Sort by F1-Score, best classes first
per_class_metrics_sorted = per_class_metrics.sort_values('F1-Score', ascending=False)

print("\nüìä PER-CLASS METRICS (Sorted by F1-Score)")
print("=" * 70)
print(per_class_metrics_sorted.to_string(index=False))

In [None]:
# Classification Report (sklearn)
print("\nüìã DETAILED CLASSIFICATION REPORT")
print("=" * 70)
# `labels` is pinned to every class index so it always matches the length of
# target_names; otherwise sklearn raises when a class is missing from both
# the true and the predicted labels.
print(classification_report(
    all_labels,
    all_predictions,
    labels=np.arange(num_classes),
    target_names=class_names,
    zero_division=0,
))

## 8. Visualization

In [None]:
# Bar charts: overall macro metrics (left) and macro vs. weighted (right)
def _label_bars(ax, bars, values, fontsize):
    """Write each value as a percentage just above its bar."""
    for bar, val in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.02,
                f'{val*100:.1f}%', ha='center', va='bottom',
                fontsize=fontsize, fontweight='bold')


fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Plot 1: Overall Metrics
metrics_names = ['Accuracy', 'Precision\n(Macro)', 'Recall\n(Macro)', 'F1-Score\n(Macro)']
metrics_values = [accuracy, precision_macro, recall_macro, f1_macro]
colors = ['#2ecc71', '#3498db', '#e74c3c', '#9b59b6']

bars = axes[0].bar(metrics_names, metrics_values, color=colors, edgecolor='black', linewidth=1.2)
# Headroom above 1.0 leaves space for the percentage labels
axes[0].set_ylim(0, 1.1)
axes[0].set_ylabel('Score', fontsize=12)
axes[0].set_title('üìä FaceNet Model - Overall Metrics', fontsize=14, fontweight='bold')
axes[0].axhline(y=0.9, color='green', linestyle='--', alpha=0.5, label='90% threshold')
# Without a legend the threshold line's label is never rendered
axes[0].legend()

_label_bars(axes[0], bars, metrics_values, fontsize=11)

# Plot 2: Macro vs Weighted (grouped bars, one group per metric)
x = np.arange(3)
width = 0.35

macro_values = [precision_macro, recall_macro, f1_macro]
weighted_values = [precision_weighted, recall_weighted, f1_weighted]

bars1 = axes[1].bar(x - width/2, macro_values, width, label='Macro', color='#3498db', edgecolor='black')
bars2 = axes[1].bar(x + width/2, weighted_values, width, label='Weighted', color='#e67e22', edgecolor='black')

axes[1].set_ylabel('Score', fontsize=12)
axes[1].set_title('üìà Macro vs Weighted Metrics', fontsize=14, fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(['Precision', 'Recall', 'F1-Score'])
axes[1].set_ylim(0, 1.1)
axes[1].legend()

_label_bars(axes[1], bars1, macro_values, fontsize=9)
_label_bars(axes[1], bars2, weighted_values, fontsize=9)

plt.tight_layout()
# Save before show(): some backends release the figure once it is displayed
plt.savefig('evaluation_metrics.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Chart saved as 'evaluation_metrics.png'")

In [None]:
# Confusion Matrix (Top 20 classes for readability)
# `labels` pins the matrix to num_classes x num_classes in class-index order.
# Without it sklearn sizes the matrix by the labels it actually sees, and the
# class indices selected below could then misalign or fall out of range.
cm = confusion_matrix(all_labels, all_predictions, labels=np.arange(num_classes))

# Get top 20 classes by support (number of validation samples per class).
# argsort is ascending, so the slice keeps the 20 largest, or all classes
# when there are fewer than 20.
class_support = np.bincount(all_labels, minlength=num_classes)
top_20_indices = np.argsort(class_support)[-20:]

# Filter confusion matrix down to the selected rows and columns
cm_top20 = cm[np.ix_(top_20_indices, top_20_indices)]
# int() gives a plain Python key for the lookup; names are truncated to 15
# characters so the tick labels stay readable.
top_20_names = [idx_to_label[int(i)][:15] for i in top_20_indices]

plt.figure(figsize=(14, 12))
sns.heatmap(cm_top20, annot=True, fmt='d', cmap='Blues',
            xticklabels=top_20_names, yticklabels=top_20_names)
plt.title('üîç Confusion Matrix (Top 20 Classes)', fontsize=14, fontweight='bold')
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Confusion matrix saved as 'confusion_matrix.png'")

In [None]:
# Horizontal bar charts of the ten best and ten worst classes by F1-Score
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# The table is sorted best-first, so head/tail give the two extremes
top_10 = per_class_metrics_sorted.head(10)
bottom_10 = per_class_metrics_sorted.tail(10)

_panels = (
    (axes[0], top_10, '#2ecc71', 'üèÜ Top 10 Best Performing Classes'),
    (axes[1], bottom_10, '#e74c3c', '‚ö†Ô∏è Bottom 10 Classes (Need Improvement)'),
)

for _ax, _frame, _bar_color, _panel_title in _panels:
    _ax.barh(_frame['Class'], _frame['F1-Score'], color=_bar_color, edgecolor='black')
    _ax.set_xlim(0, 1.1)
    _ax.set_xlabel('F1-Score', fontsize=12)
    _ax.set_title(_panel_title, fontsize=14, fontweight='bold')
    # Put the first row of the table at the top of the chart
    _ax.invert_yaxis()

    # Percentage label just to the right of each bar
    for _pos, _score in enumerate(_frame['F1-Score']):
        _ax.text(_score + 0.02, _pos, f"{_score*100:.1f}%", va='center', fontsize=10)

plt.tight_layout()
plt.savefig('top_bottom_performers.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n‚úÖ Top/Bottom performers chart saved as 'top_bottom_performers.png'")

## 9. Summary & Conclusions

In [None]:
# Final human-readable report: architecture, dataset sizes and all overall
# metrics computed earlier in the notebook.
print("\n" + "=" * 70)
print("üìã FACENET MODEL EVALUATION SUMMARY")
print("=" * 70)

# The classifier line reports the actual input width (embedding_dim) rather
# than a hard-coded 512, so it stays correct for any embedding size.
print(f"""
üèóÔ∏è  MODEL ARCHITECTURE:
    - Base Model: InceptionResnetV1 (FaceNet)
    - Pre-trained on: VGGFace2 (3.3M images, 9131 identities)
    - Classifier: {embedding_dim} ‚Üí 256 ‚Üí 128 ‚Üí {num_classes} classes
    - Total embeddings: {len(embeddings)}

üìä DATASET:
    - Total samples: {len(embeddings)}
    - Training samples: {len(X_train)}
    - Validation samples: {len(X_val)}
    - Number of classes: {num_classes}

üéØ EVALUATION RESULTS (Validation Set):
    ‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
    ‚îÇ  Metric              ‚îÇ  Score          ‚îÇ
    ‚îú‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î§
    ‚îÇ  Accuracy            ‚îÇ  {accuracy*100:6.2f}%        ‚îÇ
    ‚îÇ  Precision (Macro)   ‚îÇ  {precision_macro*100:6.2f}%        ‚îÇ
    ‚îÇ  Recall (Macro)      ‚îÇ  {recall_macro*100:6.2f}%        ‚îÇ
    ‚îÇ  F1-Score (Macro)    ‚îÇ  {f1_macro*100:6.2f}%        ‚îÇ
    ‚îÇ  Precision (Weighted)‚îÇ  {precision_weighted*100:6.2f}%        ‚îÇ
    ‚îÇ  Recall (Weighted)   ‚îÇ  {recall_weighted*100:6.2f}%        ‚îÇ
    ‚îÇ  F1-Score (Weighted) ‚îÇ  {f1_weighted*100:6.2f}%        ‚îÇ
    ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

üìà INTERPRETATION:
    - Accuracy: Overall correct predictions
    - Precision: Out of all positive predictions, how many were correct
    - Recall: Out of all actual positives, how many were identified
    - F1-Score: Harmonic mean of Precision and Recall

‚úÖ Model uses Transfer Learning from VGGFace2 pretrained weights
‚úÖ This is NOT zero-shot learning - requires training data per class
""")

print("=" * 70)
print("‚úÖ Evaluation completed successfully!")
print("=" * 70)

In [None]:
# Write both metric tables to CSV in the working directory (no index column)
_exports = (
    (metrics_summary, 'evaluation_metrics_summary.csv'),
    (per_class_metrics, 'per_class_metrics.csv'),
)
for _table, _csv_name in _exports:
    _table.to_csv(_csv_name, index=False)

print(
    "üíæ Metrics saved to:",
    "   - evaluation_metrics_summary.csv",
    "   - per_class_metrics.csv",
    sep="\n",
)