<a href="https://colab.research.google.com/github/cs-iuu/ocr-2025-fall-cv/blob/main/notebooks/12.evaluation_metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 12. Evaluation Metrics

In [2]:
import numpy as np
from collections import defaultdict
import pandas as pd # Import pandas for nice table display (optional, but helpful in a notebook)

# --- CELL 1: Setup and Data Simulation ---

print("--- 1. Data Setup: Multi-Class Imbalance Simulation ---")

# We are simulating a classification task for 3 Cyrillic characters:
# Class 0: 'А' (Common) - Support: 80
# Class 1: 'Б' (Medium) - Support: 15
# Class 2: 'Щ' (Rare) - Support: 5

# Total Samples: 100

# Ground Truth (Actual) Labels
Y_TRUE = np.array([
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80 A's
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 10 B's
    1, 1, 1, 1, 1, # 15 B's
    2, 2, 2, 2, 2  # 5 Щ's (Shcha)
])

# Model Predictions (Simulated to be good on common, bad on rare).
# Each row lines up position-by-position with the same row of Y_TRUE.
Y_PRED = np.array([
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60 A's
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 80 A's (all 80 predicted correctly)

    # Mistakes on 'Б' (Class 1) - 11 correct, 4 predicted as 'А' (FNs for 'Б', FPs for 'А')
    1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
    1, 1, 0, 0, 0,

    # Mistakes on 'Щ' (Class 2) - 0 correct, all 5 predicted as 'А' (FNs for 'Щ', FPs for 'А')
    0, 0, 0, 0, 0
])

# The arrays are typed by hand; a dropped or extra element would otherwise be
# silently truncated by zip() in the metric code and skew every number.
if len(Y_TRUE) != len(Y_PRED):
    raise ValueError(
        f"Y_TRUE and Y_PRED must have the same length "
        f"(got {len(Y_TRUE)} and {len(Y_PRED)})"
    )


# --- Display Summary ---
CLASSES = np.unique(Y_TRUE)
print(f"Total Samples: {len(Y_TRUE)}")
print(f"Classes found: {CLASSES}")
# Per-class sample count of the ground truth. Iterating over .tolist() yields
# plain Python ints, so the printed dict reads {0: 80, ...} regardless of the
# NumPy version (NumPy 2.x would otherwise show np.int64(0) as the key).
counts = defaultdict(int)
for label in Y_TRUE.tolist():
    counts[label] += 1
print(f"Class Support (Imbalance): {dict(counts)}")

# --- CELL 2: Metric Functions - STUDENTS MUST COMPLETE TODOS ---

def calculate_confusion_components(y_true, y_pred, target_class):
    """
    Counts the one-vs-rest confusion components for `target_class`.

    The two label sequences are walked in lockstep. Each pair is classified as:
      - TP: actual is the target and so is the prediction
      - FN: actual is the target but the prediction is something else
      - FP: actual is something else but the prediction is the target
    Pairs where neither side is the target (true negatives) are not counted.

    Returns the tuple (TP, FP, FN).
    """
    true_pos = false_pos = false_neg = 0

    for actual, predicted in zip(y_true, y_pred):
        actual_is_target = actual == target_class
        predicted_is_target = predicted == target_class

        if actual_is_target and predicted_is_target:
            true_pos += 1
        elif actual_is_target:
            # The target was present but the model said something else.
            false_neg += 1
        elif predicted_is_target:
            # The model claimed the target on a sample of another class.
            false_pos += 1

    return true_pos, false_pos, false_neg


def calculate_precision(TP, FP):
    """
    Calculates Precision: TP / (TP + FP)

    TP + FP is the number of times the class was predicted. When it is zero
    the ratio is undefined, and 0.0 is returned by convention.
    """
    predicted_positives = TP + FP
    return 0.0 if predicted_positives == 0 else TP / predicted_positives


def calculate_recall(TP, FN):
    """
    Calculates Recall: TP / (TP + FN)

    TP + FN is the number of actual samples of the class. When it is zero
    the ratio is undefined, and 0.0 is returned by convention.
    """
    actual_positives = TP + FN
    if actual_positives != 0:
        return TP / actual_positives
    return 0.0


def calculate_f1_score(precision, recall):
    """
    Calculates F1-Score: 2 * (P * R) / (P + R) (Harmonic Mean)

    When precision and recall sum to zero the harmonic mean is undefined,
    and 0.0 is returned by convention.
    """
    pr_sum = precision + recall
    if pr_sum == 0:
        return 0.0

    numerator = 2 * (precision * recall)
    return numerator / pr_sum


# --- CELL 3: Multi-Class Aggregation Logic ---

def calculate_macro_f1(y_true, y_pred, classes, class_names=None):
    """
    Calculates the Macro F1-Score by averaging the F1-Scores of all classes.

    For every class ID in `classes` the one-vs-rest TP/FP/FN counts are
    computed from `y_true` and `y_pred`, turned into Precision, Recall and
    F1, and collected into a per-class table that is printed as markdown.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned position-by-position with `y_true`.
        classes: Iterable of class IDs to evaluate.
        class_names: Optional mapping from class ID to a display label for
            the table. Defaults to the three Cyrillic demo classes. IDs
            missing from the mapping are shown as 'Class <id>'.

    Returns:
        The unweighted mean of the per-class F1-Scores, or 0.0 when
        `classes` is empty (in which case no table is printed).
    """
    # Map class IDs to human-readable labels for display
    if class_names is None:
        class_names = {0: 'А (Common)', 1: 'Б (Medium)', 2: 'Щ (Rare)'}

    results = []

    # 1. Loop through each class label
    for class_id in classes:
        class_label = class_names.get(class_id, f'Class {class_id}')

        # 2. Calculate the Confusion Matrix components for the current class
        TP, FP, FN = calculate_confusion_components(y_true, y_pred, class_id)

        # 3. Calculate Precision, Recall, and F1-Score for the current class
        P = calculate_precision(TP, FP)
        R = calculate_recall(TP, FN)
        F1 = calculate_f1_score(P, R)

        # Store results for table display.
        # Support is the number of actual samples of this class, i.e. TP + FN.
        # Deriving it from the arguments keeps the column correct for any
        # dataset passed in, instead of depending on a module-level tally.
        results.append({
            'Class': class_label,
            'Support': TP + FN,
            'TP': TP, 'FP': FP, 'FN': FN,
            'Precision': P,
            'Recall': R,
            'F1-Score': F1
        })

    # 4. Nothing to average. This must be checked before the table is built,
    # because an empty DataFrame has no 'Class' column to index on.
    if not results:
        return 0.0

    # Display results nicely
    df = pd.DataFrame(results).set_index('Class')
    print("\n--- Per-Class Metrics ---")
    print(df.to_markdown(floatfmt=".4f"))

    # 5. Calculate the Macro F1-Score (unweighted average of all F1-scores)
    class_f1_scores = [row['F1-Score'] for row in results]
    macro_f1 = sum(class_f1_scores) / len(class_f1_scores)

    return macro_f1


# --- CELL 4: Execution and Analysis ---

if __name__ == "__main__":

    # Macro F1 comes first: the call also prints the per-class metrics table.
    final_macro_f1 = calculate_macro_f1(Y_TRUE, Y_PRED, CLASSES)

    # Plain accuracy, reported next to the macro F1 for comparison.
    total_samples = len(Y_TRUE)
    correct_predictions = (Y_TRUE == Y_PRED).sum()
    accuracy = correct_predictions / total_samples

    # Summary banner with both headline numbers.
    print("\n====================================")
    print(f"OVERALL ACCURACY = {accuracy:.4f} ({correct_predictions}/{total_samples})")
    print(f"FINAL RESULT: MACRO F1-SCORE = {final_macro_f1:.4f}")
    print("====================================")

    # Questions for the students to answer from the numbers above.
    discussion_prompts = (
        "1. Compare the Macro F1-Score to the Overall Accuracy. Why is the Macro F1-Score significantly lower?",
        "2. Look at the metrics for 'Щ' (Class 2). Why is its Recall 0.0 and its Precision 0.0?",
        "3. Explain why Macro F1-Score is the superior evaluation metric for this imbalanced OCR problem.",
    )
    print("\n[Discussion Prompts]:")
    for prompt in discussion_prompts:
        print(prompt)

--- 1. Data Setup: Multi-Class Imbalance Simulation ---
Total Samples: 100
Classes found: [0 1 2]
Class Support (Imbalance): {np.int64(0): 80, np.int64(1): 15, np.int64(2): 5}

--- Per-Class Metrics ---
| Class      |   Support |      TP |     FP |     FN |   Precision |   Recall |   F1-Score |
|:-----------|----------:|--------:|-------:|-------:|------------:|---------:|-----------:|
| А (Common) |   80.0000 | 80.0000 | 9.0000 | 0.0000 |      0.8989 |   1.0000 |     0.9467 |
| Б (Medium) |   15.0000 | 11.0000 | 0.0000 | 4.0000 |      1.0000 |   0.7333 |     0.8462 |
| Щ (Rare)   |    5.0000 |  0.0000 | 0.0000 | 5.0000 |      0.0000 |   0.0000 |     0.0000 |

OVERALL ACCURACY = 0.9100 (91/100)
FINAL RESULT: MACRO F1-SCORE = 0.5976

[Discussion Prompts]:
1. Compare the Macro F1-Score to the Overall Accuracy. Why is the Macro F1-Score significantly lower?
2. Look at the metrics for 'Щ' (Class 2). Why is its Recall 0.0 and its Precision 0.0?
3. Explain why Macro F1-Score is the superior 