Imports and Configuration

In [1]:
# ==== CPU-only MNIST Webcam Inference ====
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU

import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model

try:
    # Second line of defence after CUDA_VISIBLE_DEVICES=-1: ask TensorFlow
    # itself to expose no GPU devices.
    tf.config.set_visible_devices([], 'GPU')
except Exception as exc:
    # Still best-effort (TensorFlow documents a RuntimeError once its runtime
    # is already initialised, e.g. when this cell is re-run), but say so
    # instead of failing silently so an unexpected GPU run can be diagnosed.
    print(f"Note: tf.config.set_visible_devices failed ({exc!r}); "
          "relying on CUDA_VISIBLE_DEVICES=-1 for CPU-only execution.")

# ----------------- Config -----------------
# Tunable settings read by the cells below.
MODEL_PATH = "mnist_cnn_advanced.h5"  # Keras model file; its existence is checked before load_model() is called
CAM_INDEX = 0  # device index passed to cv2.VideoCapture
ROI_SIZE = 500  # side length in pixels of the square ROI centred in each frame
# NOTE(review): 500 is larger than the height of a 640x480 frame - confirm
# against the resolution your camera actually delivers.
USE_AUTO_THRESH = True  # initial threshold mode: True = adaptive, False = fixed; toggled at runtime with 't'
BIN_THRESH = 140  # initial fixed threshold; changed at runtime in steps of 5 with '[' and ']'

Load Model

In [2]:
# ----------------- Model Loading -----------------
# Check for the file first so a missing model produces a clear, short error
# that names the path.
if os.path.exists(MODEL_PATH):
    # compile=False: this notebook only calls model.predict(), never fit().
    model = load_model(MODEL_PATH, compile=False)
    print("Model loaded successfully!")
else:
    raise FileNotFoundError(f"Model not found: '{MODEL_PATH}'")

Model loaded successfully!


Preprocessing Function

In [3]:
# ----------------- Preprocessing Function -----------------
def preprocess_webcam_image(bgr_roi, use_auto=True, manual_thresh=140):
    """Convert a BGR region of interest into a 28x28 MNIST-style network input.

    Pipeline: grayscale -> 5x5 Gaussian blur -> inverse binary threshold ->
    3x3 median blur -> one 3x3 dilation -> crop to the foreground bounding box
    plus 6 px of padding -> resize so the longer side is 20 px -> paste in the
    middle of a black 28x28 canvas -> whole-pixel shift that moves the
    foreground centroid towards (14, 14).

    Args:
        bgr_roi: BGR image accepted by ``cv2.cvtColor(..., cv2.COLOR_BGR2GRAY)``.
        use_auto: When truthy, use adaptive Gaussian thresholding (block size
            25, constant 10); otherwise threshold at ``manual_thresh``.
        manual_thresh: Fixed threshold level, used only when ``use_auto`` is
            falsy.

    Returns:
        Tuple ``(x_in, dbg)``: ``x_in`` is a float32 array of shape
        (1, 28, 28, 1) scaled to [0, 1]; ``dbg`` is the same picture as a
        28x28 uint8 array. Both are all zeros when thresholding leaves no
        foreground pixel.
    """
    smoothed = cv2.GaussianBlur(cv2.cvtColor(bgr_roi, cv2.COLOR_BGR2GRAY), (5, 5), 0)

    # Inverse thresholding: pixels darker than the threshold become white (255).
    if not use_auto:
        mask = cv2.threshold(smoothed, manual_thresh, 255, cv2.THRESH_BINARY_INV)[1]
    else:
        mask = cv2.adaptiveThreshold(
            smoothed, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 25, 10
        )

    # Remove speckle, then thicken the strokes slightly.
    mask = cv2.medianBlur(mask, 3)
    kernel = np.ones((3, 3), np.uint8)
    mask = cv2.dilate(mask, kernel, iterations=1)

    rows, cols = np.nonzero(mask)
    if rows.size == 0:
        # Nothing drawn: hand back an all-black input and debug image.
        return np.zeros((1, 28, 28, 1), dtype=np.float32), np.zeros((28, 28), dtype=np.uint8)

    # Foreground bounding box, grown by a margin and clipped to the mask.
    margin = 6
    top = max(0, rows.min() - margin)
    bottom = min(mask.shape[0], rows.max() + margin + 1)
    left = max(0, cols.min() - margin)
    right = min(mask.shape[1], cols.max() + margin + 1)
    glyph = mask[top:bottom, left:right]

    # Scale so the longer side becomes 20 px, keeping the aspect ratio.
    box_h, box_w = glyph.shape
    if box_w > box_h:
        out_w = 20
        out_h = max(1, int(round(box_h * (20.0 / box_w))))
    else:
        out_h = 20
        out_w = max(1, int(round(box_w * (20.0 / box_h))))
    glyph = cv2.resize(glyph, (out_w, out_h), interpolation=cv2.INTER_AREA)

    # Paste into the middle of a black 28x28 canvas.
    canvas = np.zeros((28, 28), dtype=np.uint8)
    row_off = (28 - out_h) // 2
    col_off = (28 - out_w) // 2
    canvas[row_off:row_off + out_h, col_off:col_off + out_w] = glyph

    # Re-centre on the centroid of the non-zero pixels (binary moments).
    moments = cv2.moments(canvas, binaryImage=True)
    area = moments["m00"]
    if area != 0:
        shift_x = int(round(14 - moments["m10"] / area))
        shift_y = int(round(14 - moments["m01"] / area))
        translation = np.array([[1, 0, shift_x], [0, 1, shift_y]], dtype=np.float32)
        canvas = cv2.warpAffine(canvas, translation, (28, 28))

    x_in = (canvas.astype(np.float32) / 255.0).reshape(1, 28, 28, 1)
    return x_in, canvas

Main Webcam Function

In [4]:
# ----------------- Main Webcam Function -----------------
def webcam_prediction_fixed_input():
    """Classify what the webcam sees, frame by frame, until the user quits.

    For every frame, the centred square ROI is passed through
    ``preprocess_webcam_image`` and the module-level ``model``. The model
    outputs of the last five frames are averaged to steady the displayed
    prediction. The annotated frame (ROI outline, prediction, confidence,
    threshold mode and the 28x28 network input) is shown in an OpenCV window.

    Keys:
        q  quit
        t  toggle adaptive / fixed thresholding
        [  lower the fixed threshold by 5 (never below 0)
        ]  raise the fixed threshold by 5 (never above 255)

    Reads the module-level ``CAM_INDEX``, ``ROI_SIZE``, ``USE_AUTO_THRESH``,
    ``BIN_THRESH`` and ``model``.

    Returns:
        None. Prints an error and returns early if the camera cannot be
        opened. The camera and all OpenCV windows are released on every exit
        path, including exceptions and kernel interrupts.
    """
    cap = cv2.VideoCapture(CAM_INDEX)
    if not cap.isOpened():
        cap.release()
        print("ERROR: Could not open webcam")
        return

    print("Webcam ready! Draw numbers in the green box.")
    print("Press 'q' to quit, 't' to toggle auto/fixed threshold")
    print("Use '[' and ']' to adjust fixed threshold")

    use_auto_thresh = USE_AUTO_THRESH
    bin_thresh = BIN_THRESH

    # For stabilizing predictions
    prediction_history = []
    history_size = 5  # Number of previous predictions to consider

    # OpenCV colours are BGR.
    roi_color = (0, 255, 0)   # green, as promised by the on-screen instructions
    text_color = (255, 0, 0)  # blue overlay text

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Define a centred square ROI. Clamp it to the frame: with an ROI
            # larger than the frame the start offsets go negative, and a
            # negative slice start wraps around to the far edge, silently
            # producing a thin sliver instead of the intended region.
            height, width = frame.shape[:2]
            roi_size = min(ROI_SIZE, height, width)
            x1 = (width - roi_size) // 2
            y1 = (height - roi_size) // 2
            x2 = x1 + roi_size
            y2 = y1 + roi_size

            # Draw ROI on a copy so the outline never reaches the classifier.
            display_frame = frame.copy()
            cv2.rectangle(display_frame, (x1, y1), (x2, y2), roi_color, 2)

            # Extract and process ROI
            roi = frame[y1:y2, x1:x2]
            processed, debug_img = preprocess_webcam_image(roi, use_auto_thresh, bin_thresh)

            # Predict
            with tf.device('/CPU:0'):
                prediction = model.predict(processed, verbose=0)

            # Keep only the most recent `history_size` outputs.
            prediction_history.append(prediction[0])
            if len(prediction_history) > history_size:
                prediction_history.pop(0)

            # Average over the window to damp frame-to-frame flicker.
            avg_prediction = np.mean(prediction_history, axis=0)
            predicted_class = np.argmax(avg_prediction)
            confidence = np.max(avg_prediction)

            # Display results
            cv2.putText(display_frame, f"Pred: {predicted_class}", (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 1, text_color, 2)
            cv2.putText(display_frame, f"Conf: {confidence:.2f}", (10, 70),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, text_color, 2)

            # Show threshold mode
            thresh_mode = "AUTO" if use_auto_thresh else f"FIXED({bin_thresh})"
            cv2.putText(display_frame, f"Thresh: {thresh_mode}", (10, 110),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, text_color, 1)

            # Show processed 28x28 image (what CNN sees) in the top-right
            # corner; skipped on frames too small to hold the 100x100 thumbnail.
            if height >= 110 and width >= 110:
                debug_display = cv2.resize(debug_img, (100, 100))
                debug_display = cv2.cvtColor(debug_display, cv2.COLOR_GRAY2BGR)
                display_frame[10:110, width-110:width-10] = debug_display
                cv2.putText(display_frame, "CNN Sees", (width-105, 120),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.4, text_color, 1)

            cv2.imshow('MNIST Prediction (Fixed Input)', display_frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
            elif key == ord('t'):
                use_auto_thresh = not use_auto_thresh
                prediction_history.clear()  # Reset history when changing threshold
            elif key == ord('['):
                bin_thresh = max(0, bin_thresh - 5)
                prediction_history.clear()  # Reset history when changing threshold
            elif key == ord(']'):
                bin_thresh = min(255, bin_thresh + 5)
                prediction_history.clear()  # Reset history when changing threshold
    finally:
        # Always free the camera and close windows, even on an exception or a
        # kernel interrupt; otherwise the device stays locked until restart.
        cap.release()
        cv2.destroyAllWindows()

Run the Webcam

In [5]:
# ----------------- Run Webcam Inference -----------------
# The call below blocks until the capture loop ends ('q' pressed or a frame
# read fails). It also returns normally when the camera cannot be opened, so
# the final message is printed on every exit path.
print("Starting webcam inference...")
webcam_prediction_fixed_input()
print("Webcam inference completed!")

Starting webcam inference...


Webcam ready! Draw numbers in the green box.
Press 'q' to quit, 't' to toggle auto/fixed threshold
Use '[' and ']' to adjust fixed threshold
Webcam inference completed!
