In [15]:
# Jupyter Notebook Cell 1: Imports and Setup
import cv2
import numpy as np
import torch
import time
from pathlib import Path

# YOLOX imports
from yolox.data.data_augment import ValTransform
from yolox.data.datasets import COCO_CLASSES
from yolox.exp import get_exp
from yolox.utils import fuse_model, get_model_info, postprocess

# Custom visualization function
def vis(img, boxes, scores, cls_ids, conf=0.5, class_names=None, debug=True):
    """Draw labelled detection boxes onto ``img`` and return it.

    The image is modified in place; pass a copy to keep the original intact.

    Args:
        img: Image array accepted by the OpenCV drawing functions.
        boxes: Sequence of boxes; the first four values of each box are used
            as ``x0, y0, x1, y1`` pixel coordinates.
        scores: Per-box confidence scores, compared against ``conf``.
        cls_ids: Per-box class ids (converted with ``int``).
        conf: Boxes scoring below this value are not drawn.
        class_names: Optional sequence of labels indexed by class id. When it
            is ``None`` or has no entry for an id, the numeric id is used as
            the label instead of raising.
        debug: When true (the default), print one line per box, including
            boxes that fall below the threshold.

    Returns:
        The same ``img`` object with the detections drawn on it.
    """
    for i, (box, score, raw_cls_id) in enumerate(zip(boxes, scores, cls_ids)):
        if debug:
            print(f"Processing box {i}: {box}, score: {score}, class_id: {raw_cls_id}")
        if score < conf:
            continue

        cls_id = int(raw_cls_id)
        x0, y0, x1, y1 = (int(v) for v in box[:4])

        # Wrap around the palette so ids beyond its length still get a color.
        unit_color = _COLORS[cls_id % len(_COLORS)]
        color = (unit_color * 255).astype(np.uint8).tolist()

        # Fall back to the numeric id when no usable label is available.
        if class_names is not None and 0 <= cls_id < len(class_names):
            label = class_names[cls_id]
        else:
            label = str(cls_id)
        text = '{}:{:.1f}%'.format(label, score * 100)

        # Dark text on light colors, light text on dark colors.
        txt_color = (0, 0, 0) if np.mean(unit_color) > 0.5 else (255, 255, 255)
        font = cv2.FONT_HERSHEY_SIMPLEX

        txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
        cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)

        # Filled, slightly darkened label background anchored at the box's top-left.
        txt_bk_color = (unit_color * 255 * 0.7).astype(np.uint8).tolist()
        cv2.rectangle(
            img,
            (x0, y0 + 1),
            (x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])),
            txt_bk_color,
            -1,
        )
        cv2.putText(img, text, (x0, y0 + txt_size[1]), font, 0.4, txt_color, thickness=1)

    return img

# Color palette for visualization.
# A flat list of 240 floats in [0, 1], reshaped below into an (80, 3) float32
# array: one 3-component color per row. vis() indexes it by class id and
# scales the row by 255 before handing it to the OpenCV drawing calls.
# NOTE(review): the triples are passed to OpenCV unchanged, which presumably
# reads them as BGR rather than RGB - confirm if exact hues matter.
_COLORS = np.array([
    0.000, 0.447, 0.741,
    0.850, 0.325, 0.098,
    0.929, 0.694, 0.125,
    0.494, 0.184, 0.556,
    0.466, 0.674, 0.188,
    0.301, 0.745, 0.933,
    0.635, 0.078, 0.184,
    0.300, 0.300, 0.300,
    0.600, 0.600, 0.600,
    1.000, 0.000, 0.000,
    1.000, 0.500, 0.000,
    0.749, 0.749, 0.000,
    0.000, 1.000, 0.000,
    0.000, 0.000, 1.000,
    0.667, 0.000, 1.000,
    0.333, 0.333, 0.000,
    0.333, 0.667, 0.000,
    0.333, 1.000, 0.000,
    0.667, 0.333, 0.000,
    0.667, 0.667, 0.000,
    0.667, 1.000, 0.000,
    1.000, 0.333, 0.000,
    1.000, 0.667, 0.000,
    1.000, 1.000, 0.000,
    0.000, 0.333, 0.500,
    0.000, 0.667, 0.500,
    0.000, 1.000, 0.500,
    0.333, 0.000, 0.500,
    0.333, 0.333, 0.500,
    0.333, 0.667, 0.500,
    0.333, 1.000, 0.500,
    0.667, 0.000, 0.500,
    0.667, 0.333, 0.500,
    0.667, 0.667, 0.500,
    0.667, 1.000, 0.500,
    1.000, 0.000, 0.500,
    1.000, 0.333, 0.500,
    1.000, 0.667, 0.500,
    1.000, 1.000, 0.500,
    0.000, 0.333, 1.000,
    0.000, 0.667, 1.000,
    0.000, 1.000, 1.000,
    0.333, 0.000, 1.000,
    0.333, 0.333, 1.000,
    0.333, 0.667, 1.000,
    0.333, 1.000, 1.000,
    0.667, 0.000, 1.000,
    0.667, 0.333, 1.000,
    0.667, 0.667, 1.000,
    0.667, 1.000, 1.000,
    1.000, 0.000, 1.000,
    1.000, 0.333, 1.000,
    1.000, 0.667, 1.000,
    0.333, 0.000, 0.000,
    0.500, 0.000, 0.000,
    0.667, 0.000, 0.000,
    0.833, 0.000, 0.000,
    1.000, 0.000, 0.000,
    0.000, 0.167, 0.000,
    0.000, 0.333, 0.000,
    0.000, 0.500, 0.000,
    0.000, 0.667, 0.000,
    0.000, 0.833, 0.000,
    0.000, 1.000, 0.000,
    0.000, 0.000, 0.167,
    0.000, 0.000, 0.333,
    0.000, 0.000, 0.500,
    0.000, 0.000, 0.667,
    0.000, 0.000, 0.833,
    0.000, 0.000, 1.000,
    0.000, 0.000, 0.000,
    0.143, 0.143, 0.143,
    0.286, 0.286, 0.286,
    0.429, 0.429, 0.429,
    0.571, 0.571, 0.571,
    0.714, 0.714, 0.714,
    0.857, 0.857, 0.857,
    0.000, 0.447, 0.741,
    0.314, 0.717, 0.741,
    0.50, 0.5, 0
]).astype(np.float32).reshape(-1, 3)

# Progress marker printed once the imports, vis() and the palette are defined.
print("✅ Imports and setup complete!")

# Jupyter Notebook Cell 2: YOLOX Predictor Class
class YOLOXVideoPredictor:
    """Wrap a single-class ("person") YOLOX model for frame-by-frame inference.

    The constructor builds the experiment and model, loads the checkpoint on
    the CPU and switches the model to eval mode. ``inference`` runs one frame
    through the model and ``visualize`` draws the resulting boxes.
    """

    def __init__(self, model_path, model_name="yolox-nano", conf_threshold=0.3, nms_threshold=0.45, input_size=416):
        """
        Initialize YOLOX predictor for video inference

        Args:
            model_path (str): Path to the trained model checkpoint
            model_name (str): YOLOX model variant (yolox-nano, yolox-s, etc.)
            conf_threshold (float): Confidence threshold for detections
            nms_threshold (float): NMS threshold
            input_size (int): Input image size (used for both height and width)

        Raises:
            FileNotFoundError: If ``model_path`` does not exist. Checked before
                any model is built so a bad path fails immediately.
        """
        self.model_path = model_path
        self.conf_threshold = conf_threshold
        self.nms_threshold = nms_threshold
        self.input_size = (input_size, input_size)

        # Fail fast: validate the checkpoint path before the (slower) model setup.
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model file not found: {model_path}")

        # Load experiment and model
        print(f"🔄 Loading YOLOX model: {model_name}")
        self.exp = get_exp(exp_file=None, exp_name=model_name)

        # Override settings to match the trained model
        self.exp.num_classes = 1  # Single class (person)
        self.exp.class_names = ["person"]
        self.exp.test_size = self.input_size
        self.exp.test_conf = conf_threshold
        self.exp.nmsthre = nms_threshold

        # Create model
        self.model = self.exp.get_model()
        print(f"📊 Model Summary: {get_model_info(self.model, self.exp.test_size)}")

        # Load checkpoint
        print(f"📦 Loading checkpoint from: {model_path}")
        # NOTE(security): weights_only=False unpickles arbitrary objects, so
        # only load checkpoints that come from a trusted source.
        ckpt = torch.load(model_path, map_location="cpu", weights_only=False)

        # strict=False tolerates key mismatches; report them instead of
        # silently running with partially initialised weights.
        load_result = self.model.load_state_dict(ckpt["model"], strict=False)
        if load_result.missing_keys:
            print(
                f"⚠️ Checkpoint is missing {len(load_result.missing_keys)} model key(s), "
                f"e.g. {load_result.missing_keys[0]}"
            )
        if load_result.unexpected_keys:
            print(
                f"⚠️ Checkpoint has {len(load_result.unexpected_keys)} unexpected key(s), "
                f"e.g. {load_result.unexpected_keys[0]}"
            )
        self.model.eval()

        # Setup preprocessing
        self.preproc = ValTransform(legacy=False)

        print("✅ Model loaded successfully!")

    def inference(self, frame):
        """Run inference on a single frame.

        Args:
            frame: Image array whose first two dimensions are height and width.

        Returns:
            tuple: ``(detections, ratio)``. ``detections`` is the first entry
            of the ``postprocess`` output (``visualize`` treats ``None`` as
            "no detections"); ``ratio`` is the scale factor between the
            original frame and the model input size.
        """
        height, width = frame.shape[:2]

        # Scale that fits the frame inside the model input while keeping aspect.
        # NOTE(review): presumably the same ratio ValTransform applies - confirm.
        ratio = min(self.input_size[0] / height, self.input_size[1] / width)

        # Preprocess frame and add a batch dimension.
        img, _ = self.preproc(frame, None, self.input_size)
        img = torch.from_numpy(img).unsqueeze(0).float()

        # Run inference without tracking gradients.
        with torch.no_grad():
            outputs = self.model(img)
            outputs = postprocess(
                outputs,
                self.exp.num_classes,
                self.conf_threshold,
                self.nms_threshold,
                class_agnostic=True,
            )

        return outputs[0], ratio

    def visualize(self, frame, detections, ratio):
        """Draw detections on a copy of ``frame``.

        Args:
            frame: Original image the detections belong to.
            detections: Detection tensor from ``inference``, or ``None``.
            ratio: Scale factor returned by ``inference``; boxes are divided
                by it to map them back to the original frame size.

        Returns:
            ``frame`` itself when ``detections`` is ``None``; otherwise an
            annotated copy.
        """
        if detections is None:
            return frame

        # Convert detections to numpy
        detections = detections.cpu().numpy()

        # Columns 0-3: box corners; 4 and 5: objectness and class confidence;
        # 6: class id.
        bboxes = detections[:, 0:4] / ratio  # Scale back to original size
        scores = detections[:, 4] * detections[:, 5]  # obj_conf * cls_conf
        cls_ids = detections[:, 6]

        return vis(
            frame.copy(),
            bboxes,
            scores,
            cls_ids,
            conf=self.conf_threshold,
            class_names=self.exp.class_names,
        )

print("✅ YOLOXVideoPredictor class defined!")

# Jupyter Notebook Cell 3: Main Video Inference Function
def run_yolox_video_inference(
    video_path,
    model_path=r"C:\Users\Tairin Pairor\Documents\Github\Tarin%20Project\person-detection\models\best_ckpt 1.pth",
    conf_threshold=0.3,
    nms_threshold=0.45,
    input_size=416,
    model_name="yolox-nano",
    show_fps=True,
    flip_video=False,
    show_overlay=False
):
    """
    Run YOLOX inference on video with live display

    Keys handled in the display window: 'q' or ESC quits, 'p' toggles pause,
    SPACE advances exactly one frame while paused, 'r' restarts the video.

    Args:
        video_path (str | int): Path to input video file, or a camera index
        model_path (str): Path to YOLOX model checkpoint
        conf_threshold (float): Confidence threshold for detections
        nms_threshold (float): NMS threshold
        input_size (int): Model input size
        model_name (str): YOLOX model variant
        show_fps (bool): Include an FPS line in the info overlay
        flip_video (bool): Flip video vertically (for thermal cameras)
        show_overlay (bool): Draw the frame/inference/person-count overlay on
            each frame. Off by default, matching the previous behavior where
            the overlay was computed but not drawn.

    Returns:
        None. Initialization and open failures are reported with a printed
        message rather than raised.
    """

    print(f"🎬 Starting video inference on: {video_path}")
    print(f"🤖 Using model: {model_path}")
    print(f"⚙️ Settings: conf={conf_threshold}, nms={nms_threshold}, size={input_size}")

    # Initialize predictor; any failure is reported and aborts the run.
    try:
        predictor = YOLOXVideoPredictor(
            model_path=model_path,
            model_name=model_name,
            conf_threshold=conf_threshold,
            nms_threshold=nms_threshold,
            input_size=input_size
        )
    except Exception as e:
        print(f"❌ Failed to initialize predictor: {e}")
        return

    # Open video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"❌ Failed to open video: {video_path}")
        cap.release()
        return

    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    print(f"📹 Video info: {width}x{height}, {fps:.1f}fps, {frame_count} frames")
    print("🎮 Controls: 'q' to quit, 'p' to pause, SPACE to step frame")

    window_name = "YOLOX Person Detection"
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)

    frame_idx = 0
    paused = False
    step_once = False  # read exactly one more frame even though paused
    last_time = time.time()
    annotated = None  # most recent annotated frame, reused while paused

    try:
        while True:
            # Only read, flip and run the model when a new frame is wanted.
            # While paused the last annotated frame is redisplayed, so it is
            # neither flipped again nor pushed through inference repeatedly.
            if not paused or step_once:
                step_once = False
                ret, frame = cap.read()
                if not ret:
                    print("📺 End of video reached")
                    break
                frame_idx += 1

                # Optional: flip frame (useful for thermal cameras)
                if flip_video:
                    frame = cv2.flip(frame, 0)

                start_time = time.time()
                detections, ratio = predictor.inference(frame)
                inference_time = time.time() - start_time

                annotated = predictor.visualize(frame, detections, ratio)

                if show_overlay:
                    person_count = len(detections) if detections is not None else 0
                    info_text = [
                        f"Frame: {frame_idx}/{frame_count}",
                        f"Inference: {inference_time*1000:.1f}ms",
                        f"Persons: {person_count}",
                    ]
                    if show_fps:
                        current_time = time.time()
                        elapsed = current_time - last_time
                        display_fps = 1.0 / elapsed if elapsed > 0 else 0
                        info_text.append(f"FPS: {display_fps:.1f}")
                        last_time = current_time

                    y_offset = 30
                    for text in info_text:
                        cv2.putText(annotated, text, (10, y_offset),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
                        y_offset += 30

            # Draw the pause banner on a copy so the cached frame stays clean.
            display = annotated
            if paused:
                display = annotated.copy()
                cv2.putText(display, "PAUSED - Press 'p' to resume",
                            (10, display.shape[0] - 20), cv2.FONT_HERSHEY_SIMPLEX,
                            0.7, (0, 255, 255), 2)

            cv2.imshow(window_name, display)

            # Handle keyboard input
            key = cv2.waitKey(1) & 0xFF
            if key == ord('q') or key == 27:  # 'q' or ESC
                print("🛑 Stopping video inference")
                break
            elif key == ord('p'):  # Toggle pause
                paused = not paused
                print(f"⏸️ Paused: {paused}")
            elif key == ord(' ') and paused:  # Step one frame, stay paused
                step_once = True
            elif key == ord('r'):  # Restart video
                cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
                frame_idx = 0
                step_once = True  # show the first frame even while paused
                print("🔄 Restarting video")
    finally:
        # Always release the capture and close the window, even on errors.
        cap.release()
        cv2.destroyAllWindows()

    print("✅ Video inference completed!")

print("✅ Main inference function defined!")

# Jupyter Notebook Cell 4: Quick Test Function
def quick_test():
    """Prompt for a video source and run inference with the default settings.

    An empty answer selects the webcam (camera index 0); anything else is
    used as the video path. The run uses conf=0.3, nms=0.45, the FPS option
    enabled and vertical flipping turned on.
    """
    answer = input("Enter video path (or press Enter for webcam): ").strip()

    if answer:
        source = answer
    else:
        print("📹 Using webcam (camera 0)")
        source = 0

    options = {
        "conf_threshold": 0.3,
        "nms_threshold": 0.45,
        "show_fps": True,
        "flip_video": True,  # Set to True for thermal cameras
    }
    run_yolox_video_inference(video_path=source, **options)

# Usage hints printed once every definition above has been executed.
print("✅ Quick test function ready!")
print("\n🚀 Ready to use! Call run_yolox_video_inference() with your video path")
print("📝 Example usage:")
print('run_yolox_video_inference(r"C:\\path\\to\\your\\video.mp4")')
print("📝 Or use quick_test() for interactive setup")

# Input clip selection: exactly one `path` assignment should be active; the
# commented-out lines are alternative recordings kept for quick switching.
# path =  r"C:\Users\Tairin Pairor\Downloads\NTUC-P-57_202506031442_202506031445.webm"
path = r"C:\Users\Tairin Pairor\Downloads\NTUC-P-57_202506031631_202506031634.webm"
# path = r"C:\Users\Tairin Pairor\Downloads\ruoxuan.webm"
# path = r"C:\Users\Tairin Pairor\Downloads\NTUC-P-57_202506031838_202506031840.webm"
# path = r"C:\Users\Tairin Pairor\Downloads\NTUC-P-57_202506041801_202506041803.webm"

# Run inference on the selected clip. This executes immediately when the cell
# runs and uses the default model checkpoint path of run_yolox_video_inference.
run_yolox_video_inference(
    video_path=path,
    conf_threshold=0.82,
    nms_threshold=0.45,
    flip_video=False  # No vertical flip; set True if the footage is upside down (e.g. thermal cameras)
)

✅ Imports and setup complete!
✅ YOLOXVideoPredictor class defined!
✅ Main inference function defined!
✅ Quick test function ready!

🚀 Ready to use! Call run_yolox_video_inference() with your video path
📝 Example usage:
run_yolox_video_inference(r"C:\path\to\your\video.mp4")
📝 Or use quick_test() for interactive setup
🎬 Starting video inference on: C:\Users\Tairin Pairor\Downloads\NTUC-P-57_202506031631_202506031634.webm
🤖 Using model: C:\Users\Tairin Pairor\Documents\Github\Tarin%20Project\person-detection\models\best_ckpt 1.pth
⚙️ Settings: conf=0.82, nms=0.45, size=416
🔄 Loading YOLOX model: yolox-nano
📊 Model Summary: Params: 2.24M, Gflops: 2.93
📦 Loading checkpoint from: C:\Users\Tairin Pairor\Documents\Github\Tarin%20Project\person-detection\models\best_ckpt 1.pth
✅ Model loaded successfully!
📹 Video info: 384x288, 5.0fps, 849 frames
🎮 Controls: 'q' to quit, 'p' to pause, SPACE to step frame
Processing box 0: [  1.0801743 167.39421    96.99274   262.32913  ], score: 0.833284318447