In [1]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from torchvision.transforms import functional as F
from PIL import Image
import timm
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class PersonTracker:
    def __init__(self, video_path, yolo_model="yolo11x.pt", embedding_model="resnet50"):
        self.video_path = video_path
        self.detector = YOLO(yolo_model)
        
        # Initialize embedding model
        self.embedding_model = timm.create_model(embedding_model, pretrained=True)
        self.embedding_model.eval()
        if torch.cuda.is_available():
            self.embedding_model = self.embedding_model.cuda()
        self.embedding_model.reset_classifier(0)
        
        self.person_embeddings = {}  # Track ID to embedding mapping
        self.target_id = None
        self.target_embedding = None
        self.similarity_threshold = 0.7

    def extract_embedding(self, frame, bbox):
        """Extract embedding from person crop"""
        x1, y1, x2, y2 = map(int, bbox)
        person_crop = frame[y1:y2, x1:x2]
        
        # Convert to PIL and preprocess
        person_crop = Image.fromarray(cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB))
        person_crop = person_crop.resize((224, 224))
        
        img_tensor = F.to_tensor(person_crop)
        img_tensor = F.normalize(img_tensor, 
                               mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
        img_tensor = img_tensor.unsqueeze(0)
        
        if torch.cuda.is_available():
            img_tensor = img_tensor.cuda()
            
        with torch.no_grad():
            embedding = self.embedding_model(img_tensor)
            
        return embedding.cpu().numpy().flatten()

    def analyze_frame(self, skip_seconds=0):
        """Analyze frame to detect persons and create embeddings mapping"""
        # Clear previous mappings
        self.person_embeddings.clear()
        self.target_id = None
        self.target_embedding = None
        
        # Get frame from video
        cap = cv2.VideoCapture(self.video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        skip_frames = int(fps * skip_seconds)
        
        # Skip frames if needed
        for _ in range(skip_frames):
            cap.read()
            
        success, frame = cap.read()
        cap.release()
        
        if not success:
            raise ValueError("Could not extract frame")
        
        # Run detection with tracking
        results = self.detector.track(frame, persist=True, classes=[0])
        
        if not results[0].boxes.data.shape[0]:
            raise ValueError("No persons detected in frame")
        
        # Create visualization frame
        display_frame = frame.copy()
        
        # Process each detection
        detected_persons = []
        boxes = results[0].boxes
        
        # Debug print
        print(f"Number of detections: {len(boxes)}")
        
        for box in boxes:
            # Check if track_id exists
            if hasattr(box, 'id') and box.id is not None:
                track_id = int(box.id.item())  # Convert tensor to int
                bbox = box.xyxy[0].cpu().numpy()  # Get box coordinates
                
                # Convert bbox to integers
                x1, y1, x2, y2 = map(int, bbox)
                
                # Extract embedding
                embedding = self.extract_embedding(frame, [x1, y1, x2, y2])
                
                # Store in mapping
                self.person_embeddings[track_id] = {
                    'embedding': embedding,
                    'bbox': [x1, y1, x2, y2]
                }
                
                # Draw bounding box and track ID
                cv2.rectangle(display_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                
                # Add track ID text
                label = f"ID: {track_id}"
                cv2.putText(display_frame, label, (x1, y1 - 10),
                           cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
                
                detected_persons.append({
                    'track_id': track_id,
                    'bbox': [x1, y1, x2, y2]
                })
        
        # Debug print
        print(f"Number of processed detections: {len(detected_persons)}")
        
        # Verify frame was modified
        if len(detected_persons) > 0:
            cv2.imwrite("debug_frame.jpg", display_frame)  # Save for debugging
        
        return {
            'frame': display_frame,
            'detected_persons': detected_persons
        }

    def track_person(self, target_id, display=True):
        """Track specified person through video"""
        if target_id not in self.person_embeddings:
            raise ValueError(f"Invalid target ID: {target_id}")
        
        self.target_id = target_id
        self.target_embedding = self.person_embeddings[target_id]['embedding']
        
        cap = cv2.VideoCapture(self.video_path)
        
        try:
            while cap.isOpened():
                success, frame = cap.read()
                if not success:
                    break
                
                # Run detection and tracking
                results = self.detector.track(frame, persist=True, classes=[0])
                
                best_match = None
                best_similarity = -1
                
                # Find best matching person
                for detection in results[0].boxes.data:
                    current_embedding = self.extract_embedding(frame, detection[:4])
                    similarity = 1 - cosine(self.target_embedding, current_embedding)
                    
                    if similarity > best_similarity:
                        best_similarity = similarity
                        best_match = {
                            'bbox': detection[:4],
                            'similarity': similarity
                        }
                
                # Update and display if good match found
                if best_match and best_match['similarity'] > self.similarity_threshold:
                    if display:
                        x1, y1, x2, y2 = map(int, best_match['bbox'])
                        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        
                        label = f"ID: {self.target_id} Sim: {best_match['similarity']:.2f}"
                        cv2.putText(frame, label, (x1, y1 - 10),
                                  cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
                        
                        cv2.imshow('Tracking', frame)
                        if cv2.waitKey(1) & 0xFF == ord('q'):
                            break
                    
                    # Update target embedding with temporal smoothing
                    current_embedding = self.extract_embedding(frame, best_match['bbox'])
                    self.target_embedding = 0.9 * self.target_embedding + 0.1 * current_embedding
                    self.target_embedding /= np.linalg.norm(self.target_embedding)
                
        finally:
            cap.release()
            if display:
                cv2.destroyAllWindows()



In [3]:
video_path = "/home/praveen/VisionAI/exp/test_k0GtOO6f.mp4"
tracker = PersonTracker(video_path)

In [4]:
print("Analyzing frame for person detection...")
result = tracker.analyze_frame(skip_seconds=0)

Analyzing frame for person detection...

0: 384x640 11 persons, 104.4ms
Speed: 2.5ms preprocess, 104.4ms inference, 99.5ms postprocess per image at shape (1, 3, 384, 640)
Number of detections: 11
Number of processed detections: 11


In [5]:
result_frame = result['frame']
detected_persons = result['detected_persons']

# save the frame
cv2.imwrite("detected_persons.jpg", result_frame)


True

In [6]:
# Get target ID from user
target_id = int(input("\nEnter the ID of the person to track: "))

# Start tracking
print(f"\nTracking person with ID: {target_id}")
tracker.track_person(target_id, display=True)


Tracking person with ID: 6

0: 384x640 11 persons, 47.5ms
Speed: 7.0ms preprocess, 47.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 39.7ms
Speed: 1.3ms preprocess, 39.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 42.1ms
Speed: 1.7ms preprocess, 42.1ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 40.5ms
Speed: 2.1ms preprocess, 40.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 40.5ms
Speed: 1.8ms preprocess, 40.5ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 41.4ms
Speed: 2.2ms preprocess, 41.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 41.6ms
Speed: 2.3ms preprocess, 41.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 41.4ms
Speed: 1.5ms preprocess, 41.4ms inference, 1.

In [None]:
def main():
    video_path = "path_to_your_video.mp4"
    tracker = PersonTracker(video_path)
    
    try:
        # Analyze first frame
        print("Analyzing frame for person detection...")
        result = tracker.analyze_frame(skip_seconds=0)
        
        # Display detected persons
        cv2.imshow('Detected Persons', result['frame'])
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        
        # Print detected person IDs
        print("\nDetected persons:")
        for person in result['detected_persons']:
            print(f"Track ID: {person['track_id']}")
            
        # Get target ID from user
        target_id = int(input("\nEnter the ID of the person to track: "))
        
        # Start tracking
        print(f"\nTracking person with ID: {target_id}")
        tracker.track_person(target_id, display=True)
        
    except ValueError as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()

In [None]:
def main():
    video_path = "/home/praveen/VisionAI/exp/test_k0GtOO6f.mp4"
    tracker = PersonTracker(video_path)
    
    # Initialize tracking with persons in first frame
    try:
        initial_frame, initial_results = tracker.initialize_tracking(skip_seconds=0)
        
        # Display initial detections
        for detection in initial_results.boxes.data:
            if hasattr(detection, 'id'):
                track_id = int(detection.id)
                bbox = detection[:4]
                initial_frame = tracker.draw_tracking_result(initial_frame, bbox, track_id)
        
        cv2.imshow('Initial Detections', initial_frame)
        cv2.waitKey(0)
        cv2.destroyAllWindows()
        
        # Start tracking (will automatically select largest person if target_id not specified)
        tracker.track_person(display=True)
        
    except ValueError as e:
        print(f"Error: {e}")

if __name__ == "__main__":
    main()


0: 384x640 11 persons, 61.3ms
Speed: 2.3ms preprocess, 61.3ms inference, 13.2ms postprocess per image at shape (1, 3, 384, 640)


In [1]:
from ultralytics import YOLO
model = YOLO('yolo11x.pt')

In [None]:
model.track(
    source='/home/praveen/VisionAI/exp/test_k0GtOO6f.mp4',  # can be a filename, RTSP URL, HTTP/HTTPS URL, or a connected camera
    conf=0.25,  # confidence threshold (0-1)
    iou=0.45,  # NMS IoU threshold (0-1)
    persist=True,
    show=True,  # show results
    tracker='/home/praveen/VisionAI/exp/botsort.yaml',  # tracker algorithm
)



errors for large sources or long-running streams and videos. See https://docs.ultralytics.com/modes/predict/ for help.

Example:
    results = model(source=..., stream=True)  # generator of Results objects
    for r in results:
        boxes = r.boxes  # Boxes object for bbox outputs
        masks = r.masks  # Masks object for segment masks outputs
        probs = r.probs  # Class probabilities for classification outputs

video 1/1 (frame 1/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 persons, 43.5ms
video 1/1 (frame 2/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 persons, 39.1ms
video 1/1 (frame 3/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 persons, 43.6ms
video 1/1 (frame 4/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 persons, 43.6ms
video 1/1 (frame 5/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 persons, 41.2ms
video 1/1 (frame 6/1056) /home/praveen/VisionAI/exp/test_k0GtOO6f.mp4: 384x640 11 

[ultralytics.engine.results.Results object with attributes:
 
 boxes: ultralytics.engine.results.Boxes object
 keypoints: None
 masks: None
 names: {0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus', 6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant', 11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat', 16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear', 22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag', 27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard', 32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove', 36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle', 40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl', 46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli', 51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair', 57: 'couch', 58: 'potted p

: 