In [1]:
pip install torch torchvision torchreid deep_sort_realtime

Note: you may need to restart the kernel to use updated packages.


In [2]:
!pip install deep-sort-realtime



In [4]:
%pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.88-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.6-py3-none-any.whl.metadata (9.1 kB)
Downloading ultralytics-8.2.88-py3-none-any.whl (871 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m871.7/871.7 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading ultralytics_thop-2.0.6-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.2.88 ultralytics-thop-2.0.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Path to the log file
log_file = '/kaggle/working/progress.log'

# List of already processed videos
processed_videos = [
    'videoplayback.mp4',
    'Preference Assessment with Toys_ Multiple Stimulus without Replacement (MSWO).mp4',
    'Exploring the Therapeutic Playroom.mp4',
    'Discrete Trial Training.mp4',
    'Group Therapy for Autism Spectrum Disorder.mp4',
    'Matching.mp4'
]

# Write these filenames to the log file
with open(log_file, 'w') as f:
    for video in processed_videos:
        f.write(video + '\n')

print(f"Preloaded {len(processed_videos)} processed videos into the log file.")


Preloaded 6 processed videos into the log file.


In [None]:
import cv2
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import os
from deep_sort_realtime.deepsort_tracker import DeepSort
from ultralytics import YOLO

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def load_feature_extractor():
    """Load the ResNet model for feature extraction and move it to GPU."""
    model = models.resnet50(pretrained=True)
    model.fc = torch.nn.Identity()  # Remove the final classification layer
    model = model.to(device)
    model.eval()
    return model

def preprocess_image(image):
    """Preprocess image for feature extraction."""
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    return transform(image).unsqueeze(0).to(device)

def extract_features(image, model):
    """Extract features from the image using the feature extractor model."""
    image = preprocess_image(image)
    with torch.no_grad():
        features = model(image)
    return features.cpu().numpy()

def initialize_tracker():
    """Initialize the DeepSORT tracker with default parameters."""
    return DeepSort(
        max_age=20,
        n_init=5,
        nms_max_overlap=0.5,
        max_cosine_distance=0.1,
        nn_budget=100
    )

def load_model(model_path):
    """Load the YOLO model for object detection."""
    model = YOLO(model_path)
    model.to(device)
    return model

def open_video(video_path):
    """Open the input video and return the capture object."""
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    return cap, frame_width, frame_height, fps

def create_video_writer(output_path, frame_width, frame_height, fps):
    """Create a VideoWriter object to save the output video."""
    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    return cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

def perform_detection(frame, model):
    """Perform detection on the frame using YOLO model."""
    results = model(frame)
    detections = []

    for result in results:
        if result.boxes is not None:
            for box in result.boxes:
                xyxy = box.xyxy[0]
                conf = box.conf[0]
                cls = int(box.cls[0])
                x1, y1, x2, y2 = map(int, xyxy)
                detections.append(([x1, y1, x2-x1, y2-y1], conf, cls))

    return detections

def update_tracking(detections, tracker, frame, feature_extractor):
    """Update the tracker with the detections."""
    features = []
    for detection in detections:
        bbox, conf, cls = detection
        x1, y1, w, h = bbox
        obj_image = frame[y1:y1+h, x1:x1+w]
        feature = extract_features(obj_image, feature_extractor)
        features.append(feature)

    tracks = tracker.update_tracks(detections, frame=frame)
    return tracks

def draw_tracks(frame, tracks, model):
    """Draw the bounding boxes and labels with IDs."""
    for track in tracks:
        if not track.is_confirmed():
            continue

        track_id = track.track_id
        ltrb = track.to_ltrb()
        x1, y1, x2, y2 = map(int, ltrb)

        cls = track.det_class
        label = f'ID {track_id}: {model.names[cls]}'

        color = (0, 255, 0)
        cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
        cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

    return frame

def process_video(input_video_path, output_video_path, model, tracker, feature_extractor, log_file):
    """Main pipeline to process the video."""
    cap, frame_width, frame_height, fps = open_video(input_video_path)
    out = create_video_writer(output_video_path, frame_width, frame_height, fps)

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        detections = perform_detection(frame, model)
        tracks = update_tracking(detections, tracker, frame, feature_extractor)
        frame = draw_tracks(frame, tracks, model)

        out.write(frame)

    cap.release()
    out.release()
    print(f'Video saved to {output_video_path}')

    # Log the processed video filename
    with open(log_file, 'a') as f:
        f.write(os.path.basename(input_video_path) + '\n')

def process_directory(input_dir, output_dir, model, tracker, feature_extractor, log_file):
    """Process all videos in the input directory and save to the output directory."""
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Read log file to get processed videos
    if os.path.exists(log_file):
        with open(log_file, 'r') as f:
            processed_videos = set(line.strip() for line in f)
    else:
        processed_videos = set()

    for filename in os.listdir(input_dir):
        if filename.endswith('.mp4'):
            if filename in processed_videos:
                print(f'Skipping {filename} (already processed)')
                continue

            input_path = os.path.join(input_dir, filename)
            output_path = os.path.join(output_dir, f'track_{filename}')
            print(f'Processing {input_path}...')
            process_video(input_path, output_path, model, tracker, feature_extractor, log_file)

if __name__ == "__main__":
    # Load the model and initialize the tracker
    model_path = '/kaggle/input/yolo-model/model c.pt'
    input_dir = '/kaggle/input/test-videos'
    output_dir = '/kaggle/working/output'
    log_file = '/kaggle/working/progress.log'

    model = load_model(model_path)
    feature_extractor = load_feature_extractor()
    tracker = initialize_tracker()

    # Process all videos in the input directory
    process_directory(input_dir, output_dir, model, tracker, feature_extractor, log_file)



Skipping Matching.mp4 (already processed)
Skipping Group Therapy for Autism Spectrum Disorder.mp4 (already processed)
Skipping Exploring the Therapeutic Playroom.mp4 (already processed)
Skipping Discrete Trial Training.mp4 (already processed)
Skipping videoplayback.mp4 (already processed)
Skipping Preference Assessment with Toys_ Multiple Stimulus without Replacement (MSWO).mp4 (already processed)
Processing /kaggle/input/test-videos/Speech Therapy Training Session- Moderate to Severe Autism.mp4...


OpenCV: FFMPEG: tag 0x44495658/'XVID' is not supported with codec id 12 and format 'mp4 / MP4 (MPEG-4 Part 14)'
OpenCV: FFMPEG: fallback to use tag 0x7634706d/'mp4v'



0: 384x640 1 Therapist, 1 Child, 72.9ms
Speed: 3.5ms preprocess, 72.9ms inference, 135.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Child, 18.3ms
Speed: 2.1ms preprocess, 18.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 Therapists, 1 Child, 18.8ms
Speed: 1.3ms preprocess, 18.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Child, 18.6ms
Speed: 1.3ms preprocess, 18.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Child, 18.8ms
Speed: 1.3ms preprocess, 18.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Child, 18.4ms
Speed: 1.3ms preprocess, 18.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Child, 22.0ms
Speed: 1.1ms preprocess, 22.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Therapist, 1 Chil