In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
import cv2
import torch
import os
import pandas as pd
from pathlib import Path
from tqdm.notebook import tqdm

In [3]:
# !python3 -m pip install --force-reinstall moviepy

In [4]:
import cv2
import torch
import pandas as pd
from tqdm.notebook import tqdm

# === CONFIG ===
video_path = "./Oscars_2020_Highlights_Oscars_2009_Kate_Montage.mp4"
output_csv = './detections_log.csv'
output_video = './output_with_boxes.mp4' # This will be the video WITHOUT audio initially
final_output_video_with_audio = './output_final_with_audio.mp4' # New: This will be the final video WITH audio
model_path = './celeb_yolov5s7/weights/best.pt'

min_conf_score = 0.1
max_conf_score = 0.9

# === LOAD MODEL ===
model = torch.hub.load('yolov5', 'custom', path=model_path, source='local')
model.conf = min_conf_score
model.classes = [0, 1, 3, 4, 5, 6, 8]

class_wise_confidence_scores = {0: 0.1,
                                1: 0.75,
                                3: 0.95,
                                4: 0.89,
                                5: 0.85,
                                6: 0.9,
                                8: 0.9}


# === OPEN VIDEO ===
cap = cv2.VideoCapture(video_path)
framerate = cap.get(cv2.CAP_PROP_FPS)
frame_width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count  = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
print(f"frame count: {frame_count}")

# === INIT VIDEO WRITER ===
# Use 'mp4v' for .mp4. If you encounter issues, try 'XVID' for .avi and then convert.
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video, fourcc, framerate, (frame_width, frame_height))

frame_id = 0
all_detections = []
reappearances = ""
detected_classes = {0}

print(f"Running inference on {video_path}...")

# Use ceil to ensure progress bar accounts for partial seconds/frames
pbar = tqdm(total=frame_count, desc="Processing Frames") 
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame)
    detections = results.xyxy[0].cpu().numpy()

    for det in detections:
        x1, y1, x2, y2, conf, cls = det
        cls = int(cls)

        # Draw bounding box
        label = f"{model.names[cls]} {conf:.2f}"
        color = (0, 255, 0) # Green BGR
        cv2.rectangle(frame, (int(x1), int(y1)), (int(x2), int(y2)), color, 2)
        cv2.putText(frame, label, (int(x1), int(y1) - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, color, 2)

        # Apply class-wise confidence threshold
        if conf < class_wise_confidence_scores[cls]:
            continue
        
        # # Skip class 0 if it's meant to be ignored (e.g., general person)
        # if cls == 0:
        #     continue

        timestamp_seconds = frame_id / framerate
        minutes = int(timestamp_seconds // 60)
        seconds = int(timestamp_seconds % 60)
        timestamp = f"{minutes:02}:{seconds:02}" # Format as MM:SS

        if cls in detected_classes:
            reappearances += (
                f"reappearance of {model.names[cls]} at frame {frame_id} "
                f"with confidence {conf:.3f} at timestamp = {timestamp}.\n"
            )
            # You might want to log reappearances to a separate list/file if needed
            continue # Skip logging reappearance to all_detections

        all_detections.append({
            'frame': frame_id,
            'timestamp': timestamp_seconds, # Storing in seconds for precision
            'x1': round(x1, 2),
            'y1': round(y1, 2),
            'x2': round(x2, 2),
            'y2': round(y2, 2),
            'conf': round(conf, 3),
            'class': cls,
            'label': model.names[cls]
        })
        detected_classes.add(cls)
        print(f"{model.names[cls]} detected at frame {frame_id}; "
              f"timestamp = {timestamp}; confidence: {round(conf, 3)}")

    out.write(frame)  # Save annotated frame
    frame_id += 1
    pbar.update(1) # Update progress bar for each frame

pbar.close()
cap.release()
out.release()
print(f"Inference complete. Processed {frame_id} frames. Video saved to {output_video}")

# === SAVE LOG ===
df = pd.DataFrame(all_detections)
df.to_csv(output_csv, index=False)
print(f"Detections saved to {output_csv}")

  import pkg_resources as pkg
YOLOv5  v7.0-423-gc488fd40 Python-3.12.6 torch-2.5.1+cu121 CUDA:0 (NVIDIA GeForce GTX 1050, 4096MiB)

Fusing layers... 
Model summary: 157 layers, 7034398 parameters, 0 gradients, 15.8 GFLOPs
Adding AutoShape... 


frame count: 11620
Running inference on ./Oscars_2020_Highlights_Oscars_2009_Kate_Montage.mp4...


Processing Frames:   0%|          | 0/11620 [00:00<?, ?it/s]

Tom Hanks detected at frame 126; timestamp = 00:04; confidence: 0.9049999713897705
Brad Pitt detected at frame 696; timestamp = 00:23; confidence: 0.7860000133514404
Scarlett Johansson detected at frame 2060; timestamp = 01:08; confidence: 0.9300000071525574
Leonardo DiCaprio detected at frame 5277; timestamp = 02:55; confidence: 0.8669999837875366
Hugh Jackman detected at frame 5400; timestamp = 03:00; confidence: 0.9539999961853027
Kate Winslet detected at frame 9196; timestamp = 05:06; confidence: 0.8930000066757202
Inference complete. Processed 11620 frames. Video saved to ./output_with_boxes.mp4
Detections saved to ./detections_log.csv


In [5]:
# === MERGE AUDIO ===
from moviepy.editor import VideoFileClip, AudioFileClip # Import moviepy
print(f"Merging audio from {video_path} to {output_video}...")
try:
    video_clip = VideoFileClip(output_video)
    audio_clip = AudioFileClip(video_path) # Extract audio from original video

    final_clip = video_clip.set_audio(audio_clip)
    final_clip.write_videofile(final_output_video_with_audio, codec="libx264", audio_codec="aac")

    video_clip.close()
    audio_clip.close()
    print(f"Final video with audio saved to {final_output_video_with_audio}")
except Exception as e:
    print(f"Error merging audio: {e}")
    print("Please ensure 'ffmpeg' is installed and accessible in your system's PATH.")



Merging audio from ./Oscars_2020_Highlights_Oscars_2009_Kate_Montage.mp4 to ./output_with_boxes.mp4...
Moviepy - Building video ./output_final_with_audio.mp4.
MoviePy - Writing audio in output_final_with_audioTEMP_MPY_wvf_snd.mp4


                                                                     

MoviePy - Done.
Moviepy - Writing video ./output_final_with_audio.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready ./output_final_with_audio.mp4
Final video with audio saved to ./output_final_with_audio.mp4


In [6]:
# !ffmpeg -i "Oscars_2020_Highlights_Oscars_2009_Kate_Montage.mp4" -i "output_with_boxes.mp4" -c:v copy -c:a aac -map 1:v:0 -map 0:a:0 "final_output_with_audio_direct_ffmpeg.mp4"

In [7]:
print(reappearances)

reappearance of Generic Face at frame 10 with confidence 0.239 at timestamp = 00:00.
reappearance of Generic Face at frame 80 with confidence 0.128 at timestamp = 00:02.
reappearance of Generic Face at frame 81 with confidence 0.156 at timestamp = 00:02.
reappearance of Generic Face at frame 82 with confidence 0.198 at timestamp = 00:02.
reappearance of Tom Hanks at frame 127 with confidence 0.944 at timestamp = 00:04.
reappearance of Tom Hanks at frame 128 with confidence 0.950 at timestamp = 00:04.
reappearance of Tom Hanks at frame 129 with confidence 0.950 at timestamp = 00:04.
reappearance of Tom Hanks at frame 130 with confidence 0.939 at timestamp = 00:04.
reappearance of Tom Hanks at frame 131 with confidence 0.926 at timestamp = 00:04.
reappearance of Tom Hanks at frame 132 with confidence 0.913 at timestamp = 00:04.
reappearance of Tom Hanks at frame 144 with confidence 0.911 at timestamp = 00:04.
reappearance of Tom Hanks at frame 145 with confidence 0.922 at timestamp = 00: