## Optimized version

In [2]:
import os
import csv
import numpy as np
import pandas as pd
import cv2
from PIL import Image
import torchvision.transforms as T
from skimage.metrics import structural_similarity as ssim
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import CLIPProcessor, CLIPModel

In [3]:
def get_timestamp(frame_idx, fps):
    """Format the playback position of a frame as ``HH:MM:SS.mmm``.

    Args:
        frame_idx: Index of the frame within the video.
        fps: Frame rate used to convert the index to seconds (must be non-zero).

    Returns:
        The position ``frame_idx / fps`` as a string, rounded to the nearest
        millisecond.
    """
    # Work in whole milliseconds. Taking the fractional part with a float
    # subtraction and truncating turns e.g. 2.3 s into ".299" instead of ".300".
    total_ms = int(round(frame_idx / fps * 1000))
    total_s, ms = divmod(total_ms, 1000)
    total_m, s = divmod(total_s, 60)
    h, m = divmod(total_m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"

In [4]:
def process_video(video_path, interval_sec=3):
    """Sample one frame every ``interval_sec`` seconds from a video file.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        interval_sec: Time between sampled frames, in seconds. Must be positive.

    Returns:
        A ``(records, fps)`` tuple. ``records`` is a list of
        ``(frame, frame_idx)`` pairs in increasing frame order; ``fps`` is the
        frame rate reported by the capture, or 30 when it reports no usable
        value.

    Raises:
        ValueError: If ``interval_sec`` is not positive.
    """
    if not interval_sec > 0:
        raise ValueError(f"interval_sec must be positive, got {interval_sec!r}")

    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN and negative rates, not just 0.
        if not fps > 0:
            fps = 30
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # For intervals shorter than one frame int() yields 0, and a zero step
        # makes range() raise; fall back to sampling every frame.
        frame_interval = max(1, int(fps * interval_sec))

        records = []
        for frame_idx in range(0, total_frames, frame_interval):
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
            ret, frame = cap.read()
            if not ret:
                # Stop at the first frame that cannot be read.
                break
            records.append((frame, frame_idx))
    finally:
        # Release the capture even if seeking or decoding raised.
        cap.release()

    return records, fps

In [5]:
def filter_keyframes(records, hash_threshold=5, ssim_threshold=0.90, ssim_compare_window=3):
    """Drop near-duplicate frames using a perceptual hash, then SSIM.

    A frame is discarded when either:
      * its PHash is within ``hash_threshold`` Hamming distance of any hash
        seen so far, or
      * its 128x128 grayscale thumbnail has SSIM greater than
        ``ssim_threshold`` with any of the last ``ssim_compare_window`` kept
        frames.

    Args:
        records: Iterable of ``(frame, frame_idx)`` pairs; ``frame`` is a BGR image.
        hash_threshold: Maximum Hamming distance still treated as a duplicate.
        ssim_threshold: SSIM above this value marks a frame as redundant.
        ssim_compare_window: How many of the most recently kept frames to
            compare against. A value <= 0 disables the SSIM stage.

    Returns:
        The list of ``(frame, frame_idx)`` pairs that survived both checks,
        in their original order.
    """
    hasher = cv2.img_hash.PHash_create()
    seen_hashes = []
    distinct_records = []
    # Thumbnails of the kept frames, parallel to distinct_records, so earlier
    # frames are not re-converted and re-resized on every comparison.
    distinct_thumbs = []

    for frame, frame_idx in records:
        img_hash = hasher.compute(frame)
        if any(cv2.norm(img_hash, h, cv2.NORM_HAMMING) <= hash_threshold for h in seen_hashes):
            continue
        # The hash is remembered even when the SSIM stage rejects the frame below.
        seen_hashes.append(img_hash)

        # Build the thumbnail only for frames that passed the cheaper hash check.
        thumb = cv2.resize(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), (128, 128))

        # A plain `[-0:]` slice would select the WHOLE list, so a non-positive
        # window is handled explicitly as "compare against nothing".
        recent_thumbs = distinct_thumbs[-ssim_compare_window:] if ssim_compare_window > 0 else []
        if any(ssim(thumb, prev_thumb) > ssim_threshold for prev_thumb in recent_thumbs):
            continue

        distinct_records.append((frame, frame_idx))
        distinct_thumbs.append(thumb)

    return distinct_records

In [None]:
# Load the CLIP image preprocessor and the pretrained model once for the whole
# notebook; the model is switched to evaluation mode because it is only used
# for inference here.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()

def get_clip_embedding(frame):
    """Return the L2-normalized CLIP image embedding of a single BGR frame.

    Args:
        frame: Image array in OpenCV's BGR channel order.

    Returns:
        A NumPy vector holding the unit-length image features produced by the
        module-level ``clip_model``.
    """
    # The processor is handed a PIL image in RGB order, so convert from BGR first.
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    model_inputs = clip_processor(images=pil_image, return_tensors="pt", padding=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        features = clip_model.get_image_features(**model_inputs)
        unit_features = torch.nn.functional.normalize(features, p=2, dim=1)

    # Batch of one: return its single row.
    return unit_features[0].cpu().numpy()

def filter_keyframes_clip(records, similarity_threshold=0.85, compare_window=5):
    """Drop frames whose CLIP embedding is too close to a recently kept frame.

    Args:
        records: Iterable of ``(frame, frame_idx)`` pairs.
        similarity_threshold: A frame is discarded when its cosine similarity
            to any compared embedding exceeds this value.
        compare_window: How many of the most recently kept embeddings to
            compare against. A value <= 0 disables the comparison, so every
            frame is kept.

    Returns:
        The list of ``(frame, frame_idx)`` pairs that were kept, in their
        original order.
    """
    distinct_records = []
    past_embeddings = []  # embeddings of kept frames, parallel to distinct_records

    for frame, frame_idx in records:
        emb = get_clip_embedding(frame)

        # A plain `[-0:]` slice would select the WHOLE list, so a non-positive
        # window is handled explicitly as "compare against nothing".
        recent_embeddings = past_embeddings[-compare_window:] if compare_window > 0 else []
        is_redundant = any(
            cosine_similarity([emb], [prev_emb])[0][0] > similarity_threshold
            for prev_emb in recent_embeddings
        )
        if is_redundant:
            continue

        distinct_records.append((frame, frame_idx))
        past_embeddings.append(emb)

    return distinct_records


In [7]:
def save_records_to_disk(records, output_dir, output_csv, fps):
    """Write keyframes as JPEG files and index them in a CSV.

    Args:
        records: Iterable of ``(frame, frame_idx)`` pairs to save.
        output_dir: Directory that receives ``keyframe_NNNN.jpg`` files;
            created if missing.
        output_csv: Path of the CSV index (columns ``keyframe``, ``timestamp``);
            its parent directory is created if missing.
        fps: Frame rate used to turn each ``frame_idx`` into a timestamp.

    Returns:
        A DataFrame read back from the CSV that was just written.

    Raises:
        OSError: If an image cannot be written, so the index never lists a
            file that is not on disk.
    """
    os.makedirs(output_dir, exist_ok=True)
    # The CSV may live outside output_dir; make sure its folder exists too.
    csv_dir = os.path.dirname(output_csv)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)

    # Explicit UTF-8: the locale default used by open() can differ from the
    # UTF-8 default pd.read_csv uses when reading the file back.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["keyframe", "timestamp"])

        for i, (frame, frame_idx) in enumerate(records):
            frame_name = f"keyframe_{i:04d}.jpg"
            out_path = os.path.join(output_dir, frame_name)
            # cv2.imwrite reports failure through its return value; check it
            # instead of indexing an image that was never written.
            if not cv2.imwrite(out_path, frame, [int(cv2.IMWRITE_JPEG_QUALITY), 85]):
                raise OSError(f"Failed to write keyframe image: {out_path}")
            timestamp = get_timestamp(frame_idx, fps)
            writer.writerow([out_path, timestamp])

    return pd.read_csv(output_csv, encoding='utf-8')

In [8]:
# Default locations for the saved keyframes and their CSV index.
# NOTE(review): the per-video loop in this notebook assigns its own values to
# both names before saving, so these defaults look unused — confirm.
output_dir, output_csv = "keyframes", "keyframes.csv"

In [10]:
# Run the whole pipeline on every file in raw_videos/.
# os.listdir() returns entries in arbitrary order; sorting keeps the run order
# and the printed log the same across machines and re-runs.
for v in sorted(os.listdir("raw_videos")):
    video_path = os.path.join("raw_videos", v)
    if not os.path.isfile(video_path):
        # Skip sub-directories.
        continue

    # Stage 1: sample one frame every 3 seconds.
    print(f"Processing video: {video_path}")
    records, fps = process_video(video_path, interval_sec=3)
    print(f"Extracted {len(records)} frames from video at {fps} FPS.")

    # Stage 2: pixel-level filtering (perceptual hash + SSIM).
    print("Filtering keyframes using hash and SSIM...")
    filtered_records = filter_keyframes(records, hash_threshold=5, ssim_threshold=0.95, ssim_compare_window=5)

    # Stage 3: embedding-level filtering on the survivors of stage 2.
    print("Filtering keyframes using CLIP...")
    final_records = filter_keyframes_clip(filtered_records, similarity_threshold=0.90, compare_window=5)

    # Stage 4: each video gets its own folder (named after the file without
    # its extension) holding the JPEGs and a keyframes.csv index.
    print(f"Saving {len(final_records)} keyframes to disk...\n")
    output_dir = os.path.join("keyframes", os.path.splitext(v)[0])
    output_csv = os.path.join(output_dir, "keyframes.csv")
    save_records_to_disk(final_records, output_dir, output_csv, fps)

Processing video: raw_videos\Dr. Mohamed Ismail (720p, h264).mp4
Extracted 104 frames from video at 29.97002997002997 FPS.
Filtering keyframes using hash and SSIM...
Filtering keyframes using CLIP...
Saving 16 keyframes to disk...

Processing video: raw_videos\Filters - Mohammad Ayed (720p, h264).mp4
Extracted 441 frames from video at 29.97002997002997 FPS.
Filtering keyframes using hash and SSIM...
Filtering keyframes using CLIP...
Saving 20 keyframes to disk...

Processing video: raw_videos\Linear Regression - Hesham Asem (720p, h264).mp4
Extracted 394 frames from video at 25.0 FPS.
Filtering keyframes using hash and SSIM...
Filtering keyframes using CLIP...
Saving 10 keyframes to disk...

Processing video: raw_videos\Perceptual Hashing To Compare Images Explained - Cryptography for Everybody (720p, h264).mp4
Extracted 353 frames from video at 29.97002997002997 FPS.
Filtering keyframes using hash and SSIM...
Filtering keyframes using CLIP...
Saving 18 keyframes to disk...

