In [3]:
import os
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from sklearn.cluster import KMeans
import random
import torch.nn as nn

In [4]:
# Load torchvision's MC3-18 video network with its pretrained weights
# (downloaded to the local torch hub cache on first use).
model = models.video.mc3_18(pretrained=True)
# Swap the final fully connected layer for an identity so a forward pass
# returns the features that would feed that layer, not the layer's output.
model.fc = nn.Identity()
# Put the network in evaluation mode. As the last expression of the cell,
# this call's return value is also what the notebook displays.
model.eval()



Downloading: "https://download.pytorch.org/models/mc3_18-a90a0ba3.pth" to C:\Users\KIIT/.cache\torch\hub\checkpoints\mc3_18-a90a0ba3.pth


100.0%


VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

In [5]:
# Per-image preprocessing pipeline: resize to 224x224, convert to a tensor,
# then normalise each channel with the mean/std given below.
# NOTE(review): none of the cells visible here reference `transform` --
# confirm whether it is still needed.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
def extract_features_3d(video_path, clip_len=32, stride=16):
    """Extract one feature vector per overlapping clip of a video.

    Every frame is decoded, converted from BGR to RGB and resized to
    112x112. Clips of ``clip_len`` consecutive frames are taken every
    ``stride`` frames and passed through the module-level ``model``.

    Args:
        video_path: Path of the video file to read with OpenCV.
        clip_len: Number of frames per clip (must be >= 1).
        stride: Frame offset between the starts of consecutive clips
            (must be >= 1).

    Returns:
        A tuple ``(features, key_frames)``:
        ``features`` is an array with one flattened model output per clip;
        ``key_frames`` is a list holding, for each clip, the 112x112 RGB
        frame at offset ``clip_len // 2`` inside that clip.

    Raises:
        ValueError: If ``clip_len`` or ``stride`` is not positive, the video
            cannot be opened, or it has fewer than ``clip_len`` frames.
    """
    if clip_len < 1 or stride < 1:
        raise ValueError("clip_len and stride must be positive integers.")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        # Report an unreadable file explicitly instead of as "too short".
        if not cap.isOpened():
            raise ValueError(f"Could not open video: {video_path}")

        # --- Read all frames ---
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append(cv2.resize(frame, (112, 112)))
    finally:
        # Always free the decoder handle, even if a frame conversion fails.
        cap.release()

    if len(frames) < clip_len:
        raise ValueError("Video too short for one full clip.")

    # Channel statistics documented by torchvision for its pretrained video
    # classification models (Kinetics-400). Shaped to broadcast over a
    # (C, T, H, W) clip.
    mean = torch.tensor([0.43216, 0.394666, 0.37645]).view(3, 1, 1, 1)
    std = torch.tensor([0.22803, 0.22145, 0.216989]).view(3, 1, 1, 1)

    features, key_frames = [], []

    # --- Form overlapping clips ---
    with torch.no_grad():
        for start in range(0, len(frames) - clip_len + 1, stride):
            # (T, H, W, C) uint8 -> (C, T, H, W) float in [0, 1]
            clip_np = np.stack(frames[start:start + clip_len])
            clip = torch.from_numpy(clip_np).permute(3, 0, 1, 2).float() / 255.0
            # Normalise, then add the batch dimension: [1, 3, T, H, W]
            clip = ((clip - mean) / std).unsqueeze(0)

            features.append(model(clip).cpu().numpy().flatten())

            # Save middle frame as representative for this clip
            key_frames.append(frames[start + clip_len // 2])

    return np.array(features), key_frames

In [9]:
from sklearn.preprocessing import StandardScaler

def get_keyframes(features, frames, k, normalize=True):
    """Pick one representative frame per K-Means cluster of the features.

    The rows of ``features`` are clustered into ``k`` groups. For each
    non-empty cluster, the row closest (Euclidean distance) to the cluster
    centre is selected, and the frame at the same position in ``frames`` is
    returned.

    Args:
        features: 2-D array-like with one feature row per candidate frame.
        frames: Sequence of candidate frames, indexable by feature row number.
        k: Requested number of clusters. Values larger than the number of
            feature rows are reduced to that number.
        normalize: If True, standardise the features with ``StandardScaler``
            before clustering.

    Returns:
        A tuple ``(keyframes, keyframe_indices)`` where ``keyframe_indices``
        is an ascending list of plain ``int`` positions into ``frames`` and
        ``keyframes`` holds the corresponding frames in the same order.

    Raises:
        ValueError: If ``features`` is not a non-empty 2-D array or ``k`` is
            smaller than 1.
    """
    features = np.asarray(features)
    if features.ndim != 2 or features.shape[0] == 0:
        raise ValueError("features must be a non-empty 2-D array.")
    if k < 1:
        raise ValueError("k must be at least 1.")

    # K-Means cannot form more clusters than there are samples.
    k = min(int(k), features.shape[0])

    if normalize:
        features = StandardScaler().fit_transform(features)

    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(features)
    labels = kmeans.labels_

    # Distance of every sample to the centre of the cluster it belongs to.
    distances = np.linalg.norm(features - kmeans.cluster_centers_[labels], axis=1)

    keyframe_indices = []
    for cluster_id in range(k):
        members = np.flatnonzero(labels == cluster_id)
        if members.size == 0:
            continue
        keyframe_indices.append(int(members[np.argmin(distances[members])]))

    # Return keyframes in their original temporal order.
    keyframe_indices.sort()
    keyframes = [frames[i] for i in keyframe_indices]
    return keyframes, keyframe_indices

In [10]:
# Directory that is scanned for input videos.
video_folder = "videos/"
# Root directory under which one sub-folder of keyframes per video is written.
output_root = "keyframes4/"
# Create the output root up front; an already existing directory is fine.
os.makedirs(output_root, exist_ok=True)

In [11]:
import os
import cv2
import random
import numpy as np



# Clip sampling parameters: used for feature extraction and for converting
# clip positions back into video frame numbers.
CLIP_LEN = 32
STRIDE = 16

# Dedicated, seeded generator so the randomly chosen number of keyframes is
# reproducible from run to run.
k_rng = random.Random(0)

# Sort for a deterministic processing order and match the extension
# case-insensitively so files such as "clip.MP4" are not silently ignored.
video_files = sorted(
    f for f in os.listdir(video_folder)
    if f.lower().endswith((".mp4", ".avi", ".mov"))
)

for video_file in video_files:
    video_path = os.path.join(video_folder, video_file)
    print(f"\n🎬 Processing {video_file}...")

    try:
        # Extract spatio-temporal features using the 3D CNN
        features, frames = extract_features_3d(
            video_path, clip_len=CLIP_LEN, stride=STRIDE
        )
    except Exception as e:
        print(f"⚠️ Skipped {video_file} ({e})")
        continue

    if len(frames) == 0 or len(features) == 0:
        print(f"⚠️ Skipped {video_file} (no valid clips)")
        continue

    # Choose number of clusters (keyframes): 5-15% of the clips, at least 1
    K = max(1, int(k_rng.uniform(0.05, 0.15) * len(frames)))

    # Select keyframes based on features
    keyframes, clip_indices = get_keyframes(features, frames, K)

    # get_keyframes returns positions in the per-clip list. Clip c starts at
    # frame c * STRIDE and its representative is CLIP_LEN // 2 frames later,
    # so convert to real (0-based) video frame numbers before saving.
    indices = [int(c) * STRIDE + CLIP_LEN // 2 for c in clip_indices]

    # Prepare output folder
    base_name = os.path.splitext(video_file)[0]
    out_dir = os.path.join(output_root, base_name)
    os.makedirs(out_dir, exist_ok=True)

    # Save keyframes
    saved = 0
    for i, (frame, idx) in enumerate(zip(keyframes, indices)):
        frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        out_path = os.path.join(out_dir, f"keyframe_{i+1}_frame{idx}.jpg")
        # cv2.imwrite reports failure through its return value, not an exception.
        if cv2.imwrite(out_path, frame_bgr):
            saved += 1
        else:
            print(f"⚠️ Could not write {out_path}")
    print(f"✅ Saved {saved} keyframes to {out_dir}")

    # Save keyframe indices (one video frame number per line)
    txt_path = os.path.join(out_dir, f"{base_name}_indices.txt")
    with open(txt_path, "w") as f:
        for idx in indices:
            f.write(str(idx) + "\n")
    print(f"🗒️ Saved indices to {txt_path}")


🎬 Processing Air_Force_One.mp4...


  clip_tensor = torch.tensor(clip).permute(3, 0, 1, 2).unsqueeze(0) / 255.0  # [1,3,T,H,W]


KeyboardInterrupt: 