In [27]:
import os
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from sklearn.cluster import KMeans
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [28]:
efficientnetb3 = models.efficientnet_b3(pretrained = True)
backbone = efficientnetb3.features
backbone.eval()



Sequential(
  (0): Conv2dNormActivation(
    (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU(inplace=True)
  )
  (1): Sequential(
    (0): MBConv(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
          (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (avgpool): AdaptiveAvgPool2d(output_size=1)
          (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
          (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
          (activation): SiLU(inplace=True)
          (scale_activation): Sigmoid()
        )
        (2): Conv2dNormActivation(
          (0): Conv2d(40, 24, kernel_size=(1, 1), stride=(1, 1)

In [29]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [30]:
def extract_features_from_video(video_path, backbone = backbone, transform = transform, device='cpu', sample_rate=5):
    """
    Extract EfficientNet-B3 features from frames of a video.

    Args:
        video_path (str): Path to video
        backbone (nn.Module): Feature extractor model (EfficientNet without classifier)
        transform (callable): Image preprocessing transform
        device (str): 'cuda' or 'cpu'
        sample_rate (int): Process every Nth frame

    Returns:
        torch.Tensor: Shape [num_frames, 1536]
    """
    cap = cv2.VideoCapture(video_path)
    features = []
    idx = 0

    if not cap.isOpened():
        print(f"❌ Error: Cannot open video {video_path}")
        return torch.empty(0)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if idx % sample_rate == 0:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            img = Image.fromarray(rgb)
            tensor = transform(img).unsqueeze(0).to(device)  # [1,3,H,W]

            with torch.no_grad():
                feat = backbone(tensor)  # [1,1536,1,1]
                if isinstance(feat, tuple):
                    feat = feat[0]
                feat = feat.view(feat.size(0), -1)  # [1,1536]
                feat = feat.squeeze(0)              # [1536]
                features.append(feat.cpu())

        idx += 1

    cap.release()

    if len(features) == 0:
        print(f"⚠️ Warning: No features extracted from {video_path}")
        return torch.empty(0)

    features = torch.stack(features)  # [num_frames, 1536]
    return features


In [31]:
class DSNAgent(nn.Module):
    def __init__(self, input_dim=1536, hidden_dim=256):
        super(DSNAgent, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()  

    def forward(self, x):
        h, _ = self.lstm(x)
        probs = self.sigmoid(self.fc(h))
        return probs.squeeze(-1)

In [32]:
class FeatureReducer(nn.Module):
    def __init__(self, input_dim, output_dim=1536):
        super(FeatureReducer, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim)
    
    def forward(self, x):
        return self.linear(x)

In [33]:
def compute_diversity_reward(selected_features):
    """
    Computes diversity reward: encourages selecting dissimilar frames.
    Input: selected_features [K, feature_dim] (tensor)
    Output: scalar tensor (on same device)
    """
    if selected_features.size(0) <= 1:
        return torch.tensor(0.0, device=selected_features.device)

    # Normalize features
    norm_features = F.normalize(selected_features, p=2, dim=1)  # [K, feature_dim]

    # Cosine similarity matrix
    sim_matrix = torch.matmul(norm_features, norm_features.t())  # [K, K]

    # Diversity = 1 - mean similarity
    diversity = 1 - sim_matrix.mean()

    return diversity


In [34]:
def compute_representativeness_reward(video_features, selected_features):
    """
    Computes representativeness reward: encourages selected frames
    to be close to all video frames.
    Inputs:
        video_features: [num_frames, feature_dim] tensor
        selected_features: [K, feature_dim] tensor
    Returns:
        scalar tensor on same device
    """
    if selected_features.size(0) == 0:
        return torch.tensor(0.0, device=video_features.device)

    # Compute pairwise distances
    distances = torch.cdist(video_features, selected_features)  # [num_frames, K]

    # For each frame, take distance to nearest selected frame
    min_dist, _ = torch.min(distances, dim=1)

    # Reward: exp(-average distance)
    representativeness = torch.exp(-min_dist.mean())

    return representativeness


In [35]:
def train_dsn(agent, video_features, optimizer, lambda_div=0.5, lambda_rep=0.5):
    agent.train()
    optimizer.zero_grad()

    # Ensure feature tensor is not empty
    if video_features.numel() == 0:
        return 0.0, 0.0

    # Make sure features are 2D: [num_frames, feature_dim]
    if video_features.dim() == 1:
        video_features = video_features.unsqueeze(0)
    if video_features.dim() == 0:
        return 0.0, 0.0

    # Add batch dimension: [1, num_frames, feature_dim]
    video_features = video_features.unsqueeze(0).to("cpu")

    # Forward pass
    probs = agent(video_features)                 # [1, num_frames]
    m = torch.distributions.Bernoulli(probs)
    actions = m.sample()                          # [1, num_frames]

    # Select frames
    selected_indices = (actions[0] == 1).nonzero(as_tuple=True)[0]
    if len(selected_indices) > 0:
        selected_features = video_features[0, selected_indices]
        R_div = compute_diversity_reward(selected_features)
        R_rep = compute_representativeness_reward(video_features[0], selected_features)
        reward = lambda_div * R_div + lambda_rep * R_rep
    else:
        reward = torch.tensor(0.0, device=video_features.device)

    # REINFORCE loss
    log_probs = m.log_prob(actions)
    loss = -(log_probs * reward).mean()

    loss.backward()
    optimizer.step()

    return reward.item(), loss.item()

In [36]:
#agent = DSNAgent(input_dim=1536).cuda()
video_folder = "videos/"
video_list = [f for f in os.listdir(video_folder) if f.endswith((".mp4", ".avi", ".mov"))]

# Initialize agent, feature reducer, optimizer
agent = DSNAgent(input_dim=1536, hidden_dim=256).to("cpu")
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)


In [37]:
# Create a single reducer for all videos
example_feat = extract_features_from_video(os.path.join(video_folder, video_list[0]))
input_dim = example_feat.shape[1] if example_feat.dim() > 1 else 1
reducer = FeatureReducer(input_dim=input_dim, output_dim=1536).to(device)

num_epochs = 50

for epoch in range(num_epochs):
    total_reward = 0.0
    total_loss = 0.0

    for vid_name in video_list:
        vid_path = os.path.join(video_folder, vid_name)

        # Extract features
        video_features = extract_features_from_video(vid_path).to("cpu")

        # Reduce feature dimension
        if video_features.dim() == 1:
            video_features = video_features.unsqueeze(0)
        video_features = reducer(video_features)

        # Train DSN
        reward, loss = train_dsn(agent, video_features, optimizer)
        total_reward += reward
        total_loss += loss

    avg_reward = total_reward / len(video_list)
    avg_loss = total_loss / len(video_list)
    print(f"Epoch [{epoch+1}/{num_epochs}] | Reward: {avg_reward:.4f} | Loss: {avg_loss:.4f}")

KeyboardInterrupt: 

In [93]:
video_features = extract_features_from_video("Videos/Air_Force_One.mp4")
print(video_features.type())
print(video_features)

torch.FloatTensor
tensor([[-0.0745,  0.0713, -0.0109,  ..., -0.0025, -0.0209, -0.0068],
        [-0.0485, -0.1234, -0.0426,  ..., -0.0022, -0.0196, -0.0065],
        [-0.0424, -0.0921,  0.0458,  ..., -0.0023, -0.0184, -0.0077],
        ...,
        [-0.0334, -0.0692, -0.0901,  ..., -0.0034, -0.0282, -0.0149],
        [-0.0298, -0.1075, -0.1031,  ..., -0.0030, -0.0230, -0.0141],
        [-0.0212, -0.1196, -0.1218,  ..., -0.0026, -0.0201, -0.0111]])


In [95]:
video_folder = "videos/"
video_list = [f for f in os.listdir(video_folder) if f.endswith((".mp4", ".avi", ".mov"))]
print(video_list)

['Air_Force_One.mp4', 'Base jumping.mp4', 'Bearpark_climbing.mp4', 'Bike Polo.mp4', 'Bus_in_Rock_Tunnel.mp4', 'car_over_camera.mp4', 'Car_railcrossing.mp4', 'Cockpit_Landing.mp4', 'Cooking.mp4', 'Eiffel Tower.mp4', 'Excavators river crossing.mp4', 'Fire Domino.mp4', 'Jumps.mp4', 'Kids_playing_in_leaves.mp4', 'Notre_Dame.mp4', 'Paintball.mp4', 'paluma_jump.mp4', 'playing_ball.mp4', 'Playing_on_water_slide.mp4', 'Saving dolphines.mp4', 'Scuba.mp4', 'St Maarten Landing.mp4', 'Statue of Liberty.mp4', 'Uncut_Evening_Flight.mp4', 'Valparaiso_Downhill.mp4']


In [None]:
torch.save(agent.state_dict(), "dsn_model.pth")
# later...
agent.load_state_dict(torch.load("dsn_model.pth"))

agent.eval()
features = extract_features_from_video("test_video.mp4").cuda().unsqueeze(0)
probs = agent(features).squeeze(0)
selected = (probs > 0.5).nonzero(as_tuple=True)[0]  # indices of selected frames
