In [1]:
import os
import cv2
import torch
import numpy as np
from torchvision import models, transforms
from PIL import Image
from sklearn.cluster import KMeans
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [2]:
# Load EfficientNet-B3 with its ImageNet weights. The `weights=` enum replaces the
# deprecated `pretrained=True` flag and selects the same checkpoint explicitly.
efficientnetb3 = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.IMAGENET1K_V1)

# Keep only the `features` sub-module as the frame encoder; the classifier head is
# not used. Note that this sub-module returns a spatial feature map, not a vector.
backbone = efficientnetb3.features

# Inference mode (fixes BatchNorm statistics). Left as the last expression so the
# cell still displays the module structure.
backbone.eval()



Sequential(
  (0): Conv2dNormActivation(
    (0): Conv2d(3, 40, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU(inplace=True)
  )
  (1): Sequential(
    (0): MBConv(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(40, 40, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=40, bias=False)
          (1): BatchNorm2d(40, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): SiLU(inplace=True)
        )
        (1): SqueezeExcitation(
          (avgpool): AdaptiveAvgPool2d(output_size=1)
          (fc1): Conv2d(40, 10, kernel_size=(1, 1), stride=(1, 1))
          (fc2): Conv2d(10, 40, kernel_size=(1, 1), stride=(1, 1))
          (activation): SiLU(inplace=True)
          (scale_activation): Sigmoid()
        )
        (2): Conv2dNormActivation(
          (0): Conv2d(40, 24, kernel_size=(1, 1), stride=(1, 1)

In [3]:
# Per-frame preprocessing: resize to 224x224, convert the PIL image to a tensor,
# then normalise each channel with the mean/std values given below.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=(0.485, 0.456, 0.406),
            std=(0.229, 0.224, 0.225),
        ),
    ]
)

In [4]:
def extract_features_from_video(video_path, backbone = backbone, transform = transform, device='cpu', sample_rate=5):
    """
    Extract one feature vector per sampled frame of a video.

    Every ``sample_rate``-th frame is converted BGR -> RGB, preprocessed with
    ``transform`` and passed through ``backbone``. If the backbone returns a
    4-D feature map ([1, C, H, W]) it is global-average-pooled over H and W so
    that each frame yields a single C-dimensional vector.

    Args:
        video_path (str): Path to video
        backbone (nn.Module): Feature extractor model (EfficientNet without classifier).
            It is moved to ``device`` before use.
        transform (callable): Image preprocessing transform (PIL image -> tensor)
        device (str): 'cuda' or 'cpu'
        sample_rate (int): Process every Nth frame

    Returns:
        torch.Tensor: Shape [num_sampled_frames, C] on the CPU (C = 1536 for the
        EfficientNet-B3 feature extractor), or an empty 1-D tensor
        (``torch.empty(0)``) if the video cannot be opened or yields no frames.
    """
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        cap.release()
        print(f"❌ Error: Cannot open video {video_path}")
        return torch.empty(0)

    # The input tensor is sent to `device`, so the model has to live there too;
    # otherwise any device other than the model's current one fails.
    if isinstance(backbone, torch.nn.Module):
        backbone = backbone.to(device)

    features = []
    idx = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if idx % sample_rate == 0:
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes as BGR
                img = Image.fromarray(rgb)
                tensor = transform(img).unsqueeze(0).to(device)  # [1,3,H,W]

                with torch.no_grad():
                    feat = backbone(tensor)
                    if isinstance(feat, tuple):
                        feat = feat[0]
                    if feat.dim() == 4:
                        # Convolutional feature map [1,C,h,w]: average over the
                        # spatial grid instead of flattening it into C*h*w values.
                        feat = feat.mean(dim=(2, 3))     # [1,C]
                    feat = feat.flatten(1).squeeze(0)    # [C]
                    features.append(feat.cpu())

            idx += 1
    finally:
        # Release the capture even if decoding or the forward pass raises.
        cap.release()

    if len(features) == 0:
        print(f"⚠️ Warning: No features extracted from {video_path}")
        return torch.empty(0)

    features = torch.stack(features)  # [num_sampled_frames, C]
    return features


In [5]:
class DSNAgent(nn.Module):
    """
    Frame-selection policy: an LSTM followed by a linear layer and a sigmoid.

    Args:
        input_dim (int): Size of each per-frame feature vector.
        hidden_dim (int): Size of the LSTM hidden state.
    """

    def __init__(self, input_dim=1536, hidden_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """
        Map a feature sequence to one selection probability per time step.

        Args:
            x (torch.Tensor): Features laid out batch-first, i.e.
                [batch, num_frames, input_dim].

        Returns:
            torch.Tensor: Values in (0, 1) with the trailing singleton dimension
            removed, i.e. [batch, num_frames].
        """
        hidden_states = self.lstm(x)[0]   # per-step outputs; final (h, c) state is discarded
        logits = self.fc(hidden_states)   # [..., 1]
        return self.sigmoid(logits).squeeze(-1)

In [6]:
class FeatureReducer(nn.Module):
    """
    Single linear projection from ``input_dim`` to ``output_dim`` features.

    Args:
        input_dim (int): Width of the incoming feature vectors.
        output_dim (int): Width of the projected feature vectors.
    """

    def __init__(self, input_dim, output_dim=1536):
        super().__init__()
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear projection to the last dimension of ``x``."""
        projected = self.linear(x)
        return projected

In [11]:
from scipy.io import loadmat

# Folder with the ground-truth .mat files, resolved against the working
# directory at the time this cell runs.
gt_dir = os.path.abspath("GT")

def reward_gent(selected_frames, threshold=0.5):
    """
    Score a frame selection against every ground-truth file in ``gt_dir``.

    For each ``.mat`` file that contains a ``gt_score`` array, the frames whose
    selection value is strictly greater than ``threshold`` are looked up in that
    array; the mean of their ground-truth scores is divided by the file's
    maximum score (over the compared range), and the per-file results are
    averaged. A file for which nothing is selected contributes 0.

    Args:
        selected_frames (torch.Tensor | array-like): Per-frame selection values
            indexed by frame position (e.g. 0/1 actions or probabilities).
            Only the first ``min(len(selected_frames), len(gt_score))`` entries
            are compared with each file.
        threshold (float): Cut-off above which a frame counts as selected.

    Returns:
        np.float64: Average normalised reward over the usable ground-truth
        files (0.0 when there are none). The result is always a NumPy scalar,
        so callers can rely on ``.item()`` regardless of which path was taken.
    """
    # Convert once, outside the per-file loop. Plain arrays/lists are accepted too.
    if isinstance(selected_frames, torch.Tensor):
        scores = selected_frames.detach().cpu().numpy()
    else:
        scores = np.asarray(selected_frames)

    GT_files = [f for f in os.listdir(gt_dir) if f.endswith(".mat")]
    total_reward = 0.0
    count = 0

    for gt_file in GT_files:
        data = loadmat(os.path.join(gt_dir, gt_file))
        if 'gt_score' not in data:
            print(f"⚠️ 'gt_score' not found in {gt_file}")
            continue

        # loadmat returns 2-D arrays; squeeze drops the singleton axis, and
        # atleast_1d keeps a single-element score usable with len()/slicing.
        gt_scores = np.atleast_1d(np.squeeze(data['gt_score'])).astype(float)
        n = min(len(scores), len(gt_scores))
        pred = scores[:n]
        true = gt_scores[:n]

        selected_idx = np.where(pred > threshold)[0]
        count += 1  # the file counts towards the average even if nothing is selected

        if len(selected_idx) == 0:
            continue

        gt_selected_mean = np.mean(true[selected_idx])

        # Guard against an all-zero ground truth to avoid dividing by zero.
        peak = np.max(true)
        max_true = peak if peak > 0 else 1
        total_reward += gt_selected_mean / max_true

    return np.float64(total_reward / max(count, 1))


In [12]:
def train_dsn(agent, video_features, optimizer, llambda=0.5):
    """
    Run one REINFORCE update of the selection agent on a single video.

    The agent's per-frame probabilities parameterise a Bernoulli distribution;
    one set of keep/drop actions is sampled, scored with ``reward_gent`` and the
    loss ``-(log_prob(actions) * reward).mean()`` is back-propagated.

    Args:
        agent (nn.Module): Policy mapping [1, num_frames, feature_dim] features
            to [1, num_frames] probabilities.
        video_features (torch.Tensor): [num_frames, feature_dim] features of one
            video (a 1-D tensor is treated as a single frame).
        optimizer (torch.optim.Optimizer): Optimizer that is stepped once.
        llambda (float): Scale factor applied to the reward.

    Returns:
        tuple[float, float]: ``(reward, loss)`` as Python floats; ``(0.0, 0.0)``
        when ``video_features`` is empty or a scalar (no update is performed).
    """
    agent.train()
    optimizer.zero_grad()

    # Ensure feature tensor is not empty
    if video_features.numel() == 0:
        return 0.0, 0.0

    # Make sure features are 2D: [num_frames, feature_dim]
    if video_features.dim() == 1:
        video_features = video_features.unsqueeze(0)
    if video_features.dim() == 0:
        return 0.0, 0.0

    # Run on whatever device the agent lives on (CPU if it has no parameters).
    first_param = next(agent.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Add batch dimension: [1, num_frames, feature_dim]
    video_features = video_features.unsqueeze(0).to(device)

    # Forward pass
    probs = agent(video_features)                 # [1, num_frames]
    m = torch.distributions.Bernoulli(probs)
    actions = m.sample()                          # [1, num_frames]

    # Score the sampled selection. reward_gent thresholds a per-frame vector and
    # uses the positions as frame indices, so it must receive the 0/1 action
    # vector itself, not the feature rows of the selected frames.
    # NOTE(review): positions index the *sampled* frames; confirm the ground-truth
    # scores use the same frame sampling.
    if bool((actions[0] == 1).any()):
        b = reward_gent(actions[0])
        reward = float(llambda * b)
    else:
        reward = 0.0

    # REINFORCE loss
    log_probs = m.log_prob(actions)
    loss = -(log_probs * reward).mean()

    loss.backward()
    optimizer.step()

    # `reward` is already a plain float, so no .item() is needed (reward_gent may
    # hand back either a NumPy scalar or a Python float).
    return reward, loss.item()

In [13]:
video_folder = "videos/"

# os.listdir returns entries in arbitrary order; sort so the processing order is
# reproducible between runs. Extensions are matched case-insensitively so files
# such as "clip.MP4" are not silently ignored.
video_list = sorted(
    f for f in os.listdir(video_folder)
    if f.lower().endswith((".mp4", ".avi", ".mov"))
)

# Initialize the selection agent and its optimizer (only the agent's parameters are optimized)
agent = DSNAgent(input_dim=1536, hidden_dim=256).to("cpu")
optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)

In [14]:
num_epochs = 50

# Neither the feature extractor nor the reducer is trained here: `optimizer` was
# built from agent.parameters() only. Each video's reduced features are therefore
# identical in every epoch, so compute them once up front instead of decoding and
# encoding every video again in each of the `num_epochs` passes.
example_feat = None
reducer = None
cached_features = []  # (video name, reduced feature tensor) pairs

for vid_name in video_list:
    vid_path = os.path.join(video_folder, vid_name)

    # Extract features
    raw_features = extract_features_from_video(vid_path).to("cpu")

    # Unreadable / empty videos come back as an empty tensor; feeding that to the
    # reducer would fail with a shape mismatch, so skip them.
    if raw_features.numel() == 0:
        print(f"⚠️ Skipping {vid_name}: no features extracted")
        continue
    if raw_features.dim() == 1:
        raw_features = raw_features.unsqueeze(0)

    # Size the reducer from the first usable video.
    if reducer is None:
        example_feat = raw_features
        input_dim = example_feat.shape[1]
        reducer = FeatureReducer(input_dim=input_dim, output_dim=1536).to("cpu")

    # Reduce feature dimension. no_grad: the reducer is a fixed projection, so
    # tracking gradients through it would only cost memory and time.
    with torch.no_grad():
        cached_features.append((vid_name, reducer(raw_features)))

if not cached_features:
    raise RuntimeError(f"No usable videos found in {video_folder!r}")

for epoch in range(num_epochs):
    total_reward = 0.0
    total_loss = 0.0

    for vid_name, video_features in cached_features:
        # Train DSN
        rewards, loss = train_dsn(agent, video_features, optimizer)
        total_reward += rewards
        total_loss += loss

    # Average over the videos that actually took part in training.
    avg_reward = total_reward / len(cached_features)
    avg_loss = total_loss / len(cached_features)
    print(f"Epoch [{epoch+1}/{num_epochs}] | Reward: {avg_reward:.4f} | Loss: {avg_loss:.4f}")

KeyboardInterrupt: 