In [1]:
import torch
import numpy as np
import cv2
from PIL import Image
from torchvision import transforms
from argparse import Namespace
from modules.tokenization_clip import SimpleTokenizer as ClipTokenizer
from main_xclip import init_model 

In [14]:
# Function to prepare video frames from a real video file
def prepare_video(video_path, num_frames=8, height=224, width=224):
    cap = cv2.VideoCapture(video_path)
    
    # Get video properties
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    
    # Select frames at regular intervals
    frame_indices = np.linspace(0, total_frames-1, num_frames, dtype=int)
    
    # Prepare transformation
    transform = transforms.Compose([
        transforms.Resize((height, width)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], 
                             std=[0.26862954, 0.26130258, 0.27577711])
    ])
    
    # Extract frames
    frames = []
    for idx in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if ret:
            # Convert from BGR to RGB
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Convert to PIL Image
            pil_img = Image.fromarray(frame)
            # Apply transformations
            img_tensor = transform(pil_img)
            frames.append(img_tensor)
    
    cap.release()
    
    # Stack frames into a single tensor [num_frames, channels, height, width]
    video_tensor = torch.stack(frames)
    
    # Reshape to match model input requirements [batch, pair, bs, ts, channels, height, width]
    video_tensor = video_tensor.unsqueeze(0).unsqueeze(0).unsqueeze(0)
    video_tensor = video_tensor.permute(0, 1, 2, 3, 4, 5, 6)
    
    return video_tensor

# Function to prepare text input
def prepare_text(text, tokenizer, max_words=20):
    tokens = tokenizer.encode(text)  # Use the appropriate method to tokenize the text
    tokens = tokens[:max_words]  # Truncate tokens to max_words
    padding_length = max_words - len(tokens)
    pad_token_id = 0  # Define a padding token ID (commonly 0 or another value)
    tokens += [pad_token_id] * padding_length  # Pad tokens to max_words
    attention_mask = [1] * len(tokens) + [0] * padding_length
    return torch.tensor([tokens]), torch.tensor([attention_mask])

# Main code
args = Namespace(
    # Required arguments
    output_dir='./output',
    cross_model='cross-base',
    local_rank=0,
    # Other necessary arguments with default values
    task_type='retrieval',
    datatype='msrvtt',
    pretrained_clip_name='ViT-B/32',
    max_words=20,
    max_frames=8,
    sim_header="meanP",
    loose_type=True,
    n_gpu=1,
    cache_dir='',
    init_model=None,
)

# Initialize device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()

# Initialize tokenizer
tokenizer = ClipTokenizer()

# Initialize the model
model = init_model(args, device, n_gpu, args.local_rank)

# Put the model in evaluation mode
model.eval()

# Configure the model to use meanP and loose_type=True
model.sim_header = "meanP"
model.loose_type = True

# Path to your video file
video_path = "/home/s3705609/data1/VATEX/data/_-GgBjU0XMk_000050_000060.mp4"  # Replace with actual path

# Your text query
text_query = "A cat is sleeping"  # Replace with actual text

# Prepare the inputs
video_tensor = prepare_video(video_path)
input_ids, attention_mask = prepare_text(text_query, tokenizer)

# Move tensors to device
video_tensor = video_tensor.to(device)
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
segment_ids = torch.zeros_like(input_ids).to(device)
video_mask = torch.ones(1, args.max_frames).to(device)

# Function to compute similarity
def compute_similarity_manually(model, input_ids, segment_ids, input_mask, video, video_mask):
    # First, get text features
    sequence_hidden, seq_features = model.clip.encode_text(input_ids, return_hidden=True)
    sequence_hidden, seq_features = sequence_hidden.float(), seq_features.float()
    sequence_hidden = sequence_hidden.view(input_ids.size(0), -1, sequence_hidden.size(-1))
    
    # Reshape video for processing
    b, pair, bs_vid, ts, c, h, w = video.shape
    video_reshaped = video.view(b * pair * bs_vid * ts, c, h, w)
    
    # Get visual features
    visual_hidden = model.clip.encode_image(video_reshaped).float()
    visual_hidden = visual_hidden.view(b, -1, visual_hidden.size(-1))
    
    # Normalize features
    sentence_output = sequence_hidden.squeeze(1)
    sentence_output = sentence_output / sentence_output.norm(dim=-1, keepdim=True)
    
    video_output = visual_hidden
    video_output = video_output / video_output.norm(dim=-1, keepdim=True)
    video_mask_un = video_mask.to(dtype=torch.float).unsqueeze(-1)
    video_output = video_output * video_mask_un
    video_mask_un_sum = torch.sum(video_mask_un, dim=1, dtype=torch.float)
    video_mask_un_sum[video_mask_un_sum == 0.] = 1.
    video_output = torch.sum(video_output, dim=1) / video_mask_un_sum
    video_output = video_output / video_output.norm(dim=-1, keepdim=True)
    
    # Compute simple cosine similarity
    similarity = torch.matmul(sentence_output, video_output.t()) * model.clip.logit_scale.exp()
    
    return similarity

# Compute similarity score
with torch.no_grad():
    similarity = compute_similarity_manually(model, input_ids, segment_ids, attention_mask, video_tensor, video_mask)
    print(f"Similarity score for '{text_query}':", similarity.item())

Stage-One:True, Stage-Two:False
Test retrieval by loose type.
	 embed_dim: 512
	 image_resolution: 224
	 vision_layers: 12
	 vision_width: 768
	 vision_patch_size: 32
	 context_length: 77
	 vocab_size: 49408
	 transformer_width: 512
	 transformer_heads: 8
	 transformer_layers: 12
	 cut_top_layer: 0
	 sim_header: meanP
[aac @ 0x5555a1673fc0] skip_data_stream_element: Input buffer exhausted before END element found


RuntimeError: stack expects a non-empty TensorList