In [1]:
# Text Embedding

import numpy as np
import torch
from transformers import CLIPTokenizer, CLIPTextModelWithProjection

search_sentence = "a basketball player performing a slam dunk"

# Load the pretrained text model and the tokenizer that belongs to the same checkpoint.
model = CLIPTextModelWithProjection.from_pretrained("Searchium-ai/clip4clip-webvid150k")
tokenizer = CLIPTokenizer.from_pretrained("Searchium-ai/clip4clip-webvid150k")

inputs = tokenizer(text=search_sentence, return_tensors="pt")

# Inference only: run the forward pass without autograd bookkeeping instead of
# building a graph and detaching from it afterwards.
with torch.no_grad():
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

# Scale the first model output to unit L2 norm along the last axis, then convert to NumPy.
final_output = outputs[0] / outputs[0].norm(dim=-1, keepdim=True)
final_output = final_output.cpu().detach().numpy()
print("final output: ", final_output)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/568 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

final output:  [[ 3.25408089e-03  3.06101162e-02 -4.06834446e-02  4.61272057e-03
  -1.57766044e-02  5.83211333e-02  4.99447668e-03  4.67521586e-02
  -4.83069420e-02  3.04725897e-02  1.57634560e-02 -2.27446780e-02
   1.32764336e-02 -3.03331427e-02  1.68683343e-02  5.01386039e-02
   2.19177678e-02  5.90111539e-02 -8.10485408e-02  5.94285317e-03
   3.03250942e-02 -1.44688124e-02 -4.80068661e-02  3.04664560e-02
  -2.16435455e-02 -3.31622921e-02  3.50620486e-02  2.53910311e-02
  -2.58109812e-02  2.34463289e-02  5.43020386e-03 -2.20337007e-02
   2.19517220e-02  1.27743213e-02 -5.26760751e-03 -6.62063994e-03
   1.27831725e-02 -1.72309522e-02 -3.70556079e-02 -7.25613255e-03
  -1.30738167e-03  2.93124877e-02 -3.73392254e-02  2.49769026e-03
   7.39447959e-03  6.05660118e-02 -1.62404310e-02 -3.56167257e-02
  -6.44444153e-02 -2.67125443e-02  6.91965371e-02 -4.47211899e-02
  -6.60525411e-02 -2.99654398e-02 -1.50894548e-03 -1.45027079e-02
  -1.23195080e-02 -1.07174404e-02  6.69275550e-03 -1.29440846

In [2]:
# Sample video that the video-embedding cell below passes to video2image().
example = "/home/peterchen/M2/ADEPT/data/mafw/videos/00095.mp4"

In [3]:
# Video Embedding

from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
from PIL import Image
import cv2
import numpy as np
import torch

def video2image(video_path, frame_rate=1.0, size=224):
    """Decode a video file into a tensor of preprocessed frames.

    Frames are sampled every ``fps / frame_rate`` source frames, converted from
    BGR to RGB, resized (bicubic) and center-cropped to ``size``, turned into
    tensors and normalized with the per-channel mean/std constants below.

    Args:
        video_path: Path of the video file, opened through OpenCV's FFmpeg backend.
        frame_rate: Number of frames to sample per second of video.
        size: Side length in pixels of the square output frames.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(num_frames, 3, size, size)``.
        ``num_frames`` is 0 when no frame could be decoded. When the reported
        FPS is below 1 an all-zero tensor of shape ``(3, size, size)`` is
        returned and an error is printed.
    """
    # Build the preprocessing pipeline once instead of once per frame.
    transform = Compose([
        Resize(size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

    cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)
    try:
        frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))
        if fps < 1:
            images = np.zeros([3, size, size], dtype=np.float32)
            print("ERROR: problem reading video file: ", video_path)
        else:
            # Duration in whole seconds, rounded up.
            total_duration = (frameCount + fps - 1) // fps
            interval = fps / frame_rate
            frame_idx = np.floor(np.arange(0, total_duration * fps, interval))
            images = np.zeros([len(frame_idx), 3, size, size], dtype=np.float32)

            # Count decoded frames explicitly so that a failure on the very first
            # read (or an empty index list) yields an empty result instead of an
            # unbound-variable error.
            frames_read = 0
            for i, idx in enumerate(frame_idx):
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                images[i, :, :, :] = transform(Image.fromarray(frame))
                frames_read = i + 1

            # Drop the pre-allocated slots that were never filled.
            images = images[:frames_read]
    finally:
        # Release the capture even if decoding or preprocessing raised.
        cap.release()

    video_frames = torch.tensor(images)
    return video_frames

# Decode the sample video into preprocessed frames using video2image's default frame_rate and size.
video = video2image(example)

In [5]:
from transformers import CLIPVisionModelWithProjection

model = CLIPVisionModelWithProjection.from_pretrained("Searchium-ai/clip4clip-webvid150k")
model = model.eval()

# Inference only: run the forward pass over all frames without building an autograd graph.
with torch.no_grad():
    visual_output = model(video)

# Normalize each frame embedding, average over frames, then re-normalize the mean
# so the video-level embedding has unit L2 norm.
visual_output = visual_output["image_embeds"]
visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
visual_output = torch.mean(visual_output, dim=0)
visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)
print(visual_output)

tensor([-2.5028e-02, -3.5851e-02,  2.0197e-02, -2.0736e-02,  1.1188e-02,
         3.5648e-03,  6.8261e-03,  1.7236e-02, -5.6622e-02, -5.7398e-04,
         9.0653e-03,  3.3710e-02,  1.0795e-05,  1.4801e-02, -4.7317e-03,
        -1.8515e-02,  1.3165e-01, -3.6842e-02,  1.0471e-02, -1.5633e-02,
        -3.1747e-02, -9.7641e-03,  1.9203e-03,  1.7060e-03,  8.7066e-03,
         4.7712e-02, -1.0660e-02,  6.0551e-02,  2.0917e-02,  5.5179e-02,
        -2.1516e-02,  1.4854e-02, -5.7033e-02, -4.0912e-03, -4.8680e-02,
         2.6758e-03,  2.1659e-02, -1.6136e-02, -7.9793e-02,  7.6145e-02,
         5.4530e-03, -9.7047e-03, -1.1157e-02, -6.5071e-03, -4.6192e-02,
         9.9330e-02, -1.6618e-02,  3.8322e-02, -5.2666e-02, -1.2135e-02,
         1.3920e-02,  1.5180e-02, -3.5461e-02, -4.8166e-02, -4.5606e-02,
         3.5957e-02,  9.4075e-02,  1.9590e-02, -2.8555e-02,  1.3412e-02,
         4.1181e-02,  2.6066e-02, -1.0876e-02, -1.2972e-02, -2.0145e-02,
         5.2725e-03,  4.3454e-02, -2.1884e-02, -3.8

# MAFW

In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode
from PIL import Image
import cv2
from tqdm import tqdm
from transformers import CLIPVisionModelWithProjection, CLIPTokenizer, CLIPTextModelWithProjection

# --- 1. 设置与参数定义 (Setup and Parameters) ---

# Adjust these paths to match your own environment.
VIDEO_BASE_PATH = '/home/peterchen/M2/ADEPT/data/mafw/videos'
LABEL_FILE_PATH = '/home/peterchen/M2/ADEPT/data/mafw/labels/sampled_850.xlsx'
MODEL_NAME = "Searchium-ai/clip4clip-webvid150k"

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"当前使用的设备 (Using device): {DEVICE}")

# --- 2. 视频与文本编码函数 (Video and Text Encoding Functions) ---

# Video encoding function (adapted from the video-embedding cell above; returns None on failure)
def video2image(video_path, frame_rate=1.0, size=224):
    """Decode a video file into a stacked tensor of preprocessed frames.

    Frames are sampled every ``fps / frame_rate`` source frames, converted from
    BGR to RGB, resized (bicubic) and center-cropped to ``size``, turned into
    tensors and normalized with the per-channel mean/std constants below.

    Args:
        video_path: Path of the video file.
        frame_rate: Number of frames to sample per second of video.
        size: Side length in pixels of the square output frames.

    Returns:
        A tensor produced by ``torch.stack`` over the preprocessed frames, or
        ``None`` (after printing a message) when the file does not exist or no
        frame could be extracted.
    """
    if not os.path.exists(video_path):
        print(f"错误: 视频文件不存在 (ERROR: Video file not found): {video_path}")
        return None

    # Build the preprocessing pipeline once instead of once per frame.
    transform = Compose([
        Resize(size, interpolation=InterpolationMode.BICUBIC),
        CenterCrop(size),
        lambda image: image.convert("RGB"),
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])

    images = []
    cap = cv2.VideoCapture(video_path)
    try:
        frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps = int(cap.get(cv2.CAP_PROP_FPS))

        if fps > 0:
            # Duration in whole seconds, rounded up.
            total_duration = (frameCount + fps - 1) // fps
            interval = fps / frame_rate
            frames_idx = np.floor(np.arange(0, total_duration * fps, interval))

            for idx in frames_idx:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    # Stop at the first frame that cannot be decoded.
                    break
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                images.append(transform(Image.fromarray(frame)))
        else:
            print(f"警告: 无法读取视频FPS (WARNING: Could not read FPS for video): {video_path}")
    finally:
        # Release the capture even if decoding or preprocessing raised.
        cap.release()

    if not images:
        print(f"错误: 无法从视频中提取帧 (ERROR: Could not extract frames from video): {video_path}")
        return None

    video_frames = torch.stack(images)
    return video_frames

# --- 3. 评估指标计算函数 (Evaluation Metrics Function) ---

# 指标计算函数 (与教程中的代码一致)
def compute_metrics(sim_matrix, output_path="/home/peterchen/M2/Clip4Clip/metrics_mafw_results.txt"):
    """Compute text-to-video retrieval metrics from a similarity matrix.

    Text ``i`` is assumed to match video ``i``. For each text the videos are
    ranked by descending similarity and the 0-based position of the matching
    video is recorded. Rows with no matching column (more texts than videos)
    are ignored.

    Args:
        sim_matrix: 2-D array-like of shape ``(N_texts, N_videos)``.
        output_path: File that receives the same report that is printed. Pass
            ``None`` to skip writing. Defaults to the original hard-coded path.

    Returns:
        dict with ``R1``, ``R5``, ``R10`` (recall in percent), ``MR`` and
        ``MedianR`` (median rank, 1-based) and ``MeanR`` (mean rank, 1-based).

    Raises:
        ValueError: If ``sim_matrix`` is not a non-empty 2-D matrix.
    """
    sim_matrix = np.asarray(sim_matrix)
    if sim_matrix.ndim != 2 or sim_matrix.size == 0:
        raise ValueError(
            f"sim_matrix must be a non-empty 2-D array, got shape {sim_matrix.shape}"
        )

    # Video indices for every text, best match first.
    nn_idx = np.argsort(-sim_matrix, axis=1)

    # Position of the ground-truth video (index == row number) in each ranking.
    # Comparing against the row index avoids materializing an N x N identity
    # matrix and also works when there are more videos than texts.
    targets = np.arange(nn_idx.shape[0])[:, None]
    ind = np.where(nn_idx == targets)[1]

    metrics = {}
    metrics['R1'] = float(np.sum(ind == 0)) * 100 / len(ind)
    metrics['R5'] = float(np.sum(ind < 5)) * 100 / len(ind)
    metrics['R10'] = float(np.sum(ind < 10)) * 100 / len(ind)
    metrics['MR'] = np.median(ind) + 1
    metrics["MedianR"] = metrics['MR']
    metrics["MeanR"] = np.mean(ind) + 1

    header = '--- 评估结果 (Evaluation Results) ---'
    shape_line = f'相似度矩阵形状 (Similarity-matrix shape): {nn_idx.shape}'
    result_str = (
        f"文本->视频检索 (Text-to-Video): "
        f"R@1: {metrics['R1']:.2f}% - "
        f"R@5: {metrics['R5']:.2f}% - "
        f"R@10: {metrics['R10']:.2f}% - "
        f"Median R: {metrics['MR']:.2f} - "
        f"Mean R: {metrics['MeanR']:.2f}"
    )
    print(header)
    print(shape_line)
    print(result_str)

    # Persist the same report as plain text unless the caller opted out.
    if output_path is not None:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(header + '\n')
            f.write(shape_line + '\n')
            f.write(result_str + "\n")

    return metrics

# --- 4. 主执行流程 (Main Execution Workflow) ---

def main():
    """Run the text-to-video retrieval evaluation end to end.

    Loads the vision and text models plus tokenizer named by ``MODEL_NAME``,
    reads video/caption pairs from ``LABEL_FILE_PATH``, embeds every video and
    caption, builds the text-by-video similarity matrix and passes it to
    ``compute_metrics``.

    Returns:
        The metrics dict returned by ``compute_metrics``.
    """
    # --- Step 1: load the models ---
    print("正在加载模型 (Loading models)...")
    video_model = CLIPVisionModelWithProjection.from_pretrained(MODEL_NAME).to(DEVICE)
    text_model = CLIPTextModelWithProjection.from_pretrained(MODEL_NAME).to(DEVICE)
    tokenizer = CLIPTokenizer.from_pretrained(MODEL_NAME)
    video_model.eval()
    text_model.eval()
    print("模型加载完成 (Models loaded).")

    # Read sizes from the model configs instead of hard-coding them.
    embed_dim = video_model.config.projection_dim
    max_text_len = text_model.config.max_position_embeddings

    # --- Step 2: load the labels ---
    print(f"正在读取标签文件 (Reading label file): {LABEL_FILE_PATH}")
    df = pd.read_excel(LABEL_FILE_PATH)
    # Keep only rows that have both a video name and a caption so that the
    # pairing stays one-to-one.
    df = df.dropna(subset=['video_name', 'eng_caption'])
    video_names = df['video_name'].tolist()
    captions = df['eng_caption'].tolist()
    print(f"找到了 {len(video_names)} 个有效的视频-文本对 (Found {len(video_names)} valid video-text pairs).")

    # --- Step 3: extract the video and text features ---
    all_video_embeds = []
    all_text_embeds = []

    print("开始提取特征 (Starting feature extraction)...")
    with torch.no_grad():
        # Video features
        for video_name in tqdm(video_names, desc="处理视频中 (Processing Videos)"):
            video_path = os.path.join(VIDEO_BASE_PATH, video_name)
            video_frames = video2image(video_path)

            if video_frames is None:
                # Keep index alignment with the captions: a failed video gets
                # an all-zero placeholder embedding.
                all_video_embeds.append(torch.zeros(embed_dim, device=DEVICE))
                continue

            video_frames = video_frames.to(DEVICE)
            visual_output = video_model(video_frames)

            # Normalize per frame, average over frames, then re-normalize.
            visual_embeds = visual_output["image_embeds"]
            visual_embeds = visual_embeds / visual_embeds.norm(dim=-1, keepdim=True)
            visual_embeds = torch.mean(visual_embeds, dim=0)
            visual_embeds = visual_embeds / visual_embeds.norm(dim=-1, keepdim=True)

            all_video_embeds.append(visual_embeds)

        # Text features
        for caption in tqdm(captions, desc="处理文本中 (Processing Texts)"):
            # Truncate to the text model's position limit; longer captions
            # would otherwise fail inside the model.
            inputs = tokenizer(
                text=caption,
                return_tensors="pt",
                truncation=True,
                max_length=max_text_len,
            ).to(DEVICE)
            text_output = text_model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])

            # Normalize to unit L2 norm.
            text_embed = text_output[0] / text_output[0].norm(dim=-1, keepdim=True)
            all_text_embeds.append(text_embed.squeeze(0))

    # --- Step 4: similarity and evaluation ---
    print("特征提取完成，开始计算评估指标 (Feature extraction complete. Calculating metrics)...")

    # Stack the per-item embeddings and move them to NumPy.
    video_embeddings_tensor = torch.stack(all_video_embeds).cpu().numpy()
    text_embeddings_tensor = torch.stack(all_text_embeds).cpu().numpy()

    # Similarity matrix, texts along rows and videos along columns.
    similarity_matrix = np.matmul(text_embeddings_tensor, video_embeddings_tensor.T)

    # Compute, print and return the metrics so callers can reuse them.
    return compute_metrics(similarity_matrix)

if __name__ == '__main__':
    # Keep the result at notebook scope so later cells can reference `metrics`.
    metrics = main()

当前使用的设备 (Using device): cuda
正在加载模型 (Loading models)...
模型加载完成 (Models loaded).
正在读取标签文件 (Reading label file): /home/peterchen/M2/ADEPT/data/mafw/labels/sampled_850.xlsx
找到了 850 个有效的视频-文本对 (Found 850 valid video-text pairs).
开始提取特征 (Starting feature extraction)...


处理视频中 (Processing Videos):  13%|█▎        | 108/850 [00:41<01:47,  6.89it/s][h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
[h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
[h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
[h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
[h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
[h264 @ 0x561d08330500] error while decoding MB 72 39, bytestream -7
处理视频中 (Processing Videos):  19%|█▊        | 158/850 [01:00<03:39,  3.16it/s][mpeg4 @ 0x561d0516b100] slice end not reached but screenspace end (3 left C00000, score= -9)
[mpeg4 @ 0x561cfb797700] slice end not reached but screenspace end (3 left C00000, score= -10)
处理视频中 (Processing Videos): 100%|██████████| 850/850 [05:46<00:00,  2.45it/s]
处理文本中 (Processing Texts): 100%|██████████| 850/850 [00:12<00:00, 69.09it/s]


特征提取完成，开始计算评估指标 (Feature extraction complete. Calculating metrics)...
--- 评估结果 (Evaluation Results) ---
相似度矩阵形状 (Similarity-matrix shape): (850, 850)
文本->视频检索 (Text-to-Video): R@1: 6.6% - R@5: 15.6% - R@10: 21.6% - Median R: 75.0 - Mean R: 129.1


In [2]:
# Requires a notebook-level `metrics` dict with the keys R1, R5, R10, MR and MeanR
# (the dict compute_metrics() returns). If no such name is bound in the current
# kernel, this cell raises NameError.
print(f"文本->视频检索 (Text-to-Video): R@1: {metrics['R1']:.2f}% - R@5: {metrics['R5']:.2f}% - R@10: {metrics['R10']:.2f}% - Median R: {metrics['MR']:.2f} - Mean R: {metrics['MeanR']:.2f}")

NameError: name 'metrics' is not defined

In [3]:
import pandas as pd

# Load the caption spreadsheet; as the cell's last expression, the resulting
# DataFrame is what the notebook renders below.
pd.read_excel(
    io="/home/peterchen/M2/MER2024/llava_next_video_caption.xlsx",
)

Unnamed: 0,name,chinese,english,eng_caption,video_path,model_caption,processing_time
0,sample_00000000,在视频中，开头的画面显示一位穿着医生服的男性角色，他的面部表情显得有些严肃和认真，眼神专注地...,"In the video, the opening scene shows a male c...","In the video, the opening scene shows a male c...",/home/peterchen/M2/MER2024/video-selected/samp...,A man with dark hair and a beard stands in a h...,7.151556
1,sample_00000007,在视频中，我们看到一位女士在室内环境中使用电话。她的面部表情显得愉悦，嘴角上扬，显示出她可能...,"In the video, we see a lady using a phone in a...","In the video, we see a lady using a phone in a...",/home/peterchen/M2/MER2024/video-selected/samp...,"A woman with black hair, wearing a traditional...",7.189739
2,sample_00000021,在视频中，画面显示一位中年女性，她坐在凳子上，场景是室内，装修比较豪华。在视频中，她的面部表...,"In the video, the screen shows a middle-aged w...","In the video, the screen shows a middle-aged w...",/home/peterchen/M2/MER2024/video-selected/samp...,A woman with dark hair and visible identity cu...,5.931152
3,sample_00000033,在视频中，画面显示了一位男性的特写镜头。他的面部表情相对紧张，眉头紧锁，可能表明他正在经历一...,"In the video, the screen shows a close-up shot...","In the video, the screen shows a close-up shot...",/home/peterchen/M2/MER2024/video-selected/samp...,"In the video clip, a man with a serious expres...",5.498530
4,sample_00000039,在视频中，开头的画面中，我们看到一位女性角色站在室内环境中，她表情严肃，眼睛瞪圆，盯着对方，...,"In the video, in the opening scene, we see a f...","In the video, in the opening scene, we see a f...",/home/peterchen/M2/MER2024/video-selected/samp...,A woman with long dark hair stands in a room w...,5.195675
...,...,...,...,...,...,...,...
327,sample_00003239,在视频中，画面显示一位男士正在使用手机通话，场景是室内。在视频开头，他一只手叉着腰，一只手拿...,"In the video, the screen shows a man using a c...","In the video, the screen shows a man using a c...",/home/peterchen/M2/MER2024/video-selected/samp...,A man with a black and white striped shirt sta...,6.186628
328,sample_00003253,在视频中，画面显示两个角色坐在一张桌子旁，环境看起来像是一个室内的餐厅，我们主要分析女性的情...,"In the video, the scene shows two characters s...","In the video, the scene shows two characters s...",/home/peterchen/M2/MER2024/video-selected/samp...,"A woman with long hair, wearing a pink dress, ...",7.187324
329,sample_00003284,在视频中，画面显示一位中年女士正在打电话，从背景中的窗帘可以推测出场景是室内。在视频开头，她...,"In the video, the screen shows a middle-aged w...","In the video, the screen shows a middle-aged w...",/home/peterchen/M2/MER2024/video-selected/samp...,"A woman with black hair, wearing a blue blouse...",5.629751
330,sample_00003296,在视频中，画面显示一个穿着红色外套的小女孩，她的周围围着不少人，场景是室外。在视频中，她嘴角...,"In the video, the screen shows a little girl w...","In the video, the screen shows a little girl w...",/home/peterchen/M2/MER2024/video-selected/samp...,"In the scene, a man stands in a dimly lit room...",5.547124
