In [1]:
import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torchvision import transforms

# 加载预训练的面部表情分类模型
processor = AutoImageProcessor.from_pretrained("/Volumes/SD扩展/pre-train/vit-face-expression")
model = AutoModelForImageClassification.from_pretrained("/Volumes/SD扩展/pre-train/vit-face-expression")

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

def preprocess_frames(frames):
    return [transform(frame) for frame in frames]

def deprocess_tensor(tensor):
    # 反归一化
    mean = torch.tensor([0.485, 0.456, 0.406])
    std = torch.tensor([0.229, 0.224, 0.225])
    tensor = tensor * std[:, None, None] + mean[:, None, None]
    return tensor

def extract_features(frames):
    deprocessed_frames = [deprocess_tensor(frame) for frame in frames]
    inputs = processor(images=deprocessed_frames, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    
    hidden_states = outputs.hidden_states[-1]  # 提取最后一层的隐藏状态作为特征
    
    return outputs

In [2]:
import cv2

def extract_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
    return frames


In [3]:
import os


video_path = '/Volumes/SD扩展/datasets/MSADatasets/MOSI/Raw/_dI--eQ6qVU/3.mp4'
# output_path = 'data/output/features/example_video_features.pt'
    
# 提取视频帧
frames = extract_frames(video_path)
    
    # print(len(frames))
    # 预处理帧
preprocessed_frames = preprocess_frames(frames)
    
    # 提取特征
features = extract_features(preprocessed_frames)
    
print(features)
    # 保存特征
    # torch.save(features, output_path)
    
    # print(f'Features saved to {output_path}')
    



It looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.


ImageClassifierOutput(loss=None, logits=tensor([[ 2.1249, -1.2494, -0.9883,  ...,  0.1131, -0.3146, -0.3394],
        [ 2.1249, -1.2494, -0.9883,  ...,  0.1131, -0.3146, -0.3394],
        [ 2.1249, -1.2494, -0.9883,  ...,  0.1131, -0.3147, -0.3393],
        ...,
        [ 2.1214, -1.2513, -0.9837,  ...,  0.1115, -0.3127, -0.3429],
        [ 2.1215, -1.2513, -0.9838,  ...,  0.1115, -0.3127, -0.3430],
        [ 2.1215, -1.2513, -0.9838,  ...,  0.1115, -0.3127, -0.3430]]), hidden_states=(tensor([[[-0.0015,  0.0057, -0.4979,  ..., -0.0044, -0.0017, -0.7578],
         [-0.1310, -0.4492, -0.6866,  ...,  0.4170,  0.3120, -1.2116],
         [-0.2661, -0.3214, -0.9165,  ...,  0.4523,  0.4302, -1.1631],
         ...,
         [-0.3237, -0.3363, -1.4829,  ...,  0.4078,  0.3517, -1.2370],
         [-0.3899, -0.2599, -1.3782,  ...,  0.4647,  0.4005, -1.3592],
         [-0.3174, -0.2859, -1.0951,  ...,  0.3464,  0.2433, -1.3278]],

        [[-0.0015,  0.0057, -0.4979,  ..., -0.0044, -0.0017, -0.7578

In [5]:
print(type(features))

<class 'transformers.modeling_outputs.ImageClassifierOutput'>


In [7]:
print(features.keys())

odict_keys(['logits', 'hidden_states'])


In [9]:
print(features['logits'].size())

torch.Size([345, 7])


In [17]:
print(type(features['hidden_states'][-1]))

<class 'torch.Tensor'>


In [19]:
print(features['hidden_states'][-2].size())

torch.Size([345, 197, 768])


In [20]:
print(len(features['hidden_states']))

13


In [25]:
from transformers import BertTokenizer, BertModel
import torch

# 初始化分词器和模型
tokenizer = BertTokenizer.from_pretrained('/Volumes/SD扩展/pre-train/bert-base-chinese')
model = BertModel.from_pretrained('/Volumes/SD扩展/pre-train/bert-base-chinese')

# 句子
sentence = "今天天气很好"

# 分词并转换为索引
input_ids = tokenizer(sentence, return_tensors='pt')['input_ids']

# 将输入传递给模型
outputs = model(input_ids)

# 获取最后一层隐藏状态
last_hidden_state = outputs.last_hidden_state
print(last_hidden_state)


Some weights of BertModel were not initialized from the model checkpoint at /Volumes/SD扩展/pre-train/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[[ 0.8366,  0.4820,  0.1251,  ..., -0.1664,  0.1059, -0.1358],
         [ 0.5671,  0.0344,  0.2266,  ..., -1.1902, -0.2806, -0.4668],
         [ 0.1521,  0.1090,  0.3865,  ..., -0.6273,  0.8679, -0.1136],
         ...,
         [ 0.3449,  0.4459,  0.4899,  ..., -0.6067,  0.1509, -0.3140],
         [ 0.8970,  0.1236, -0.6300,  ..., -0.6997,  0.1840, -0.6964],
         [ 0.8366,  0.4820,  0.1251,  ..., -0.1664,  0.1059, -0.1358]]],
       grad_fn=<NativeLayerNormBackward0>)


In [26]:
# 句子
sentence = "天气今天很好"

# 分词并转换为索引
input_ids = tokenizer(sentence, return_tensors='pt')['input_ids']

# 将输入传递给模型
outputs = model(input_ids)

# 获取最后一层隐藏状态
last_hidden_state_1 = outputs.last_hidden_state
print(last_hidden_state_1)

tensor([[[ 0.5040,  0.3856,  0.0859,  ..., -0.1787,  0.1100,  0.0130],
         [ 0.8034, -0.2207, -0.5525,  ..., -0.2998,  0.3629,  0.2626],
         [ 0.7355,  0.5654, -0.5792,  ...,  0.6396,  0.6736,  0.6351],
         ...,
         [ 0.0009,  0.5024,  0.3782,  ..., -0.7962, -0.0899, -0.3727],
         [ 0.5892, -0.0629, -0.8392,  ..., -0.6500,  0.1574, -0.6467],
         [ 0.5040,  0.3856,  0.0859,  ..., -0.1787,  0.1100,  0.0130]]],
       grad_fn=<NativeLayerNormBackward0>)


In [24]:
print(last_hidden_state.size())

torch.Size([1, 8, 768])


In [28]:
#计算余弦相似度
import torch.nn.functional as F
cos_sim = F.cosine_similarity(last_hidden_state, last_hidden_state_1)
print(cos_sim)

tensor([[ 0.5347,  0.4709,  0.3469,  0.7459,  0.2952,  0.8517,  0.6016,  0.6503,
          0.5298,  0.8968,  0.8562,  0.1388, -0.0705,  0.8386,  0.5110,  0.9581,
          0.3209,  0.9579,  0.9361,  0.7247,  0.6734,  0.7062,  0.9689,  0.1525,
          0.8946,  0.2168,  0.8390,  0.4684,  0.9710,  0.2738,  0.7166,  0.6475,
          0.7519,  0.9301,  0.0990,  0.8313,  0.8786,  0.6768,  0.9076,  0.0653,
          0.3075,  0.7665,  0.8917, -0.5388,  0.8474,  0.9494,  0.8764,  0.9402,
          0.7930,  0.7920,  0.4145,  0.9986,  0.9929,  0.6809,  0.7820,  0.9406,
          0.0884,  0.7372,  0.8395,  0.1678,  0.6872,  0.5649, -0.5361,  0.6211,
          0.7638,  0.5574,  0.1575,  0.5766,  0.7465,  0.5650,  0.5790,  0.8175,
          0.7478,  0.3472,  0.9174,  0.8663,  0.8906,  0.3704,  0.7431,  0.9353,
          0.7851,  0.6775,  0.3801,  0.7822,  0.7232,  0.9110,  0.9309,  0.9044,
          0.9526,  0.4985,  0.4189, -0.2786,  0.8222,  0.3656,  0.9097,  0.5324,
          0.9784,  0.8404,  

In [29]:
import os

def get_video_info(root_dir):
    video_info = []

    for video_folder in os.listdir(root_dir):
        video_folder_path = os.path.join(root_dir, video_folder)
        if os.path.isdir(video_folder_path):
            video_id = video_folder
            for video_file in os.listdir(video_folder_path):
                if video_file.endswith(".mp4"):
                    clip_id = os.path.splitext(video_file)[0]
                    video_info.append({"video_id": video_id, "clip_id": clip_id})

    return video_info

root_directory = "/Volumes/SD扩展/datasets/MSADatasets/MOSI/Raw"  # 这里填写你的根目录路径
video_info_list = get_video_info(root_directory)

for info in video_info_list:
    print(info)


{'video_id': 'iiK8YX8oH1E', 'clip_id': '7'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._7'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '6'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._6'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '4'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._4'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '5'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._5'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '1'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._1'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '2'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._2'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '3'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._3'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '18'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._18'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '17'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._17'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '16'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._16'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '14'}
{'video_id': 'iiK8YX8oH1E', 'clip_id': '._14'}
{'