In [23]:
import torchvision.models as models
from torchvision import transforms
from mtcnn import MTCNN
from PIL import Image
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import librosa
from moviepy.editor import VideoFileClip

# 1. 加载 MTCNN 模型
mtcnn = MTCNN()

# 2. 加载预训练的 ResNet50 模型
resnet50 = models.resnet50(pretrained=True)
resnet50.eval()

# 3. 移除最后的全连接层，用于特征提取
resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])

# 定义图像预处理的转换
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

#文件路径
old_path = r"C:\Users\50597\Desktop\1.mp4"
new_path = r"C:\Users\50597\Desktop\2.mp4"

# 图片人脸特征识别模块
def extract_face_features(image):
    image = Image.fromarray(image)
    result = mtcnn.detect_faces(np.array(image))
    if not result:
        print("No face detected.")
        return None
    face = result[0]
    x, y, width, height = face['box']
    face = image.crop((x, y, x + width, y + height))
    face_image = face
    face_tensor = preprocess(face_image)
    face_tensor = face_tensor.unsqueeze(0)
    with torch.no_grad():
        features = resnet50(face_tensor)
    features = features.squeeze()
    return features

# 视频人脸特征识别模块
def extract_video_features(video_path, num_frames=30):
    cap = cv2.VideoCapture(video_path)
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    frame_indices = np.linspace(0, total_frames - 1, num_frames)
    features_list = []
    for index in frame_indices:
        cap.set(cv2.CAP_PROP_POS_FRAMES, index)
        ret, frame = cap.read()
        if ret:
            features = extract_face_features(frame)
            if features is not None:
                features_list.append(features)
    cap.release()
    if len(features_list) > 0:
        features_mean = torch.stack(features_list).mean(dim=0)
        return features_mean
    else:
        return None
    
#音频特征提取模块
def extract_audio_feature_from_video(video_path, sample_rate=16000):
    # 加载预训练的 VGGish 模型
    vggish_model = hub.load('https://tfhub.dev/google/vggish/1')
    # 使用 moviepy 从视频中提取音频
    video_clip = VideoFileClip(video_path)
    audio_clip = video_clip.audio
    # 将音频保存为临时文件
    temp_audio_path = 'temp_audio.wav'
    audio_clip.write_audiofile(temp_audio_path)
    # 使用 librosa 加载音频数据和采样率
    y, sr = librosa.load(temp_audio_path, sr=sample_rate)  # VGGish 要求采样率为 16kHz
    # 将音频转换为所需的输入格式
    input_data = y.astype(np.float32)
    # 由于VGGish模型要求输入是任意长度的1-D Tensor，我们不需要增加 batch dimension
    # 直接使用input_data作为模型的输入
    # 使用 VGGish 提取特征
    features = vggish_model(input_data)
    return torch.tensor(features.numpy())

# 相似度计算模块
def cos_similarity(feat1, feat2):
    feat1 = feat1.flatten()
    feat2 = feat2.flatten()
    similarity = F.cosine_similarity(feat1.unsqueeze(0), feat2.unsqueeze(0))
    return similarity.item()


# 对比模块
def compare(old_path, new_path, threshold=0.8):
    old_face_feature = extract_video_features(old_path, num_frames=2)
    new_face_feature = extract_video_features(new_path, num_frames=2)
    face_similarity = cos_similarity(old_face_feature, new_face_feature)
    old_voice_feature = extract_audio_feature_from_video(old_path)
    new_voice_feature = extract_audio_feature_from_video(new_path)
    voice_similarity = cos_similarity(old_voice_feature, new_voice_feature)
    print(f'face similarity is {face_similarity}')
    print(f'voice similarity is {voice_similarity}')
    if face_similarity >= threshold and voice_similarity >= threshold:
        print('身份验证通过')
    elif face_similarity <= threshold and voice_similarity >= threshold:
        print('人脸未通过验证')
    elif face_similarity >= threshold and voice_similarity <= threshold:
        print('声音未通过验证')
    else:
        print('人脸和声音均未通过验证')

compare(old_path, new_path)

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.
MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.
face similarity is 0.9539943337440491
voice similarity is 0.8034164309501648
身份验证通过
