In [1]:
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np

In [7]:
class AudioExtractor:
    def __init__(self, video_path):
        self.video_clip = VideoFileClip(video_path)
        self.audio_clip = self.video_clip.audio.set_fps(8000)
        if self.video_clip.duration > 420:
            self.audio_clip = self.audio_clip.subclip(0, 420)
        # Аудио из видео
        audio_path = "temp_audio.wav"  # Временный файл
        
        self.audio_clip.write_audiofile(audio_path)

        # Загружаем аудио в новом формате
        self.audio_segment = AudioSegment.from_wav(audio_path)

        # Инициализируем процессор и модель
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
        self.model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

    def extract_features(self) -> torch.Tensor:
        # Конвертируем в np.array
        audio_samples = np.array(self.audio_segment.get_array_of_samples()).astype(np.float32)
        
        # Конвертируем в моно
        if self.audio_segment.channels == 2:
            audio_samples = audio_samples.reshape((-1, 2)).mean(axis=1)
        
        # Нормализуем
        audio_samples /= np.max(np.abs(audio_samples))

        # Подготавливаем inputs для модели
        inputs = self.processor(audio_samples, sampling_rate=16000, return_tensors="pt", padding=True)

        # Получаем эмбеддинги
        with torch.no_grad():
            embeddings = self.model(inputs.input_values).last_hidden_state
        

        return embeddings.mean(dim=1)

In [12]:
path = r"C:\Users\Darya\Downloads\videos\4ac3ae3413347e4bcaa66faf649e5cde.mp4"

audio = AudioExtractor(path) # извлекаем аудио


MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.


In [13]:
res = audio.extract_features() # получаем эмбеддинги

In [11]:
res

tensor([[ 1.0370e-03, -6.8200e-02,  2.0120e-01,  1.9773e-01, -1.0696e-01,
         -1.6552e-01,  4.0946e-01, -1.7380e-01, -9.8619e-02, -1.4888e-01,
          7.0530e-02, -8.4368e-02, -2.1432e-01,  1.4174e-01, -1.2942e-01,
         -1.7895e-02,  2.1447e-01, -1.5651e-01,  2.1261e-02, -8.6758e-02,
         -3.0959e-01, -1.2414e-01, -1.9977e-01,  1.3083e-01,  4.1261e-01,
         -3.2752e-02,  4.4777e-02, -1.0024e-03, -8.8847e-02, -2.5854e-01,
          4.8677e-02, -6.1903e-02,  1.1133e-01,  1.1446e-01,  4.0248e-01,
         -2.4472e-02, -2.7836e-02,  3.6936e-02, -1.2751e-01, -2.8570e-01,
         -4.9669e-02,  2.7006e-01, -1.9610e-01, -1.9149e-01, -9.2743e-02,
         -3.3214e-01, -4.5655e-02, -1.0975e-01,  6.0622e-02, -2.3209e-01,
         -7.9157e-02,  1.2345e-02,  1.7849e-01, -2.8603e-01,  8.7541e-02,
          4.5854e-02,  1.1397e-01, -1.8307e-01,  2.9037e-01, -2.1247e-01,
         -3.0325e-01,  2.1107e-05,  1.1983e-01,  6.2452e-02, -1.7805e-01,
         -3.4691e-02, -8.0617e-02, -2.