In [1]:
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np

In [12]:
class AudioExtractor:
    def __init__(self, video_path):
        self.video_clip = VideoFileClip(video_path)
        if self.video_clip.duration > 60:
            self.video_clip = self.video_clip.subclip(0, 60)
        # Extract audio from video
        audio_path = "temp_audio.wav"  # Temporary file to store audio
        self.video_clip.audio.write_audiofile(audio_path)

        # Load the audio using pydub
        self.audio_segment = AudioSegment.from_wav(audio_path)

        # Initialize Wav2Vec2 processor and model
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
        self.model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

    def extract_features(self) -> torch.Tensor:
        # Convert audio segment to raw audio data (numpy array)
        audio_samples = np.array(self.audio_segment.get_array_of_samples()).astype(np.float32)
        
        # Handle stereo audio if necessary
        if self.audio_segment.channels == 2:
            audio_samples = audio_samples.reshape((-1, 2)).mean(axis=1)  # Convert to mono
        
        # Normalize audio data
        audio_samples /= np.max(np.abs(audio_samples))

        # Prepare the input for Wav2Vec2
        inputs = self.processor(audio_samples, sampling_rate=16000, return_tensors="pt", padding=True)

        # Get embeddings
        with torch.no_grad():
            embeddings = self.model(inputs.input_values).last_hidden_state
        
        return embeddings

In [13]:
path = r"C:\Users\Darya\Downloads\videos\3b69f98d51c1028633cff24c7d2937e0.mp4"

audio = AudioExtractor(path)


chunk:  53%|█████▎    | 707/1324 [03:49<00:00, 3587.41it/s, now=None]

MoviePy - Writing audio in temp_audio.wav


chunk:  53%|█████▎    | 707/1324 [03:50<00:00, 3587.41it/s, now=None]

MoviePy - Done.


In [14]:
audio.extract_features()

ValueError: The model corresponding to this feature extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}
 was trained using a sampling rate of 16000. Please make sure that the provided `raw_speech` input was sampled with 16000 and not 8000.