In [52]:
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import moviepy.audio.fx as afx
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np

In [69]:
class AudioExtractor:
    def __init__(self, video_path):
        self.video_clip = VideoFileClip(video_path)
        self.audio_clip = self.video_clip.audio.set_fps(8000)
        if self.video_clip.duration > 420:
            self.audio_clip = self.audio_clip.subclip(0, 420)
        # Extract audio from video
        audio_path = "temp_audio.wav"  # Temporary file to store audio
        
        self.audio_clip.write_audiofile(audio_path)

        # Load the audio using pydub
        self.audio_segment = AudioSegment.from_wav(audio_path)

        # Initialize Wav2Vec2 processor and model
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
        self.model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

    def extract_features(self) -> torch.Tensor:
        # Convert audio segment to raw audio data (numpy array)
        audio_samples = np.array(self.audio_segment.get_array_of_samples()).astype(np.float32)
        
        # Handle stereo audio if necessary
        if self.audio_segment.channels == 2:
            audio_samples = audio_samples.reshape((-1, 2)).mean(axis=1)  # Convert to mono
        
        # Normalize audio data
        audio_samples /= np.max(np.abs(audio_samples))

        # Prepare the input for Wav2Vec2
        inputs = self.processor(audio_samples, sampling_rate=16000, return_tensors="pt", padding=True)

        # Get embeddings
        with torch.no_grad():
            embeddings = self.model(inputs.input_values).last_hidden_state
        
        return embeddings

In [70]:
path = r"C:\Users\Darya\Downloads\videos\3ec7c2b092514dc4ebeaa3036fe9857c.mp4"

audio = AudioExtractor(path)


MoviePy - Writing audio in temp_audio.wav


                                                                      

MoviePy - Done.


In [71]:
audio.extract_features()

tensor([[[ 0.1342, -0.1426, -0.0040,  ...,  0.6017,  0.1571,  0.2050],
         [ 0.0109, -0.1517,  0.0713,  ...,  0.6283, -0.0106, -0.0409],
         [-0.0315, -0.1571,  0.1013,  ...,  0.6137,  0.0038, -0.2005],
         ...,
         [-0.4672, -0.1100,  0.2366,  ...,  0.1547,  0.0495, -0.0290],
         [-0.4605, -0.0996,  0.2325,  ...,  0.1572,  0.0470, -0.0108],
         [-0.4609, -0.1134,  0.2219,  ...,  0.1486,  0.0520, -0.0050]]])