In [1]:
from moviepy.editor import VideoFileClip
from pydub import AudioSegment
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import numpy as np

In [35]:
class AudioExtractor:
    def __init__(self, video_path):
        self.video_clip = VideoFileClip(video_path)
        self.audio_clip = self.video_clip.audio.set_fps(8000)
        if self.video_clip.duration > 60:
            self.video_clip = self.video_clip.subclip(0, 60)
        # Extract audio from video
        audio_path = "temp_audio.wav"  # Temporary file to store audio
        
        self.audio_clip.write_audiofile(audio_path)

        # Load the audio using pydub
        self.audio_segment = AudioSegment.from_wav(audio_path)

        # Initialize Wav2Vec2 processor and model
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
        self.model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

    def extract_features(self) -> torch.Tensor:
        # Convert audio segment to raw audio data (numpy array)
        audio_samples = np.array(self.audio_segment.get_array_of_samples()).astype(np.float32)
        
        # Handle stereo audio if necessary
        if self.audio_segment.channels == 2:
            audio_samples = audio_samples.reshape((-1, 2)).mean(axis=1)  # Convert to mono
        
        # Normalize audio data
        audio_samples /= np.max(np.abs(audio_samples))

        # Prepare the input for Wav2Vec2
        inputs = self.processor(audio_samples, sampling_rate=16000, return_tensors="pt", padding=True)

        # Get embeddings
        with torch.no_grad():
            embeddings = self.model(inputs.input_values).last_hidden_state
        
        return embeddings

In [36]:
path = r"C:\Users\Darya\Downloads\videos\3b69f98d51c1028633cff24c7d2937e0.mp4"

audio = AudioExtractor(path)


MoviePy - Writing audio in temp_audio.wav


                                                                    

MoviePy - Done.


In [37]:
audio.extract_features()

tensor([[[ 0.1208, -0.1846,  0.3789,  ...,  0.4357,  0.1528,  0.0036],
         [ 0.1390, -0.2295,  0.3784,  ...,  0.6440, -0.0260,  0.1158],
         [ 0.0513, -0.1899,  0.3342,  ...,  0.6386, -0.1095,  0.2583],
         ...,
         [ 0.0041, -0.1939, -0.1719,  ...,  0.1579,  0.0501,  0.1461],
         [-0.0157, -0.2077, -0.1370,  ...,  0.2151, -0.0215,  0.0733],
         [-0.0737, -0.2238, -0.1443,  ...,  0.1543,  0.0058,  0.1797]]])