In [1]:
import os
import torch
import torchaudio
import moviepy.editor as mp
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2Processor
import numpy as np

In [2]:
# Paths and constants
# SAVED_MODEL_PATH: fine-tuned state_dict that is loaded into the model below.
SAVED_MODEL_PATH = 'F:/SRC_Bhuvaneswari/typpo/Crimenet/W2V/Checkpoint/wav2vec2_epoch_10.pt'
# LABEL_MAP: class index (argmax over the logits) -> human-readable label.
LABEL_MAP = {0: 'Normal', 1: 'Abuse', 2: 'Explosion', 3: 'Fighting', 4: 'Car Accident', 5: 'Shooting', 6: 'Riot'}
# SAMPLING_RATE: rate in Hz used both when extracting audio and when calling the processor.
SAMPLING_RATE = 16000

# Load Wav2Vec2 processor and model with saved weights
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
# The base checkpoint has no classification head, so the library warns that the
# classifier/projector weights are newly initialized; they are overwritten by the
# state_dict loaded below (load_state_dict is strict by default).
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=len(LABEL_MAP)
)
# map_location="cpu": the model lives on the CPU here, and without it a checkpoint
# saved from CUDA tensors cannot be deserialized on a machine without a GPU.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load(SAVED_MODEL_PATH, map_location="cpu"))
# Switch to inference mode (disables dropout etc.).
model.eval()

# Function to extract audio from video
def extract_audio_from_video(video_path, output_audio_path="temp_audio.wav"):
    """Write the audio track of a video file to ``output_audio_path``.

    The audio is written with ``fps=SAMPLING_RATE`` so the resulting file is
    already at the rate used for inference.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Destination path for the extracted audio.

    Returns:
        ``output_audio_path``, unchanged, for convenient chaining.

    Raises:
        ValueError: If the video contains no audio track.
    """
    video = mp.VideoFileClip(video_path)
    try:
        # MoviePy sets ``audio`` to None for clips without an audio stream.
        if video.audio is None:
            raise ValueError(f"No audio track found in video: {video_path}")
        video.audio.write_audiofile(output_audio_path, fps=SAMPLING_RATE)
    finally:
        # Always release the underlying ffmpeg reader(s) and file handle,
        # even when writing fails; otherwise they leak for the kernel's lifetime.
        video.close()
    return output_audio_path

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Function to preprocess audio and make predictions
def predict_audio_class(audio_path):
    """Classify an audio file and return its label from ``LABEL_MAP``.

    Steps: load the file with torchaudio, resample to ``SAMPLING_RATE`` when
    the file's rate differs, average over dim 0 to get a single channel, run
    the result through ``processor`` and ``model``, and map the arg-max logit
    index to its label.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        The ``LABEL_MAP`` entry for the highest-scoring class.
    """
    signal, native_rate = torchaudio.load(audio_path)

    # Only pay for resampling when the file is not already at the target rate.
    if native_rate != SAMPLING_RATE:
        to_target_rate = torchaudio.transforms.Resample(orig_freq=native_rate, new_freq=SAMPLING_RATE)
        signal = to_target_rate(signal)

    # Collapse dim 0 (channels) into one mono track; the processor takes a numpy array.
    mono_samples = signal.mean(dim=0).numpy()

    encoded = processor(mono_samples, sampling_rate=SAMPLING_RATE, return_tensors="pt", padding=True)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        class_scores = model(encoded.input_values).logits

    best_index = torch.argmax(class_scores, dim=-1).item()
    return LABEL_MAP[best_index]

In [4]:
# Prediction function to extract audio from video and classify it
def predict_audio_from_video(video_path):
    """Extract a video's audio track to a temporary file and classify it.

    Args:
        video_path: Path of the video file to classify.

    Returns:
        The label returned by ``predict_audio_class`` for the extracted audio.
    """
    temp_audio_path = extract_audio_from_video(video_path)
    try:
        return predict_audio_class(temp_audio_path)
    finally:
        # Remove the temporary audio file even when classification raises,
        # so failed runs do not leave stale files behind.
        os.remove(temp_audio_path)

In [5]:
# Demo: run the video -> audio -> label pipeline on a single clip and print the result.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
video_file_path = "E:/SRC-Bhuvaneswari/VAD_XDViolence/ViVi/Dataset/XD Violence/Test/Brick.Mansions.2014__#00-16-26_00-17-12_label_B1-0-0.mp4"
predicted_class = predict_audio_from_video(video_file_path)
print(f"Predicted audio class: {predicted_class}")

MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.
Predicted audio class: Fighting
