## Wav2Vec2 Sliding Window Audio Feature Extraction (50% Overlap, 1024D)

In [1]:
import os
import torch
import torchaudio
import numpy as np
from tqdm import tqdm
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# === CONFIGURATION ===
AUDIO_INPUT_DIR = "/kaggle/input/sota-mrsvtt-train-frame/msrvtt_audio_wav"
AUDIO_OUTPUT_DIR = "/kaggle/working/wav2vec2_audio_features_seq_train_overlap"
MODEL_NAME = "facebook/wav2vec2-base-960h"
SEGMENT_SECONDS = 1.0
OVERLAP_RATIO = 0.5
SAMPLE_RATE = 16000

os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)

# === LOAD MODEL AND PROCESSOR ===
processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
model = Wav2Vec2Model.from_pretrained(MODEL_NAME).to("cuda").eval()
feature_dim = model.config.hidden_size  # 768

# === PROCESSING LOOP ===
audio_files = sorted([f for f in os.listdir(AUDIO_INPUT_DIR) if f.endswith(".wav")])
print(f"Processing {len(audio_files)} audio files with sliding window...")

for file in tqdm(audio_files):
    try:
        audio_path = os.path.join(AUDIO_INPUT_DIR, file)
        video_id = os.path.splitext(file)[0]
        save_path = os.path.join(AUDIO_OUTPUT_DIR, f"{video_id}_audio.npy")

        if os.path.exists(save_path):
            continue

        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)
            waveform = resampler(waveform)

        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        segment_len = int(SAMPLE_RATE * SEGMENT_SECONDS)
        stride = int(segment_len * (1 - OVERLAP_RATIO))
        total_len = waveform.shape[1]

        segment_features = []
        for start in range(0, total_len - segment_len + 1, stride):
            end = start + segment_len
            segment = waveform[:, start:end]

            inputs = processor(segment.squeeze(0), sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
            input_values = inputs.input_values.to("cuda")

            with torch.no_grad():
                outputs = model(input_values)
                feat = outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()

            padded_feat = np.zeros(1024, dtype=np.float32)
            padded_feat[:feature_dim] = feat
            segment_features.append(padded_feat)

        if segment_features:
            np.save(save_path, np.stack(segment_features))  # Shape: [num_segments, 1024]
        else:
            print(f"Skipped: {file} (too short for sliding window)")

    except Exception as e:
        print(f"FAILED: {file} | Error: {str(e)}")

2025-05-23 22:25:53.361191: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748039153.603224      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748039153.673683      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing 6176 audio files with sliding window...


100%|██████████| 6176/6176 [32:34<00:00,  3.16it/s]
