# MSR-VTT Frame and Audio Extraction Script (1 FPS, SOTA-Compatible)
Extracts:
- 1 FPS frames resized to 224x224 or 256x256 (BLIP-2 compatible)
- 16kHz mono WAV audio (Wav2Vec2 compatible)
- Skips already processed files
- Verifies missing frame folders from `video0` to `video7010`

In [1]:
import os
import subprocess
from tqdm import tqdm
import time

## Configuration

In [2]:
# ======= CONFIG =======
VIDEO_INPUT_DIR = '/kaggle/input/msrvtt/TrainValVideo'  # Kaggle dataset
FRAME_OUTPUT_DIR = '/kaggle/working/msrvtt_frames_1fps'
AUDIO_OUTPUT_DIR = '/kaggle/working/msrvtt_audio_wav'
RESOLUTION = 256  
FPS = 1

## Directory Setup

In [3]:
# ======= SETUP =======
os.makedirs(FRAME_OUTPUT_DIR, exist_ok=True)
os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)

video_files = sorted([f for f in os.listdir(VIDEO_INPUT_DIR) if f.endswith('.mp4')])

start_time = time.time()
print(f" Processing {len(video_files)} videos...")

 Processing 7010 videos...


##  Main Processing Loop

In [4]:
# ======= MAIN LOOP =======
for video_file in tqdm(video_files):
    video_id = os.path.splitext(video_file)[0]  # e.g., video7010
    video_path = os.path.join(VIDEO_INPUT_DIR, video_file)
    
    # ----- Frame Extraction -----
    frame_output_folder = os.path.join(FRAME_OUTPUT_DIR, video_id)
    os.makedirs(frame_output_folder, exist_ok=True)

    if not os.listdir(frame_output_folder):  # only if empty
        frame_cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vf', f'fps={FPS},scale={RESOLUTION}:{RESOLUTION}',
            os.path.join(frame_output_folder, 'frame_%04d.jpg')
        ]
        subprocess.run(frame_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # ----- Audio Extraction -----
    audio_output_path = os.path.join(AUDIO_OUTPUT_DIR, f'{video_id}.wav')
    if not os.path.exists(audio_output_path):
        audio_cmd = [
            'ffmpeg',
            '-i', video_path,
            '-vn',                    # no video
            '-acodec', 'pcm_s16le',   # raw wav
            '-ar', '16000',           # 16 kHz
            '-ac', '1',               # mono
            audio_output_path
        ]
        subprocess.run(audio_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

100%|██████████| 7010/7010 [54:32<00:00,  2.14it/s]


##  Summary: Total Processing Time

In [5]:
elapsed = time.time() - start_time
print(f"\n Done processing {len(video_files)} videos in {elapsed:.2f} seconds.")


 Done processing 7010 videos in 3274.08 seconds.


##Check Missing Frame Folders (`video0` to `video7010`)

In [6]:
# ======= VERIFICATION: CHECK MISSING FRAME FOLDERS =======
print("\n Verifying extracted frame folders...")

expected_videos = [f"video{i}" for i in range(7011)]  # 0 to 7010
missing_videos = []

for vid in expected_videos:
    frame_dir = os.path.join(FRAME_OUTPUT_DIR, vid)
    if not os.path.exists(frame_dir) or len(os.listdir(frame_dir)) == 0:
        missing_videos.append(vid)

# Display results
if not missing_videos:
    print("All expected videos have extracted frames.")
else:
    print(f" Missing frame folders for {len(missing_videos)} videos:")
    for mv in missing_videos[:10]:  # Print first 10 only
        print(f"   - {mv}")
    if len(missing_videos) > 10:
        print(f"   ... and {len(missing_videos) - 10} more.")

    # Save missing video IDs to a file
    with open('/kaggle/working/missing_videos.txt', 'w') as f:
        for mv in missing_videos:
            f.write(mv + '\n')
    print(" Saved missing video list to 'missing_videos.txt'")


 Verifying extracted frame folders...
 Missing frame folders for 1 videos:
   - video7010
 Saved missing video list to 'missing_videos.txt'
