## Test Set Extraction (video7010–video9999, Sorted Numerically)

In [1]:
import os
import subprocess
import time
import glob
from tqdm import tqdm

# ======= CONFIG =======
# Folder that is scanned for the source "video<N>.mp4" clips.
VIDEO_INPUT_DIR = '/kaggle/input/msrvtt/MSRVTT/videos/all'
# One sub-folder of JPEG frames is written here per clip.
FRAME_OUTPUT_DIR = '/kaggle/working/msrvtt_test_frames_1fps'
# One "<video_id>.wav" file is written here per clip.
AUDIO_OUTPUT_DIR = '/kaggle/working/msrvtt_test_audio_wav'
# Frames are scaled to RESOLUTION x RESOLUTION pixels by the ffmpeg scale filter.
RESOLUTION = 256
# Sampling rate handed to the ffmpeg fps filter (frames kept per second of video).
FPS = 1

# ======= SETUP =======
# Create both output roots up front; exist_ok keeps a re-run of this cell harmless.
for output_dir in (FRAME_OUTPUT_DIR, AUDIO_OUTPUT_DIR):
    os.makedirs(output_dir, exist_ok=True)

# ======= CUSTOM SORT + FILTER =======
def sort_by_video_number(filename):
    """Return the integer N embedded in a file name shaped like ``video<N>.mp4``.

    Intended as a sort key so that clips are ordered numerically
    (video9 before video10) instead of lexicographically.

    Every occurrence of "video" and ".mp4" is stripped from the name before
    conversion; a ValueError is raised if what remains is not an integer.
    """
    digits = filename
    for token in ("video", ".mp4"):
        digits = digits.replace(token, "")
    return int(digits)

# Collect every "video<N>.mp4" entry of the input folder, then order the
# names by N (numeric order) rather than by their string value.
candidate_clips = []
for entry in os.listdir(VIDEO_INPUT_DIR):
    if entry.startswith("video") and entry.endswith(".mp4"):
        candidate_clips.append(entry)
candidate_clips.sort(key=sort_by_video_number)

# Keep only the test set range (video7010 .. video9999, both inclusive);
# filtering an already sorted list preserves the numeric order.
video_files = [
    clip for clip in candidate_clips
    if 7010 <= sort_by_video_number(clip) <= 9999
]

print(f" Processing {len(video_files)} test set videos...")

# Wall-clock reference taken right before the extraction loop starts.
start_time = time.time()

# ======= MAIN LOOP =======
def _run_ffmpeg(ffmpeg_args):
    """Run ffmpeg with the given arguments and report whether it succeeded.

    Args:
        ffmpeg_args: list of command-line arguments placed after the ffmpeg
            executable and the logging/stdin options added here.

    Returns:
        None on success; otherwise a short failure description (the last line
        ffmpeg wrote to stderr, or the exit code if stderr was empty).

    Raises:
        FileNotFoundError: if the ffmpeg executable cannot be found.
    """
    completed = subprocess.run(
        # -nostdin: never wait for interactive input inside the notebook.
        # -loglevel error: keep stderr limited to real error messages.
        ['ffmpeg', '-nostdin', '-loglevel', 'error', *ffmpeg_args],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        errors='replace',  # decode stderr as text without failing on odd bytes
    )
    if completed.returncode == 0:
        return None
    stderr_lines = completed.stderr.strip().splitlines()
    if stderr_lines:
        return stderr_lines[-1]
    return f'ffmpeg exited with code {completed.returncode}'


def _report_failures(label, failures, preview_limit=10):
    """Print a short summary of the videos whose extraction step failed."""
    if not failures:
        return
    print(f" {label} extraction failed for {len(failures)} videos:")
    for failed_id, reason in list(failures.items())[:preview_limit]:
        print(f"   - {failed_id}: {reason}")
    if len(failures) > preview_limit:
        print(f"   ... and {len(failures) - preview_limit} more.")


# video_id -> failure description, filled in while the loop runs.
failed_frames = {}
failed_audio = {}

for video_file in tqdm(video_files):
    video_id = os.path.splitext(video_file)[0]
    video_path = os.path.join(VIDEO_INPUT_DIR, video_file)

    # Frame folder
    frame_output_folder = os.path.join(FRAME_OUTPUT_DIR, video_id)
    os.makedirs(frame_output_folder, exist_ok=True)

    # An empty folder means "not extracted yet"; a populated one is skipped.
    if not os.listdir(frame_output_folder):
        frame_glob = os.path.join(frame_output_folder, 'frame_*.jpg')
        error = _run_ffmpeg([
            '-i', video_path,
            '-vf', f'fps={FPS},scale={RESOLUTION}:{RESOLUTION}',
            # ffmpeg's image sequence output is numbered from 1 without gaps,
            # so no renumbering pass is needed afterwards.
            os.path.join(frame_output_folder, 'frame_%04d.jpg')
        ])
        if error is None and not glob.glob(frame_glob):
            error = 'ffmpeg produced no frames'
        if error is not None:
            # Drop partial output so the "folder is empty" check above retries
            # this video on the next run instead of treating it as done.
            for leftover in glob.glob(frame_glob):
                os.remove(leftover)
            failed_frames[video_id] = error

    # Audio path
    audio_output_path = os.path.join(AUDIO_OUTPUT_DIR, f'{video_id}.wav')
    if not os.path.exists(audio_output_path):
        # Mono, 16 kHz, 16-bit PCM wav with the video stream dropped (-vn).
        error = _run_ffmpeg([
            '-i', video_path,
            '-vn',
            '-acodec', 'pcm_s16le',
            '-ar', '16000',
            '-ac', '1',
            audio_output_path
        ])
        if error is not None:
            # Remove a truncated wav so the existence check above retries it.
            if os.path.exists(audio_output_path):
                os.remove(audio_output_path)
            # NOTE(review): a clip without an audio stream presumably lands here
            # too, so an entry does not necessarily mean a corrupt file - confirm.
            failed_audio[video_id] = error

# Surface what used to be silently discarded.
_report_failures('Frame', failed_frames)
_report_failures('Audio', failed_audio)

 Processing 2990 test set videos...


100%|██████████| 2990/2990 [23:45<00:00,  2.10it/s]


## Check Missing Frame Folders (Test Set video7010–video9999)

In [2]:
print("\nVerifying extracted test set frame folders...")


def _has_frames(video_name):
    """Return True when the video's frame folder exists and is not empty."""
    folder = os.path.join(FRAME_OUTPUT_DIR, video_name)
    return os.path.exists(folder) and len(os.listdir(folder)) > 0


# Every id of the test range (video7010 .. video9999) is expected to have frames.
expected_test_videos = [f"video{i}" for i in range(7010, 10000)]
missing_test_videos = [
    video_name for video_name in expected_test_videos
    if not _has_frames(video_name)
]

# Display results
if missing_test_videos:
    print(f" Missing test frame folders for {len(missing_test_videos)} videos:")
    # Only preview the first few ids on screen; the full list goes to the file.
    preview_limit = 10
    for video_name in missing_test_videos[:preview_limit]:
        print(f"   - {video_name}")
    hidden_count = len(missing_test_videos) - preview_limit
    if hidden_count > 0:
        print(f"   ... and {hidden_count} more.")

    # Save to file, one video id per line
    with open('/kaggle/working/missing_test_videos.txt', 'w') as report:
        report.writelines(f"{video_name}\n" for video_name in missing_test_videos)
    print(" Saved missing test video list to 'missing_test_videos.txt'")
else:
    print("All expected test set videos have extracted frames.")


Verifying extracted test set frame folders...
All expected test set videos have extracted frames.


In [3]:
# ======= ZIP FINAL OUTPUT =======
import zipfile

zip_filename = "/kaggle/working/msrvtt_test_output.zip"
folders_to_zip = [
    FRAME_OUTPUT_DIR,
    AUDIO_OUTPUT_DIR,
]


def _iter_files(path):
    """Yield every file under a directory, or the path itself if it is a file.

    Paths that are neither a directory nor a regular file yield nothing.
    """
    if os.path.isdir(path):
        for root, _, files in os.walk(path):
            for file in files:
                yield os.path.join(root, file)
    elif os.path.isfile(path):
        yield path


with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for path in folders_to_zip:
        for full_path in _iter_files(path):
            # Store entries relative to the working dir so the archive keeps
            # the top-level frame/audio folder names.
            arcname = os.path.relpath(full_path, "/kaggle/working")
            zipf.write(full_path, arcname=arcname)

print(f"\n Final output : {zip_filename}")



 Final output : /kaggle/working/msrvtt_test_output.zip


In [4]:
import shutil
import zipfile

# Delete the extracted folders after zipping.
# Safety check first: the frames and audio are only expendable once the archive
# written by the zipping cell is actually on disk and recognisable as a zip.
if not zipfile.is_zipfile(zip_filename):
    raise RuntimeError(
        f"Refusing to delete extracted data: {zip_filename} is missing or is not a zip archive."
    )

# Skip folders that are already gone so re-running this cell does not fail.
for extracted_dir in (FRAME_OUTPUT_DIR, AUDIO_OUTPUT_DIR):
    if os.path.isdir(extracted_dir):
        shutil.rmtree(extracted_dir)

# Optional: Delete the missing file too
missing_file = "/kaggle/working/missing_test_videos.txt"
if os.path.exists(missing_file):
    os.remove(missing_file)

print("Deleted original folders and metadata. Only the zip remains.")


Deleted original folders and metadata. Only the zip remains.
