In [1]:
# import os
# import json
# import random
# import shutil
# from collections import defaultdict

# # --- Configuration Paths ---
# VIDEO_DIR = "/kaggle/input/ucaucf-crime-annotation-dataset/UCF_Crimes/UCF_Crimes/Videos"
# ANNOTATION_JSON_PATH = "/kaggle/input/ucaucf-crime-annotation-dataset/UCFCrime_Train.json"  # Adjust if different
# OUTPUT_DIR = "/kaggle/working/sampled_ucf_videos"
# OUTPUT_JSON_PATH =  "sampled_annotations.json"

# # Optionally, set a seed for reproducibility:
# random.seed(42)

# # --- Load the Annotations JSON ---
# with open(ANNOTATION_JSON_PATH, "r") as f:
#     annotations = json.load(f)

# # --- Organize Videos by Category ---
# # This dictionary maps category to a list of video keys from the annotations file.
# # It assumes the video names start with the category name.
# category_videos = defaultdict(list)
# available_categories = os.listdir(VIDEO_DIR)
# for video_name in annotations:
#     for category in available_categories:
#         if video_name.startswith(category):
#             category_videos[category].append(video_name)
#             break

# # --- Sampling and Copying Videos ---
# # Create output directory if it doesn't already exist.
# os.makedirs(OUTPUT_DIR, exist_ok=True)
# sampled_data = {}

# for category, videos in category_videos.items():
#     # Sample 50 videos from each category (or all videos if fewer than 50)
#     sample_count = min(50, len(videos))
#     sampled = random.sample(videos, sample_count)
    
#     for video_name in sampled:
#         # Adjust the extension if needed; here, we assume .mp4.
#         video_file = video_name + ".mp4"
#         # The source path is in a subfolder named by its category.
#         src_video_path = os.path.join(VIDEO_DIR, category, video_file)
#         # The destination is a single output folder (no subfolders).
#         dst_video_path = os.path.join(OUTPUT_DIR, video_file)
        
#         if os.path.exists(src_video_path):
#             shutil.copy2(src_video_path, dst_video_path)
#             # Save the video's metadata from the original annotations.
#             sampled_data[video_name] = annotations[video_name]
#         else:
#             print(f"Warning: Video file not found: {src_video_path}")

# # --- Save the Sampled Annotations to a New JSON File ---
# with open(OUTPUT_JSON_PATH, "w") as out_json:
#     json.dump(sampled_data, out_json, indent=4)

# print(f"✅ Sampling complete! All videos saved in: {OUTPUT_DIR}")
# print(f"✅ New JSON annotations file created at: {OUTPUT_JSON_PATH}")


In [2]:
import os
import json
import random
import shutil
from collections import defaultdict

# --- Configuration ---
# Root of the mounted Kaggle dataset; every input path below is derived from it.
DATASET_ROOT = "/kaggle/input/ucaucf-crime-annotation-dataset"

# Directory holding the video files (the sampling step looks for one
# subfolder per category inside it).
VIDEO_DIR = f"{DATASET_ROOT}/UCF_Crimes/UCF_Crimes/Videos"

# Annotation JSON files, one per dataset split.
TRAIN_JSON_PATH = f"{DATASET_ROOT}/UCFCrime_Train.json"
VAL_JSON_PATH = f"{DATASET_ROOT}/UCFCrime_Val.json"
TEST_JSON_PATH = f"{DATASET_ROOT}/UCFCrime_Test.json"

# Destination folder for the copied videos, and the file that receives the
# annotations of the sampled videos (relative to the current working directory).
OUTPUT_DIR = "/kaggle/working/sampled_ucf_videos"
OUTPUT_JSON_PATH = "sampled_annotations.json"

# Fixed seed so that the random sampling below is repeatable between runs.
SEED = 42
random.seed(SEED)

# --- Load and Combine Annotations ---
# Merge the train/val/test annotation files into one mapping. The later steps
# iterate this mapping and index it by video name.
# NOTE(review): assumes each file's top-level JSON value is an object keyed by
# video name — confirm against the dataset.
annotations = {}
for json_path in [TRAIN_JSON_PATH, VAL_JSON_PATH, TEST_JSON_PATH]:
    # JSON text is UTF-8; state the encoding explicitly rather than relying on
    # the platform's locale default.
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Keys are expected to be unique across the files. Later files still
    # override earlier ones, but report it instead of overwriting silently.
    duplicate_keys = set(annotations).intersection(data)
    if duplicate_keys:
        print(
            f"Warning: {len(duplicate_keys)} annotation key(s) in {json_path} "
            "override entries loaded from an earlier file"
        )
    annotations.update(data)

# --- Organize Videos by Category ---
# Maps each category (a subfolder name of VIDEO_DIR) to the annotated video
# names that start with that category name.
category_videos = defaultdict(list)

# Only sub-directories count as categories; a stray file in VIDEO_DIR must not
# be treated as one. Longest names are tried first so that, if one category
# name is a prefix of another, the most specific one wins. Ties are broken
# alphabetically so the result never depends on os.listdir's arbitrary order.
available_categories = sorted(
    (
        entry
        for entry in os.listdir(VIDEO_DIR)
        if os.path.isdir(os.path.join(VIDEO_DIR, entry))
    ),
    key=lambda name: (-len(name), name),
)

unmatched_videos = []
for video_name in annotations:
    for category in available_categories:
        # A video belongs to the first (i.e. longest) category that prefixes its name.
        if video_name.startswith(category):
            category_videos[category].append(video_name)
            break
    else:
        # No category folder matched; remember it so the skip is visible.
        unmatched_videos.append(video_name)

if unmatched_videos:
    print(
        f"Warning: {len(unmatched_videos)} annotated video(s) matched no "
        f"category folder in {VIDEO_DIR} and were skipped"
    )

# --- Sampling and Copying Videos ---
# Upper bound on the number of videos taken from each category.
SAMPLES_PER_CATEGORY = 50

# Create the output folder if it doesn't already exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Annotations of the videos that were actually copied, keyed by video name.
sampled_data = {}

for category, videos in category_videos.items():
    # Resolve every source path first and keep only videos whose file exists.
    # Sampling from this filtered list means a missing file no longer eats
    # into the category's quota. When no file is missing the list equals
    # `videos`, so the seeded sample is the same as before.
    available = {}
    for video_name in videos:
        # Videos are assumed to be .mp4 files inside the category subfolder.
        src_video_path = os.path.join(VIDEO_DIR, category, video_name + ".mp4")
        if os.path.exists(src_video_path):
            available[video_name] = src_video_path
        else:
            print(f"Warning: Video file not found: {src_video_path}")

    # Take the quota, or every available video if the category has fewer.
    sample_count = min(SAMPLES_PER_CATEGORY, len(available))
    sampled = random.sample(list(available), sample_count)

    for video_name in sampled:
        video_file = video_name + ".mp4"
        # All videos are copied directly into OUTPUT_DIR (no category subfolders).
        dst_video_path = os.path.join(OUTPUT_DIR, video_file)
        shutil.copy2(available[video_name], dst_video_path)
        # Carry over the video's metadata from the combined annotations.
        sampled_data[video_name] = annotations[video_name]

# --- Write the sampled annotations to their own JSON file ---
# The mapping is rendered as 4-space-indented JSON text and written in one call.
with open(OUTPUT_JSON_PATH, "w") as json_file:
    json_file.write(json.dumps(sampled_data, indent=4))

# Report where the copied videos and the new annotation file ended up.
print(
    f"✅ Sampling complete! All videos are saved in: {OUTPUT_DIR}",
    f"✅ New JSON annotations file created at: {OUTPUT_JSON_PATH}",
    sep="\n",
)


✅ Sampling complete! All videos are saved in: /kaggle/working/sampled_ucf_videos
✅ New JSON annotations file created at: sampled_annotations.json


In [3]:
# --- Re-write the sampled annotations JSON file ---
# This cell uses `json` and `sampled_data` without defining them, so it relies
# on an earlier cell having been run in the same kernel.
OUTPUT_JSON_PATH = "sampled_annotations.json"
with open(OUTPUT_JSON_PATH, "w") as json_file:
    # Render as 4-space-indented JSON text, then write it in one call.
    json_file.write(json.dumps(sampled_data, indent=4))