In [4]:
# Importing Libraries
import pandas as pd
import numpy as np
import json
import re
import os
from moviepy import VideoFileClip
from tqdm import tqdm
import yaml

### Data Prep Helper Functions

In [5]:
# Function to summarize dataset structure
def summarize_dataset_structure(base_dir="."):
    """
    Summarize how many videos exist in each class folder across train/val/test splits.
    Handles nested folder structures (e.g., class -> video -> clips).

    Example layout:
        base_dir/
            train/
                Abuse/
                    Abuse001_x264/
                        Abuse001_x264_0.mp4
                        Abuse001_x264_1.mp4
                Shoplifting/
            val/
            test/
    """
    for split in ["train", "val", "test"]:
        split_path = os.path.join(base_dir, split)
        if not os.path.exists(split_path):
            print(f"‚ö†Ô∏è Split folder not found: {split_path}")
            continue

        print(f"\nüìÇ Stats for split: {split}")
        total_videos = 0

        # Iterate over each class folder (Abuse, Shoplifting, etc.)
        for cls in sorted(os.listdir(split_path)):
            cls_path = os.path.join(split_path, cls)
            if not os.path.isdir(cls_path):
                continue

            video_count = 0

            # Go into subdirectories (e.g., Abuse001_x264)
            for root, _, files in os.walk(cls_path):
                video_files = [
                    f for f in files
                    if f.lower().endswith(('.mp4', '.avi', '.mov', '.mkv', '.npy'))
                ]
                video_count += len(video_files)

            print(f"  üóÇÔ∏è {cls}: {video_count} video clips")
            total_videos += video_count

        print(f"  ‚û§ Total video clips in '{split}': {total_videos}")

In [6]:
# Function to load UCF Crime-style JSON annotations
def load_ucf_json(json_path):
    """
    Loads a UCF Crime-style JSON annotation file and converts it
    into a flattened pandas DataFrame.

    Args:
        json_path (str): Path to the JSON annotation file.
    
    Returns:
        pd.DataFrame: Flattened DataFrame with columns:
            ['video', 'duration', 'start', 'end', 'description']
    """
    with open(json_path, "r") as f:
        data = json.load(f)

    rows = []
    for video, info in data.items():
        for (ts, sentence) in zip(info.get("timestamps", []), info.get("sentences", [])):
            rows.append({
                "video": video,
                "duration": info.get("duration", None),
                "start": ts[0],
                "end": ts[1],
                "description": sentence
            })

    df = pd.DataFrame(rows)
    return df

In [7]:
def add_clip_paths(df):
    """
    Adds a 'clip_path' column based only on DataFrame columns:
    folder/video_basename/video_basename_i.mp4

    Args:
        df (pd.DataFrame): must contain 'folder' and 'video' columns.

    Returns:
        pd.DataFrame: with an extra column 'clip_path'
    """
    df = df.copy()

    # Remove .mp4 extension from video to get folder/video base
    df["video_base"] = df["video"].str.replace(".mp4", "", regex=False)

    # Get index per unique (folder, video) group ‚Äî i = 0, 1, 2, ...
    df["clip_idx"] = df.groupby(["folder", "video"]).cumcount()

    # Construct the relative path
    df["clip_path"] = df.apply(
        lambda row: f"{row['folder']}/{row['video_base']}/{row['video_base']}_{row['clip_idx']}.mp4",
        axis=1
    )

    df.drop(['video_base', 'clip_idx'], axis=1, inplace=True)

    return df

### Loading UCA Annotations

In [None]:
train_file = "../Surveillance-Video-Understanding-main/ucf-annotation/json/UCFCrime_Train.json"
test_file = "../Surveillance-Video-Understanding-main/ucf-annotation/json/UCFCrime_Test.json"
val_file = "../Surveillance-Video-Understanding-main/ucf-annotation/json/UCFCrime_Val.json"

In [None]:
train_df = load_ucf_json(train_file)
test_df = load_ucf_json(test_file)
val_df = load_ucf_json(val_file)

In [None]:
train_df['folder'] = train_df['video'].apply(lambda x: re.match(r'([A-Za-z]+)', x).group(1) if re.match(r'([A-Za-z]+)', x) else None)
test_df['folder'] = test_df['video'].apply(lambda x: re.match(r'([A-Za-z]+)', x).group(1) if re.match(r'([A-Za-z]+)', x) else None)
val_df['folder'] = val_df['video'].apply(lambda x: re.match(r'([A-Za-z]+)', x).group(1) if re.match(r'([A-Za-z]+)', x) else None)

In [136]:
train_df['video'] = train_df['video']+".mp4"
test_df['video'] = test_df['video']+".mp4"
val_df['video'] = val_df['video']+".mp4"

In [137]:
train_df = add_clip_paths(train_df)
train_df[train_df["video"] == "Abuse001_x264.mp4"][["folder", "video", "clip_path"]]

Unnamed: 0,folder,video,clip_path
0,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_0.mp4
1,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_1.mp4
2,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_2.mp4
3,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_3.mp4
4,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_4.mp4
5,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_5.mp4
6,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_6.mp4
7,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_7.mp4
8,Abuse,Abuse001_x264.mp4,Abuse/Abuse001_x264/Abuse001_x264_8.mp4


In [138]:
test_df = add_clip_paths(test_df)
val_df = add_clip_paths(val_df)

In [139]:
train_df['split'] = 'train'
test_df['split'] = 'test'
val_df['split'] = 'val'

all_annotations = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Expringting all annotations to CSV
all_annotations.to_csv("../uca-dataset/uca_annotations.csv", index=False)

### Preparing UCA Dataset with Clipped Videos

In [11]:
def clip_videos_from_df(df, source_dir, output_dir):
    """
    Clips videos using MoviePy 2.x API based on 'start' and 'end' times in the DataFrame.
    Displays a progress bar and only logs failures or invalid clips.
    """
    os.makedirs(output_dir, exist_ok=True)
    total_videos = df['video'].nunique()

    print(f"üé¨ Starting video clipping for {total_videos} unique videos ({len(df)} total clips)...\n")

    # Group videos first
    grouped_videos = list(df.groupby(["folder", "video"]))

    # tqdm progress bar
    for (folder, video_name), group in tqdm(grouped_videos, desc="Processing videos", unit="video"):
        src_path = os.path.join(source_dir, folder, video_name)
        base_name, ext = os.path.splitext(video_name)
        dest_subdir = os.path.join(output_dir, folder, base_name)
        os.makedirs(dest_subdir, exist_ok=True)

        if not os.path.exists(src_path):
            print(f"‚ö†Ô∏è Missing source video: {src_path}")
            continue

        try:
            video = VideoFileClip(src_path)
            for i, row in enumerate(group.itertuples(index=False)):
                start = float(row.start)
                end = min(float(row.end), video.duration)

                # Skip invalid segments
                if end <= start:
                    print(f"‚è© Skipping invalid segment ({start:.2f}-{end:.2f}) in {video_name}")
                    continue

                clip = video.subclipped(start, end)
                dest_path = os.path.join(dest_subdir, f"{base_name}_{i}.mp4")

                # Silent writing (no MoviePy output spam)
                clip.write_videofile(dest_path, audio=False, logger=None)

            video.close()

        except Exception as e:
            print(f"‚ùå Error processing {src_path}: {e}")

    print("\n‚úÖ All videos processed successfully!")

In [89]:
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'

clip_videos_from_df(
    df=train_df,
    source_dir=source_dir,         
    output_dir=destination_dir+"/train"
)

üé¨ Starting video clipping for 1165 unique videos (15677 total clips)...



Processing videos:   0%|          | 0/1165 [00:00<?, ?video/s]

‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse001_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse002_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse003_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse004_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse005_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse006_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse007_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse008_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse009_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse010_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse011_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse012_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse013_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse014_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse015_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Abuse/Abuse016_x264.mp4
‚ö†Ô∏è Missing source vi

Processing videos:   4%|‚ñç         | 48/1165 [00:18<07:19,  2.54video/s]

‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest013_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest014_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest015_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest016_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest017_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest018_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest019_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest020_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest021_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest022_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest023_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest024_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest025_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest026_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest027_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Arrest/Arrest028_x

Processing videos:  29%|‚ñà‚ñà‚ñâ       | 341/1165 [01:10<02:43,  5.03video/s]

‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos137_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos138_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos139_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos140_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos141_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos142_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos143_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos144_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos145_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos146_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos147_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos148_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos149_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos151_x264.mp4
‚ö†Ô∏è Missing sourc

Processing videos:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 489/1165 [08:35<14:39,  1.30s/video]

‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos309_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos311_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos313_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos314_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos315_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos316_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos318_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos319_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos320_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos321_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos322_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos323_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos324_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos325_x264.mp4
‚ö†Ô∏è Missing sourc

Processing videos:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 568/1165 [09:03<10:46,  1.08s/video]

‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos397_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos398_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos399_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos400_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos402_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos403_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos404_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos405_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos406_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos407_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos408_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos409_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos410_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Normal/Normal_Videos411_x264.mp4
‚ö†Ô∏è Missing sourc

Processing videos:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 1016/1165 [09:04<00:56,  2.64video/s]

‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting009_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting010_x264.mp4


Processing videos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1165/1165 [09:06<00:00,  2.13video/s]

‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting012_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting013_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting014_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting015_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting017_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting018_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting019_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting020_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting021_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting022_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting023_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting024_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting025_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting026_x264.mp4
‚ö†Ô∏è Missing source video: ../Redo/Shooting/Shooting027_x264




In [31]:
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'

clip_videos_from_df(
    df=val_df,
    source_dir=source_dir,         
    output_dir=destination_dir+"/val"
)

üé¨ Starting video clipping for 379 unique videos (3534 total clips)...



Processing videos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 379/379 [13:44<00:00,  2.18s/video]


‚úÖ All videos processed successfully!





In [32]:
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'

clip_videos_from_df(
    df=test_df,
    source_dir=source_dir,         
    output_dir=destination_dir+"/test"
)

üé¨ Starting video clipping for 310 unique videos (4331 total clips)...



Processing videos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 310/310 [15:58<00:00,  3.09s/video]


‚úÖ All videos processed successfully!





#### Validating Data Prep

In [12]:
def check_for_duplicates(df, subset_cols=["video", "start", "end", "description"]):
    """
    Checks for duplicate video segments based on selected columns.
    Prints how many duplicates exist and which videos have them.
    """
    duplicated_rows = df[df.duplicated(subset=subset_cols, keep=False)]
    total_dupes = len(duplicated_rows)

    if total_dupes == 0:
        print("‚úÖ No duplicates found.")
        return

    print(f"üìä Found {total_dupes} duplicate rows based on {subset_cols}")

    # Count duplicates per video
    dupe_counts = (
        duplicated_rows.groupby(["video", "start", "end"])
        .size()
        .sort_values(ascending=False)
        .reset_index(name="duplicate_count")
    )

    print("\nüéûÔ∏è Videos with duplicate timestamps:")
    display(dupe_counts.head(10))  # show top 10 by default

    return duplicated_rows

In [None]:
check_for_duplicates(test_df)

In [55]:
check_for_duplicates(val_df)

‚úÖ No duplicates found.


In [59]:
duplicated_rows = check_for_duplicates(train_df)
# These duplicates are present in th original transcripts as well. Leaving them for now

üìä Found 14 duplicate rows based on ['video', 'start', 'end', 'description']

üéûÔ∏è Videos with duplicate timestamps:


Unnamed: 0,video,start,end,duplicate_count
0,Arrest012_x264.mp4,58.5,64.3,2
1,Normal_Videos136_x264.mp4,550.2,558.5,2
2,Normal_Videos308_x264.mp4,31095.5,31150.2,2
3,Normal_Videos396_x264.mp4,2087.6,2095.5,2
4,Shooting008_x264.mp4,3.4,9.4,2
5,Shooting011_x264.mp4,101.6,111.6,2
6,Shooting011_x264.mp4,123.9,128.9,2


In [100]:
# This aligns perfectly with the UCA paper table 3
destination_dir = "../uca-dataset"
summarize_dataset_structure(base_dir=destination_dir)


üìÇ Stats for split: train
  üóÇÔ∏è Abuse: 511 video clips
  üóÇÔ∏è Arrest: 420 video clips
  üóÇÔ∏è Arson: 123 video clips
  üóÇÔ∏è Assault: 191 video clips
  üóÇÔ∏è Burglary: 716 video clips
  üóÇÔ∏è Explosion: 207 video clips
  üóÇÔ∏è Fighting: 419 video clips
  üóÇÔ∏è Normal: 10276 video clips
  üóÇÔ∏è RoadAccidents: 441 video clips
  üóÇÔ∏è Robbery: 722 video clips
  üóÇÔ∏è Shooting: 378 video clips
  üóÇÔ∏è Shoplifting: 470 video clips
  üóÇÔ∏è Stealing: 562 video clips
  üóÇÔ∏è Vandalism: 241 video clips
  ‚û§ Total video clips in 'train': 15677

üìÇ Stats for split: val
  üóÇÔ∏è Abuse: 108 video clips
  üóÇÔ∏è Arrest: 87 video clips
  üóÇÔ∏è Arson: 53 video clips
  üóÇÔ∏è Assault: 37 video clips
  üóÇÔ∏è Burglary: 152 video clips
  üóÇÔ∏è Explosion: 78 video clips
  üóÇÔ∏è Fighting: 130 video clips
  üóÇÔ∏è Normal: 2254 video clips
  üóÇÔ∏è RoadAccidents: 44 video clips
  üóÇÔ∏è Robbery: 199 video clips
  üóÇÔ∏è Shooting: 29 video clips
  üóÇÔ∏è Sho

### YAML File Creation - Fine Tune SwinBERT

In [None]:
def create_yaml_files(df, output_dir="../UCA-Dataset/w-captions/"):
    """
    Converts UCA annotations CSV into SwinBERT-compatible YAML files for train/val/test.

    Args:
        csv_path (str): Path to CSV file with columns ['clip_path', 'description', 'split'].
        output_dir (str): Folder where YAMLs will be saved.
    """

    os.makedirs(output_dir, exist_ok=True)

    # Validate required columns
    required = {"clip_path", "description", "split"}
    if not required.issubset(df.columns):
        raise ValueError(f"CSV must contain columns: {required}")

    # Generate YAML for each split
    for split in df["split"].unique():
        split_df = df[df["split"] == split]
        split_yaml = {
            # This is based on file structure in Cloud GPU Instance
            f"{split}_videos": [os.path.join("../UCA-Dataset/w-captions", p) for p in split_df["clip_path"]],
            "captions": split_df["description"].tolist()
        }

        out_path = os.path.join(output_dir, f"{split}.yaml")
        with open(out_path, "w") as f:
            yaml.dump(split_yaml, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

        print(f"‚úÖ Saved: {out_path} ({len(split_df)} entries)")

In [None]:
all_annotations = pd.read_csv("../UCA-Dataset/uca_annotations.csv")

In [146]:
all_annotations['description'] = all_annotations['description'].str.replace('\n', ' ').str.strip()

In [147]:
create_yaml_files(all_annotations)

‚úÖ Saved: ../uca-dataset/train.yaml (15677 entries)
‚úÖ Saved: ../uca-dataset/test.yaml (4331 entries)
‚úÖ Saved: ../uca-dataset/val.yaml (3534 entries)


### Aligning TEVAD data based on UCF to avoid data leakage

In [16]:
uca_split = pd.read_csv("../../uca_annotations.csv")

In [17]:
train_uca = uca_split[(uca_split['split'] =='train') | (uca_split['split'] == 'val')]['video'].unique()
test_uca = uca_split[(uca_split['split'] !='train') & (uca_split['split'] != 'val')]['video'].unique()

In [19]:
# Original UCF
train_ucf = pd.read_csv("../../Anomaly_Train_org.txt", header=None,
    names=["path"])

train_ucf['video'] = train_ucf['path'].str.split('/').str[1]

test_ucf = pd.read_csv("../../Anomaly_Test_org.txt", header=None,
    names=["path"])

test_ucf['video'] = test_ucf['path'].str.split('/').str[1]

In [20]:
len(set(train_uca).difference(set(train_ucf['video'].unique())))

236

In [21]:
uca_split['npy_name'] = uca_split['video'].str.replace('.mp4', '_i3d.npy')

  uca_split['npy_name'] = uca_split['video'].str.replace('.mp4', '_i3d.npy')


In [37]:
# Step 1 moving all files to all central location
src_dir = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1"
dest_dir = "../TEVAD/save/Crime/UCF_all_i3d"

subfolders = ["UCF_Train_ten_crop_i3d", "UCF_Test_ten_crop_i3d"]

for sub in subfolders:
    sub_path = os.path.join(src_dir, sub)
    
    for file in os.listdir(sub_path):
        src_path = os.path.join(sub_path, file)
        dst_path = os.path.join(dest_dir, file)

        if os.path.isfile(src_path):
            shutil.move(src_path, dst_path)

In [47]:
src_dir = "../TEVAD/save/Crime/UCF_all_i3d"
train_dir = "../TEVAD/save/Crime/UCF_Train_ten_crop_i3d"
test_dir = "../TEVAD/save/Crime/UCF_Test_ten_crop_i3d"

In [51]:
df = uca_split[['npy_name', 'split']].copy()
df.drop_duplicates(inplace = True)
df.reset_index(drop = True, inplace = True)

In [52]:
df.shape

(1854, 2)

In [53]:
for _, row in df.iterrows():
    filename = row['npy_name']
    split = row['split']

    src_path = os.path.join(src_dir, filename)

    # Determine destination
    if split in ['train', 'val']:
        dst_path = os.path.join(train_dir, filename)
    else:  # test
        dst_path = os.path.join(test_dir, filename)

    # Move if exists
    if os.path.exists(src_path):
        shutil.move(src_path, dst_path)
    else:
        print(f"Missing file: {filename}")

Missing file: Abuse001_x264_i3d.npy
Missing file: Abuse002_x264_i3d.npy
Missing file: Abuse003_x264_i3d.npy
Missing file: Abuse004_x264_i3d.npy
Missing file: Abuse005_x264_i3d.npy
Missing file: Abuse006_x264_i3d.npy
Missing file: Abuse007_x264_i3d.npy
Missing file: Abuse008_x264_i3d.npy
Missing file: Abuse009_x264_i3d.npy
Missing file: Abuse010_x264_i3d.npy
Missing file: Abuse011_x264_i3d.npy
Missing file: Abuse012_x264_i3d.npy
Missing file: Abuse013_x264_i3d.npy
Missing file: Abuse014_x264_i3d.npy
Missing file: Abuse015_x264_i3d.npy
Missing file: Abuse016_x264_i3d.npy
Missing file: Abuse017_x264_i3d.npy
Missing file: Abuse018_x264_i3d.npy
Missing file: Abuse019_x264_i3d.npy
Missing file: Abuse020_x264_i3d.npy
Missing file: Abuse021_x264_i3d.npy
Missing file: Abuse022_x264_i3d.npy
Missing file: Abuse023_x264_i3d.npy
Missing file: Abuse024_x264_i3d.npy
Missing file: Abuse025_x264_i3d.npy
Missing file: Abuse026_x264_i3d.npy
Missing file: Abuse027_x264_i3d.npy
Missing file: Abuse028_x264_

In [22]:
train_dest_dir = "../TEVAD/save/Crime/UCF_Train_ten_crop_i3d"
test_dest_dir = "../TEVAD/save/Crime/UCF_Test_ten_crop_i3d"

In [None]:
# Checking if we missed any files
train_df = uca_split[['npy_name', 'split']][(uca_split['split'] =='train') | (uca_split['split'] == 'val')].copy()
train_df.drop_duplicates(inplace = True)
train_df.reset_index(drop = True, inplace = True)

expected = set(train_df['npy_name'])
actual = set(os.listdir(train_dest_dir))
missing_files = expected - actual

train_rem = train_df[train_df['npy_name'].isin(missing_files)]
train_rem

In [None]:
test_df = uca_split[['npy_name', 'split']][(uca_split['split'] =='test')].copy()
test_df.drop_duplicates(inplace = True)
test_df.reset_index(drop = True, inplace = True)

expected = set(test_df['npy_name'])
actual = set(os.listdir(test_dest_dir))
missing_files = expected - actual

test_rem = test_df[test_df['npy_name'].isin(missing_files)]
test_rem

In [88]:
# Above missing files due to redundancies in uca-split (annotations)

### Creating Train/Test List and Ground Truth

In [26]:
train_base_dir = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1/UCF_Train_ten_crop_i3d"
test_base_dir = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1/UCF_Test_ten_crop_i3d"

In [27]:
train_df['is_normal'] = train_df['npy_name'].str.startswith("Normal_Videos")

train_df = train_df.sort_values(by='is_normal', ascending=True)
train_df.reset_index(drop = True, inplace = True)

train_df['full_path'] = train_base_dir + "/" + train_df['npy_name']

output_file = "../TEVAD/list/ucf-i3d.list"

with open(output_file, "w") as f:
    for path in train_df['full_path']:
        f.write(path + "\n")

In [28]:
train_df[train_df['npy_name']== 'Shoplifting048_x264_i3d.npy']

Unnamed: 0,npy_name,split,is_normal,full_path
355,Shoplifting048_x264_i3d.npy,val,False,../TEVAD/save/Crime/UCF_ten_crop_i3d_v1/UCF_Tr...


In [66]:
train_df['is_normal'].value_counts()

True     806
False    738
Name: is_normal, dtype: int64

In [31]:
test_df['is_normal'] = test_df['npy_name'].str.startswith("Normal_Videos")

test_df = test_df.sort_values(by='is_normal', ascending=True)
test_df.reset_index(drop = True, inplace = True)

test_df['full_path'] = test_base_dir + "/" + test_df['npy_name']

output_file = "../TEVAD/list/ucf-i3d-test.list"

with open(output_file, "w") as f:
    for path in train_df['full_path']:
        f.write(path + "\n")

In [32]:
test_df['is_normal'].value_counts()

False    206
True     104
Name: is_normal, dtype: int64

In [105]:
gt_file = np.load("../TEVAD/list/gt-ucf.npy")
# Not required

### Removing Extra Embeddings

In [33]:
uca_split['emb_name'] = uca_split['video'].str.replace('.mp4', '_emb.npy')

  uca_split['emb_name'] = uca_split['video'].str.replace('.mp4', '_emb.npy')


In [34]:
temp= uca_split[['emb_name', 'split']].copy()
temp.drop_duplicates(inplace = True)
temp.reset_index(drop = True, inplace = True)

In [35]:
emb_dir = "../TEVAD/save/Crime/sent_emb_n"

expected = set(temp['emb_name'])
actual = set(os.listdir(emb_dir))

extra_files = actual-expected

In [36]:
for f in extra_files:
    file_path = os.path.join(emb_dir, f)
    
    if os.path.isfile(file_path):
        os.remove(file_path)
        print(f"Deleted: {f}")
    else:
        print(f"Skipped (not a file): {f}")