In [24]:
# Importing Libraries
import pandas as pd
import numpy as np
import json
import re
import os
from moviepy import VideoFileClip
from tqdm import tqdm
import yaml
import shutil

### Data Prep Helper Functions

In [15]:
# Function to summarize dataset structure
def summarize_dataset_structure(
    base_dir=".",
    splits=("train", "val", "test"),
    extensions=('.mp4', '.avi', '.mov', '.mkv', '.npy'),
):
    """
    Print how many clip files exist in each class folder of every dataset split.

    Class folders are walked recursively, so nested folder structures
    (e.g., class -> video -> clips) are handled.

    Example layout:
        base_dir/
            train/
                Abuse/
                    Abuse001_x264/
                        Abuse001_x264_0.mp4
                        Abuse001_x264_1.mp4
                Shoplifting/
            val/
            test/

    Args:
        base_dir (str): Directory that contains the split folders.
        splits (iterable of str): Split folder names to report on, in order.
        extensions (iterable of str): File suffixes counted as clips
            (matched case-insensitively).

    Returns:
        None. The summary is printed to stdout.
    """
    # str.endswith needs a tuple; lower-casing keeps the match case-insensitive
    suffixes = tuple(ext.lower() for ext in extensions)

    for split in splits:
        split_path = os.path.join(base_dir, split)
        # isdir (not exists): a stray file named like a split would break os.listdir
        if not os.path.isdir(split_path):
            print(f"‚ö†Ô∏è Split folder not found: {split_path}")
            continue

        print(f"\nüìÇ Stats for split: {split}")
        total_videos = 0

        # Iterate over each class folder (Abuse, Shoplifting, etc.)
        for cls in sorted(os.listdir(split_path)):
            cls_path = os.path.join(split_path, cls)
            if not os.path.isdir(cls_path):
                continue

            # Count matching files at any depth below the class folder
            video_count = sum(
                1
                for _, _, files in os.walk(cls_path)
                for f in files
                if f.lower().endswith(suffixes)
            )

            print(f"  üóÇÔ∏è {cls}: {video_count} video clips")
            total_videos += video_count

        print(f"  ‚û§ Total video clips in '{split}': {total_videos}")

In [16]:
# Function to load UCF Crime-style JSON annotations
def load_ucf_json(json_path):
    """
    Loads a UCF Crime-style JSON annotation file and converts it
    into a flattened pandas DataFrame with one row per annotated segment.

    The file maps each video name to a dict with optional keys
    'duration', 'timestamps' (list of [start, end]) and 'sentences'.
    Timestamps and sentences are paired positionally; if the two lists
    differ in length the surplus entries are ignored.

    Args:
        json_path (str): Path to the JSON annotation file.

    Returns:
        pd.DataFrame: Flattened DataFrame with columns:
            ['video', 'duration', 'start', 'end', 'description'].
            The columns are present even when the file has no segments.
    """
    columns = ["video", "duration", "start", "end", "description"]

    # Explicit UTF-8 so the result does not depend on the platform's locale
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = [
        {
            "video": video,
            "duration": info.get("duration"),
            "start": ts[0],
            "end": ts[1],
            "description": sentence,
        }
        for video, info in data.items()
        for ts, sentence in zip(info.get("timestamps", []), info.get("sentences", []))
    ]

    # Passing columns keeps the schema stable for an empty annotation file
    return pd.DataFrame(rows, columns=columns)

In [17]:
def add_clip_paths(df):
    """
    Adds a 'clip_path' column based only on DataFrame columns:
    folder/video_basename/video_basename_i.mp4

    ``i`` is the 0-based position of the row within its (folder, video)
    group, in the DataFrame's current row order. Only a trailing ".mp4" is
    removed to obtain the basename.

    Args:
        df (pd.DataFrame): must contain 'folder' and 'video' columns.
            It is not modified; an empty frame is accepted.

    Returns:
        pd.DataFrame: copy of ``df`` with an extra column 'clip_path'
    """
    df = df.copy()

    folder = df["folder"].astype(str)

    # Strip only the extension at the end of the name, not every ".mp4" substring
    video_base = df["video"].astype(str).str.replace(r"\.mp4$", "", regex=True)

    # Index per unique (folder, video) group: i = 0, 1, 2, ...
    clip_idx = df.groupby(["folder", "video"]).cumcount().astype(str)

    # Vectorised concatenation; unlike a row-wise apply it also works on an empty frame
    df["clip_path"] = folder + "/" + video_base + "/" + video_base + "_" + clip_idx + ".mp4"

    return df

### Loading UCA Annotations

In [None]:
# The three split files share one annotation directory.
annotation_dir = "../Surveillance-Video-Understanding-main/ucf-annotation/json"

train_file = f"{annotation_dir}/UCFCrime_Train.json"
test_file = f"{annotation_dir}/UCFCrime_Test.json"
val_file = f"{annotation_dir}/UCFCrime_Val.json"

In [None]:
# Flatten each annotation file into one row per (timestamp, sentence) pair.
train_df, test_df, val_df = (
    load_ucf_json(split_file) for split_file in (train_file, test_file, val_file)
)

In [None]:
def _leading_letters(video_name):
    """Return the run of ASCII letters at the start of ``video_name``, or None if there is none."""
    prefix = re.match(r'([A-Za-z]+)', video_name)
    return prefix.group(1) if prefix else None


# The leading letters of the video name are used as its folder.
for _split_df in (train_df, test_df, val_df):
    _split_df['folder'] = _split_df['video'].apply(_leading_letters)

In [None]:
# Append ".mp4" so the names match the file names used when clipping.
# Only names that do not already end in ".mp4" are touched, so re-running this
# cell does not produce "name.mp4.mp4".
for _split_df in (train_df, test_df, val_df):
    _needs_ext = ~_split_df['video'].str.endswith(".mp4", na=False)
    _split_df.loc[_needs_ext, 'video'] = _split_df.loc[_needs_ext, 'video'] + ".mp4"

In [None]:
train_df = add_clip_paths(train_df)

# Spot-check the generated paths for a single video.
is_abuse001 = train_df["video"] == "Abuse001_x264.mp4"
train_df.loc[is_abuse001, ["folder", "video", "clip_path"]]

In [None]:
# Same clip-path construction for the remaining two splits.
test_df, val_df = (add_clip_paths(split_df) for split_df in (test_df, val_df))

In [None]:
# Tag every row with the split it came from before merging.
train_df['split'] = 'train'
test_df['split'] = 'test'
val_df['split'] = 'val'

all_annotations = pd.concat([train_df, test_df, val_df], ignore_index=True)

# Exporting all annotations to CSV.
# The clipping cells that create "../uca-dataset" run later in the notebook,
# so create the folder here to keep a fresh top-to-bottom run working.
annotations_csv = "../uca-dataset/uca_annotations.csv"
os.makedirs(os.path.dirname(annotations_csv), exist_ok=True)
all_annotations.to_csv(annotations_csv, index=False)

### Preparing UCA Dataset with Clipped Videos

In [None]:
def clip_videos_from_df(df, source_dir, output_dir):
    """
    Clips videos using MoviePy 2.x API based on 'start' and 'end' times in the DataFrame.
    Displays a progress bar and only logs failures or invalid clips.

    Rows are grouped by ('folder', 'video'). The i-th row of a group is written to
    ``output_dir/<folder>/<video stem>/<video stem>_<i>.mp4``; a skipped row keeps
    its index, so later clips are not renumbered. 'end' is clamped to the video
    duration, and segments whose end is not after their start are skipped.

    Args:
        df (pd.DataFrame): must contain 'folder', 'video', 'start' and 'end' columns.
        source_dir (str): root folder holding ``<folder>/<video>`` source files.
        output_dir (str): root folder under which the clips are written.

    Returns:
        None. Missing or failing videos are reported on stdout and do not stop the run.
    """
    os.makedirs(output_dir, exist_ok=True)
    total_videos = df['video'].nunique()

    print(f"üé¨ Starting video clipping for {total_videos} unique videos ({len(df)} total clips)...\n")

    # Group videos first
    grouped_videos = list(df.groupby(["folder", "video"]))

    # Videos that were missing or raised, so the final message can be truthful
    problem_count = 0

    # tqdm progress bar
    for (folder, video_name), group in tqdm(grouped_videos, desc="Processing videos", unit="video"):
        src_path = os.path.join(source_dir, folder, video_name)

        # Check the source before creating anything, so missing videos leave no empty folder
        if not os.path.exists(src_path):
            print(f"‚ö†Ô∏è Missing source video: {src_path}")
            problem_count += 1
            continue

        base_name, _ = os.path.splitext(video_name)
        dest_subdir = os.path.join(output_dir, folder, base_name)
        os.makedirs(dest_subdir, exist_ok=True)

        try:
            video = VideoFileClip(src_path)
            try:
                for i, row in enumerate(group.itertuples(index=False)):
                    start = float(row.start)
                    end = min(float(row.end), video.duration)

                    # Skip invalid segments
                    if end <= start:
                        print(f"‚è© Skipping invalid segment ({start:.2f}-{end:.2f}) in {video_name}")
                        continue

                    clip = video.subclipped(start, end)
                    dest_path = os.path.join(dest_subdir, f"{base_name}_{i}.mp4")

                    # Silent writing (no MoviePy output spam)
                    clip.write_videofile(dest_path, audio=False, logger=None)
            finally:
                # Release the source video even when one of its clips fails
                video.close()

        except Exception as e:
            print(f"‚ùå Error processing {src_path}: {e}")
            problem_count += 1

    if problem_count == 0:
        print("\n‚úÖ All videos processed successfully!")
    else:
        print(f"\nFinished with {problem_count} video(s) missing or failed; see messages above.")

In [None]:
# Cut the training-split segments into individual clips.
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'
train_clip_dir = f"{destination_dir}/train"

clip_videos_from_df(train_df, source_dir, train_clip_dir)

In [None]:
# Cut the validation-split segments into individual clips.
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'
val_clip_dir = f"{destination_dir}/val"

clip_videos_from_df(val_df, source_dir, val_clip_dir)

In [None]:
# Cut the test-split segments into individual clips.
source_dir = '../original-ucf/Videos'
destination_dir = '../uca-dataset'
test_clip_dir = f"{destination_dir}/test"

clip_videos_from_df(test_df, source_dir, test_clip_dir)

#### Validating Data Prep

In [None]:
def check_for_duplicates(df, subset_cols=["video", "start", "end", "description"]):
    """
    Report rows of ``df`` that share the same values in ``subset_cols``.

    Every member of a duplicate group is included, not only the repeats.
    When duplicates exist, the ten most frequent (video, start, end)
    combinations among them are displayed.

    Returns:
        pd.DataFrame with all duplicated rows, or None when there are none.
    """
    is_duplicate = df.duplicated(subset=subset_cols, keep=False)
    duplicated_rows = df[is_duplicate]
    total_dupes = len(duplicated_rows)

    if total_dupes > 0:
        print(f"üìä Found {total_dupes} duplicate rows based on {subset_cols}")

        # How often each (video, start, end) segment occurs among the duplicated rows
        segment_sizes = duplicated_rows.groupby(["video", "start", "end"]).size()
        ranked_sizes = segment_sizes.sort_values(ascending=False)
        dupe_counts = ranked_sizes.reset_index(name="duplicate_count")

        print("\nüéûÔ∏è Videos with duplicate timestamps:")
        display(dupe_counts.head(10))  # top 10 only

        return duplicated_rows

    print("‚úÖ No duplicates found.")
    return None

In [None]:
# Look for repeated (video, start, end, description) rows in the test split.
check_for_duplicates(test_df)

In [None]:
# Look for repeated (video, start, end, description) rows in the validation split.
check_for_duplicates(val_df)

In [None]:
# Keep the duplicated training rows so they can be inspected.
duplicated_rows = check_for_duplicates(train_df)
# These duplicates are present in the original transcripts as well. Leaving them for now

In [None]:
# Print the per-class clip counts of every split in the clipped dataset.
# This aligns perfectly with the UCA paper table 3
destination_dir = "../uca-dataset"
summarize_dataset_structure(base_dir=destination_dir)

### YAML File Creation - Fine Tune SwinBERT

In [None]:
def create_yaml_files(df, output_dir="../UCA-Dataset/w-captions/", video_root="../UCA-Dataset/w-captions"):
    """
    Converts UCA annotations into SwinBERT-compatible YAML files, one per split.

    Each ``<split>.yaml`` holds two parallel lists: ``<split>_videos`` (every
    clip path prefixed with ``video_root``) and ``captions`` (the matching
    descriptions).

    Args:
        df (pd.DataFrame): Annotations with columns ['clip_path', 'description', 'split'].
        output_dir (str): Folder where YAMLs will be saved.
        video_root (str): Prefix joined onto every clip path. The default is based
            on the file structure in the Cloud GPU Instance.

    Raises:
        ValueError: If a required column is missing. Nothing is created in that case.
    """
    # Validate required columns before touching the filesystem
    required = {"clip_path", "description", "split"}
    missing = required.difference(df.columns)
    if missing:
        raise ValueError(
            f"DataFrame must contain columns: {sorted(required)}; missing: {sorted(missing)}"
        )

    os.makedirs(output_dir, exist_ok=True)

    # Generate YAML for each split
    for split in df["split"].unique():
        split_df = df[df["split"] == split]
        split_yaml = {
            f"{split}_videos": [os.path.join(video_root, p) for p in split_df["clip_path"]],
            "captions": split_df["description"].tolist()
        }

        out_path = os.path.join(output_dir, f"{split}.yaml")
        # allow_unicode=True emits raw non-ASCII characters, so the file must be UTF-8
        with open(out_path, "w", encoding="utf-8") as f:
            yaml.dump(split_yaml, f, default_flow_style=False, sort_keys=False, allow_unicode=True)

        print(f"‚úÖ Saved: {out_path} ({len(split_df)} entries)")

In [None]:
# Reload the exported annotations from disk.
# NOTE(review): the export cell in this notebook writes to "../uca-dataset/uca_annotations.csv";
# this differently-cased "../UCA-Dataset/..." path only resolves to that file on a
# case-insensitive filesystem — confirm which folder is intended.
all_annotations = pd.read_csv("../UCA-Dataset/uca_annotations.csv")

In [None]:
# Put every caption on a single line and trim surrounding whitespace.
all_annotations['description'] = (
    all_annotations['description']
    .str.replace('\n', ' ', regex=False)
    .str.strip()
)

In [None]:
# Write one YAML file per split using the function's default output folder.
create_yaml_files(all_annotations)

### Aligning TEVAD data based on UCF to avoid data leakage

In [18]:
# Load the UCA annotations; their 'split' column decides train/val/test membership below.
uca_split = pd.read_csv("../../uca_annotations.csv")

In [19]:
# UCA 'train' and 'val' videos are pooled together; every other row counts as test.
in_train_or_val = uca_split['split'].isin(['train', 'val'])

train_uca = uca_split.loc[in_train_or_val, 'video'].unique()
test_uca = uca_split.loc[~in_train_or_val, 'video'].unique()

In [20]:
# Original UCF
def _read_ucf_list(list_path):
    """Read a one-entry-per-line list file into a 'path' column and add a 'video'
    column holding the second '/'-separated component of each path."""
    listing = pd.read_csv(list_path, header=None, names=["path"])
    listing['video'] = listing['path'].str.split('/').str[1]
    return listing


train_ucf = _read_ucf_list("../../Anomaly_Train_org.txt")
test_ucf = _read_ucf_list("../../Anomaly_Test_org.txt")

In [21]:
# Number of UCA train/val videos that do not appear in the original UCF training list.
len(set(train_uca).difference(set(train_ucf['video'].unique())))

236

In [22]:
# File name of the "_i3d.npy" feature file belonging to each video.
# regex=False matches ".mp4" literally: in regex mode (the default before pandas 2.0)
# the dot is a wildcard.
uca_split['npy_name'] = uca_split['video'].str.replace('.mp4', '_i3d.npy', regex=False)

In [25]:
# Step 1: move all files to one central location
src_dir = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1"
dest_dir = "../TEVAD/save/Crime/UCF_all_i3d"

subfolders = ["UCF_Train_ten_crop_i3d", "UCF_Test_ten_crop_i3d"]

# The pooled folder may not exist yet, and shutil.move will not create it.
os.makedirs(dest_dir, exist_ok=True)

for sub in subfolders:
    sub_path = os.path.join(src_dir, sub)

    for file in os.listdir(sub_path):
        src_path = os.path.join(sub_path, file)
        dst_path = os.path.join(dest_dir, file)

        # Only plain files are moved; nested folders are left where they are
        if os.path.isfile(src_path):
            shutil.move(src_path, dst_path)

In [31]:
# Pooled source folder and the two split folders the files are moved back into.
crime_root = "../TEVAD/save/Crime"

src_dir = f"{crime_root}/UCF_all_i3d"
train_dir = f"{crime_root}/UCF_ten_crop_i3d_v1/UCF_Train_ten_crop_i3d"
test_dir = f"{crime_root}/UCF_ten_crop_i3d_v1/UCF_Test_ten_crop_i3d"

In [32]:
# One row per distinct (feature file, split) pair.
df = (
    uca_split[['npy_name', 'split']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [33]:
# (rows, columns) of the de-duplicated npy_name/split table.
df.shape

(1854, 2)

In [34]:
# Move every feature file from the pooled folder into the folder matching its UCA split.
for filename, split in zip(df['npy_name'], df['split']):
    src_path = os.path.join(src_dir, filename)

    # 'train' and 'val' share the train folder; anything else is treated as test
    target_dir = train_dir if split in ('train', 'val') else test_dir
    dst_path = os.path.join(target_dir, filename)

    # Report files that are not in the pooled folder instead of failing
    if not os.path.exists(src_path):
        print(f"Missing file: {filename}")
        continue

    shutil.move(src_path, dst_path)

In [35]:
# Folders that are checked for completeness in the following cells.
i3d_split_root = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1"

train_dest_dir = f"{i3d_split_root}/UCF_Train_ten_crop_i3d"
test_dest_dir = f"{i3d_split_root}/UCF_Test_ten_crop_i3d"

In [36]:
# Checking if we missed any files
train_rows = uca_split['split'].isin(['train', 'val'])
train_df = (
    uca_split.loc[train_rows, ['npy_name', 'split']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Expected feature files that are not present in the train folder
expected = set(train_df['npy_name'])
actual = set(os.listdir(train_dest_dir))
missing_files = expected.difference(actual)

train_rem = train_df.loc[train_df['npy_name'].isin(missing_files)]
train_rem

Unnamed: 0,npy_name,split


In [37]:
test_rows = uca_split['split'] == 'test'
test_df = (
    uca_split.loc[test_rows, ['npy_name', 'split']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Expected feature files that are not present in the test folder
expected = set(test_df['npy_name'])
actual = set(os.listdir(test_dest_dir))
missing_files = expected.difference(actual)

test_rem = test_df.loc[test_df['npy_name'].isin(missing_files)]
test_rem

Unnamed: 0,npy_name,split


In [38]:
# The missing files reported above are due to redundancies in uca_split (the annotations)

### Creating Train/Test List and Ground Truth

In [39]:
# Base folders that are prefixed to the feature file names in the list files.
i3d_feature_root = "../TEVAD/save/Crime/UCF_ten_crop_i3d_v1"

train_base_dir = f"{i3d_feature_root}/UCF_Train_ten_crop_i3d"
test_base_dir = f"{i3d_feature_root}/UCF_Test_ten_crop_i3d"

In [40]:
# Flag entries whose file name starts with "Normal_Videos".
train_df['is_normal'] = train_df['npy_name'].str.startswith("Normal_Videos")

# Non-normal entries (False) first, normal entries (True) last.
train_df = train_df.sort_values(by='is_normal').reset_index(drop=True)

train_df['full_path'] = train_base_dir + "/" + train_df['npy_name']

output_file = "../TEVAD/list/ucf-i3d.list"

# One feature path per line
with open(output_file, "w") as f:
    f.writelines(path + "\n" for path in train_df['full_path'])

In [41]:
# Spot-check a single entry of the training list.
train_df.loc[train_df['npy_name'].eq('Shoplifting048_x264_i3d.npy')]

Unnamed: 0,npy_name,split,is_normal,full_path
355,Shoplifting048_x264_i3d.npy,val,False,../TEVAD/save/Crime/UCF_ten_crop_i3d_v1/UCF_Tr...


In [42]:
# How many entries of the training list are normal vs. not.
train_df['is_normal'].value_counts()

is_normal
True     806
False    738
Name: count, dtype: int64

In [43]:
# Flag entries whose file name starts with "Normal_Videos".
test_df['is_normal'] = test_df['npy_name'].str.startswith("Normal_Videos")

# Non-normal entries (False) first, normal entries (True) last.
test_df = test_df.sort_values(by='is_normal', ascending=True)
test_df.reset_index(drop = True, inplace = True)

test_df['full_path'] = test_base_dir + "/" + test_df['npy_name']

output_file = "../TEVAD/list/ucf-i3d-test.list"

# Write the TEST paths: iterating train_df here would fill the test list with
# the training files.
with open(output_file, "w") as f:
    for path in test_df['full_path']:
        f.write(path + "\n")

In [44]:
# How many entries of the test list are normal vs. not.
test_df['is_normal'].value_counts()

is_normal
False    206
True     104
Name: count, dtype: int64

### Removing Extra Embeddings

In [45]:
# File name of the "_emb.npy" file belonging to each video.
# regex=False matches ".mp4" literally: in regex mode (the default before pandas 2.0)
# the dot is a wildcard.
uca_split['emb_name'] = uca_split['video'].str.replace('.mp4', '_emb.npy', regex=False)

In [46]:
# One row per distinct (embedding file, split) pair.
temp = (
    uca_split[['emb_name', 'split']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [47]:
emb_dir = "../TEVAD/save/Crime/sent_emb_n"

# Entries in the embedding folder that no annotated video maps to
expected = set(temp['emb_name'])
actual = set(os.listdir(emb_dir))
extra_files = actual.difference(expected)

In [48]:
# Delete the surplus embedding files; anything that is not a regular file is only reported.
for extra_name in extra_files:
    extra_path = os.path.join(emb_dir, extra_name)

    if not os.path.isfile(extra_path):
        print(f"Skipped (not a file): {extra_name}")
        continue

    os.remove(extra_path)
    print(f"Deleted: {extra_name}")