correcting data to improve models :

avenue : 

In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# ==== USER: MODIFY THESE ====
csv_file1 = 'data/avenue/test_labels.csv'
csv_file2 = 'data/avenue/train_labels.csv'
combined_folder = 'combined_data/avenue/combined_frames'
output_csv = 'combined_data/avenue/combined_data.csv'
# ============================

# Create the output folder if it doesn't exist
os.makedirs(combined_folder, exist_ok=True)

# Load CSVs
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)

# Combine them
combined_df = pd.concat([df1, df2], ignore_index=True)

# Column names (change if yours are different)
path_col = 'path'
label_col = 'label'

# New rows to save in the output CSV
new_rows = []

# Iterate with progress bar
for i, row in tqdm(combined_df.iterrows(), total=len(combined_df), desc="Processing frames"):
    original_path = row[path_col]
    label = row[label_col]

    # Ensure the original path exists
    if not os.path.exists(original_path):
        tqdm.write(f"[Warning] File not found: {original_path}")
        continue

    # Generate a unique filename to avoid collisions
    file_ext = os.path.splitext(original_path)[1]  # keep original extension
    new_filename = f"frame_{i:05d}{file_ext}"      # e.g., frame_00001.jpg
    new_path = os.path.join(combined_folder, new_filename)

    # Copy the file
    shutil.copy2(original_path, new_path)

    # Save new path and label
    new_rows.append([new_path, label])

# Save the new CSV
output_df = pd.DataFrame(new_rows, columns=[path_col, label_col])
output_df.to_csv(output_csv, index=False)

print(f"\n✅ Done. All frames copied and renamed into '{combined_folder}'")
print(f"📄 Updated CSV saved as '{output_csv}' with new paths.")


Processing frames: 100%|██████████| 30652/30652 [04:41<00:00, 108.73it/s]



✅ Done. All frames copied and renamed into 'combined_data/avenue/combined_frames'
📄 Updated CSV saved as 'combined_data/avenue/combined_data.csv' with new paths.


In [3]:
import pandas as pd

# Path to your combined CSV
csv_path = 'combined_data/avenue/combined_data.csv'

# Load the CSV
df = pd.read_csv(csv_path)

# Count values in 'label' column
label_counts = df['label'].value_counts()

# Print results
print("🔢 Label Counts:")
print(label_counts)

# Optional: show count for each label explicitly
print(f"\nTotal 0s: {label_counts.get(0, 0)}")
print(f"Total 1s: {label_counts.get(1, 0)}")


🔢 Label Counts:
label
0    26051
1     4601
Name: count, dtype: int64

Total 0s: 26051
Total 1s: 4601


Read combined_data.csv with paths and labels.

1- Undersample label 0:

Keep all label 1 frames.

From label 0, take 1 frame, skip the next 4, repeat.

2- Split the resulting data into:

70% training → move to folder avenue_train/

30% testing → move to folder test_avenue/

3- Save new CSV files for both sets: train.csv, test.csv.

In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# === CONFIG ===
input_csv = 'combined_data/avenue/combined_data.csv'
train_dir = 'combined_data/avenue/avenue_train'
test_dir = 'combined_data/avenue/test_avenue'
train_csv = 'combined_data/avenue/train.csv'
test_csv = 'combined_data/avenue/test.csv'
undersample_ratio = 5  # keep 1 every 5 label 0 frames
# ==============

# Step 1: Load CSV
df = pd.read_csv(input_csv)

# Step 2: Undersample label 0
label_1_df = df[df['label'] == 1]
label_0_df = df[df['label'] == 0].reset_index(drop=True)
label_0_sampled = label_0_df.iloc[::undersample_ratio]

# Step 3: Combine balanced data
balanced_df = pd.concat([label_1_df, label_0_sampled], ignore_index=True).sample(frac=1, random_state=42)

# Step 4: Train-test split
train_df, test_df = train_test_split(balanced_df, test_size=0.3, random_state=42, stratify=balanced_df['label'])

# Step 5: Create output folders
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

def move_files(df, target_folder):
    updated_rows = []
    for i, row in tqdm(df.iterrows(), total=len(df), desc=f"Copying to {target_folder}"):
        old_path = row['path']
        label = row['label']
        filename = os.path.basename(old_path)
        new_path = os.path.join(target_folder, filename)

        if os.path.exists(old_path):
            shutil.copy2(old_path, new_path)
            updated_rows.append([new_path, label])
        else:
            tqdm.write(f"[Warning] Missing file: {old_path}")

    return pd.DataFrame(updated_rows, columns=['path', 'label'])

# Step 6: Move files and save new CSVs
train_final_df = move_files(train_df, train_dir)
test_final_df = move_files(test_df, test_dir)

train_final_df.to_csv(train_csv, index=False)
test_final_df.to_csv(test_csv, index=False)

print(f"\n✅ Finished. Train → {len(train_final_df)} samples. Test → {len(test_final_df)} samples.")
print(f"📂 Train folder: {train_dir} | CSV: {train_csv}")
print(f"📂 Test folder: {test_dir} | CSV: {test_csv}")


Copying to combined_data/avenue/avenue_train: 100%|██████████| 6868/6868 [04:41<00:00, 24.44it/s]
Copying to combined_data/avenue/test_avenue: 100%|██████████| 2944/2944 [02:11<00:00, 22.32it/s]



✅ Finished. Train → 6868 samples. Test → 2944 samples.
📂 Train folder: combined_data/avenue/avenue_train | CSV: combined_data/avenue/train.csv
📂 Test folder: combined_data/avenue/test_avenue | CSV: combined_data/avenue/test.csv


In [2]:
import pandas as pd

# Path to your combined CSV
csv_path1 = 'combined_data/avenue/train.csv'
csv_path2 = 'combined_data/avenue/test.csv'

# Load the CSV
df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)

# Count values in 'label' column
label_counts1 = df1['label'].value_counts()
label_counts2 = df2['label'].value_counts()

# Print results
print("🔢 train label Counts:")
print(label_counts1)
print("🔢 test label Counts:")
print(label_counts2)

🔢 train label Counts:
label
0    3647
1    3221
Name: count, dtype: int64
🔢 test label Counts:
label
0    1564
1    1380
Name: count, dtype: int64


violent flows :

In [2]:
import pandas as pd

# Path to your combined CSV
csv_path1 = 'data/Violent-Flows/violentflows_labels.csv'
# Load the CSV
df1 = pd.read_csv(csv_path1)

# Count values in 'label' column
label_counts1 = df1['label'].value_counts()

# Print results
print("🔢 train label Counts:")
print(label_counts1) 

🔢 train label Counts:
label
1    12530
0     9544
Name: count, dtype: int64


so the violent_flows dataset doesn't need any fixing 

fixing ped1 and ped2 datasets :

In [3]:
csv_path1 = 'data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_train.csv'
df1 = pd.read_csv(csv_path1)
label_counts1 = df1['label'].value_counts()
print("🔢 train label Counts:")
print(label_counts1) 

🔢 train label Counts:
label
0    6800
Name: count, dtype: int64


In [4]:
df1 = pd.read_csv('data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_test.csv')
print("🔢 test label Counts:")
print(df1["label"].value_counts())

🔢 test label Counts:
label
0    5964
1    1235
Name: count, dtype: int64


In [5]:
df1 = pd.read_csv('data/UCSD_Anomaly_Dataset.v1p2/labels_ped2_test.csv')
print("🔢 test label Counts:")
print(df1["label"].value_counts())

🔢 test label Counts:
label
1    1276
0     734
Name: count, dtype: int64


In [6]:
df1 = pd.read_csv('data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_train.csv')
print("🔢 test label Counts:")
print(df1["label"].value_counts())

🔢 test label Counts:
label
0    6800
Name: count, dtype: int64


so we are gonna take the excessive abnormal frames from the violents flows dataset and we are gonna divide them between the ped1 and ped2 since they have very small amount in the abnormal frames

In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# ===== Paths to CSV label files =====
csv_files = {
    "ped1": [
        ("train", "data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_train.csv"),
        ("test", "data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_test.csv")
    ],
    "ped2": [
        ("train", "data/UCSD_Anomaly_Dataset.v1p2/labels_ped2_train.csv"),
        ("test", "data/UCSD_Anomaly_Dataset.v1p2/labels_ped2_test.csv")
    ]
}

# ===== Output base folder =====
output_base = "combined_data"

# ===== Process each group =====
for group, file_list in csv_files.items():
    print(f"\n📁 Processing {group.upper()}...")

    # Output merged folder
    merged_folder = os.path.join(output_base, f"UCSD_{group}", f"{group}_merged")
    os.makedirs(merged_folder, exist_ok=True)

    # List to collect updated [new_path, label]
    new_entries = []
    frame_counter = 0

    # Read and process each CSV
    for split_name, csv_file in file_list:
        print(f"➡️  Processing {split_name.upper()} set: {csv_file}")
        df = pd.read_csv(csv_file)

        for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Copying {group} {split_name}", leave=False):
            original_path = row["path"]
            label = row["label"]

            if not os.path.exists(original_path):
                tqdm.write(f"⚠️ Missing file: {original_path}")
                continue

            # Generate a unique filename
            ext = os.path.splitext(original_path)[1]
            new_filename = f"{group}_frame_{frame_counter:05d}{ext}"
            new_path = os.path.join(merged_folder, new_filename)

            # Copy file
            shutil.copy2(original_path, new_path)

            # Save updated row
            new_entries.append([new_path, label])
            frame_counter += 1

    # Save merged CSV
    df_output = pd.DataFrame(new_entries, columns=["path", "label"])
    output_csv_path = os.path.join(output_base, f"UCSD_{group}", f"{group}_merged.csv")
    df_output.to_csv(output_csv_path, index=False)

    print(f"✅ Finished {group.upper()}: {frame_counter} total frames.")
    print(f"📄 CSV saved: {output_csv_path}")



📁 Processing PED1...
➡️  Processing TRAIN set: data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_train.csv


                                                                        

➡️  Processing TEST set: data/UCSD_Anomaly_Dataset.v1p2/labels_ped1_test.csv


                                                                       

✅ Finished PED1: 13999 total frames.
📄 CSV saved: combined_data\UCSD_ped1\ped1_merged.csv

📁 Processing PED2...
➡️  Processing TRAIN set: data/UCSD_Anomaly_Dataset.v1p2/labels_ped2_train.csv


                                                                        

➡️  Processing TEST set: data/UCSD_Anomaly_Dataset.v1p2/labels_ped2_test.csv


                                                                       

✅ Finished PED2: 4560 total frames.
📄 CSV saved: combined_data\UCSD_ped2\ped2_merged.csv




In [2]:
df1 = pd.read_csv('combined_data/UCSD_ped1/ped1_merged.csv')
print("🔢ped1 label Counts:")
print(df1["label"].value_counts())

🔢ped1 label Counts:
label
0    12764
1     1235
Name: count, dtype: int64


In [6]:
df1 = pd.read_csv('combined_data/UCSD_ped2/ped2_merged.csv')
print("🔢ped2 label Counts:")
print(df1["label"].value_counts())

🔢ped2 label Counts:
label
0    3284
1    2769
Name: count, dtype: int64


taking frames from violent flows dataset

In [4]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# === CONFIGURATION ===
violent_csv_path = "data/Violent-Flows/violentflows_labels.csv"
ped1_folder = "combined_data/UCSD_ped1/ped1_merged"
ped2_folder = "combined_data/UCSD_ped2/ped2_merged"
ped1_csv = "combined_data/UCSD_ped1/ped1_merged.csv"
ped2_csv = "combined_data/UCSD_ped2/ped2_merged.csv"

extra_frame_count = 2986
half = extra_frame_count // 2
# =====================

# Step 1: Load Violent-Flows CSV
vf_df = pd.read_csv(violent_csv_path)

# Step 2: Get first 2,986 label==1 frames
violent_frames = vf_df[vf_df["label"] == 1].head(extra_frame_count).reset_index(drop=True)

vf_ped1 = violent_frames.iloc[:half]
vf_ped2 = violent_frames.iloc[half:]

# Helper function to copy frames and create new entries
def copy_and_append(vf_rows, target_folder, base_filename_prefix, start_index):
    new_entries = []
    for i, row in tqdm(vf_rows.iterrows(), total=len(vf_rows), desc=f"Copying to {base_filename_prefix}"):
        src_path = row['path']
        label = row['label']
        ext = os.path.splitext(src_path)[1]
        new_filename = f"{base_filename_prefix}_frame_{start_index + i:05d}{ext}"
        dest_path = os.path.join(target_folder, new_filename)

        if os.path.exists(src_path):
            shutil.copy2(src_path, dest_path)
            new_entries.append([dest_path, label])
            os.remove(src_path)  # Delete after copying
        else:
            tqdm.write(f"⚠️ File not found: {src_path}")

    return new_entries

# Step 3: Load original UCSD CSVs
df_ped1 = pd.read_csv(ped1_csv)
df_ped2 = pd.read_csv(ped2_csv)

# Step 4: Copy and append to each
ped1_new = copy_and_append(vf_ped1, ped1_folder, "vf_ped1", len(df_ped1))
ped2_new = copy_and_append(vf_ped2, ped2_folder, "vf_ped2", len(df_ped2))

# Step 5: Update UCSD CSVs
df_ped1 = pd.concat([df_ped1, pd.DataFrame(ped1_new, columns=["path", "label"])], ignore_index=True)
df_ped2 = pd.concat([df_ped2, pd.DataFrame(ped2_new, columns=["path", "label"])], ignore_index=True)

df_ped1.to_csv(ped1_csv, index=False)
df_ped2.to_csv(ped2_csv, index=False)

print(f"✅ Appended {len(ped1_new)} frames to ped1 and {len(ped2_new)} to ped2.")

# Step 6: Remove used frames from Violent-Flows CSV
vf_remaining = vf_df.drop(vf_df[vf_df["path"].isin(violent_frames["path"])].index)
vf_remaining.to_csv(violent_csv_path, index=False)

print(f"🧹 Removed {len(violent_frames)} frames from Violent-Flows and updated the CSV.")


Copying to vf_ped1: 100%|██████████| 1493/1493 [00:57<00:00, 26.14it/s] 
Copying to vf_ped2: 100%|██████████| 1493/1493 [01:42<00:00, 14.55it/s] 


✅ Appended 1493 frames to ped1 and 1493 to ped2.
🧹 Removed 2986 frames from Violent-Flows and updated the CSV.


testing if the changes occurred

violent flows : 

In [5]:
csv_path1 = 'data/Violent-Flows/violentflows_labels.csv'
df1 = pd.read_csv(csv_path1)
label_counts1 = df1['label'].value_counts()
print("🔢 train label Counts:")
print(label_counts1) 

🔢 train label Counts:
label
0    9544
1    9544
Name: count, dtype: int64


ped1 :

In [7]:
df1 = pd.read_csv('combined_data/UCSD_ped1/ped1_merged.csv')
print("🔢ped1 label Counts:")
print(df1["label"].value_counts())

🔢ped1 label Counts:
label
0    12764
1     2728
Name: count, dtype: int64


ped2 :

In [8]:
df1 = pd.read_csv('combined_data/UCSD_ped2/ped2_merged.csv')
print("🔢ped2 label Counts:")
print(df1["label"].value_counts())

🔢ped2 label Counts:
label
0    3284
1    2769
Name: count, dtype: int64


now we are going to remove the excesive normal frames from ped1 and ped2 to balance the data :

In [None]:
import pandas as pd
import os
from tqdm import tqdm

def reduce_excessive_zeros(csv_path, frames_to_keep, label0_total, set_name):
    print(f"\n🔧 Processing {set_name}...")
    df = pd.read_csv(csv_path)

    # Split 0s and 1s
    df_0 = df[df["label"] == 0].reset_index(drop=True)
    df_1 = df[df["label"] == 1].reset_index(drop=True)

    print(f"Label 0 count: {len(df_0)}, Label 1 count: {len(df_1)}")

    # Select frames to keep from label 0 with step to avoid clusters
    step = int(label0_total / frames_to_keep)
    df_0_kept = df_0.iloc[::step].reset_index(drop=True)

    # Truncate to exact target
    df_0_kept = df_0_kept.iloc[:frames_to_keep]

    # Frames to delete
    to_delete = df_0[~df_0["path"].isin(df_0_kept["path"])]

    # Delete excessive 0-label frames
    print(f"🧹 Deleting {len(to_delete)} excessive frames in {set_name}...")
    for _, row in tqdm(to_delete.iterrows(), total=len(to_delete), desc=f"Deleting frames from {set_name}"):
        try:
            os.remove(row["path"])
        except FileNotFoundError:
            pass

    # Final merged dataframe
    final_df = pd.concat([df_0_kept, df_1], ignore_index=True).sample(frac=1).reset_index(drop=True)
    final_df.to_csv(csv_path, index=False)

    print(f"✅ Finished {set_name}: Now {final_df['label'].value_counts().to_dict()}")

# === CONFIGURATION ===
ped1_csv = "combined_data/UCSD_ped1/ped1_merged.csv"
ped2_csv = "combined_data/UCSD_ped2/ped2_merged.csv"

reduce_excessive_zeros(ped1_csv, frames_to_keep=2728 * 2, label0_total=12764, set_name="ped1")  # keep 2x anomaly
reduce_excessive_zeros(ped2_csv, frames_to_keep=2769 , label0_total=3284, set_name="ped2")  # same logic



🔧 Processing ped1...
Label 0 count: 12764, Label 1 count: 2728
🧹 Deleting 7308 excessive frames in ped1...


Deleting frames from ped1: 100%|██████████| 7308/7308 [00:49<00:00, 148.17it/s]


✅ Finished ped1: Now {0: 5456, 1: 2728}

🔧 Processing ped2...
Label 0 count: 3284, Label 1 count: 2769


ValueError: slice step cannot be zero

In [11]:
df1 = pd.read_csv('combined_data/UCSD_ped1/ped1_merged.csv')
print("🔢ped1 label Counts:")
print(df1["label"].value_counts())

🔢ped1 label Counts:
label
0    5456
1    2728
Name: count, dtype: int64


In [15]:
df1 = pd.read_csv('combined_data/UCSD_ped2/ped2_merged.csv')
print("🔢ped2 label Counts:")
print(df1["label"].value_counts())

🔢ped2 label Counts:
label
0    3284
1    2769
Name: count, dtype: int64


creating train and test folders : 

In [16]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# === CONFIG ===
input_csv = 'combined_data/UCSD_ped1/ped1_merged.csv'
train_dir = 'combined_data/UCSD_ped1/ped1_train'
test_dir = 'combined_data/UCSD_ped1/ped1_test'
train_csv = 'combined_data/UCSD_ped1/ped1_train.csv'
test_csv = 'combined_data/UCSD_ped1/ped1_test.csv'
# ==============

# Step 1: Load CSV
df = pd.read_csv(input_csv)

# Step 2: Train-test split (no undersampling)
train_df, test_df = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df['label']
)

# Step 3: Create folders
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

def move_files(df, target_folder):
    updated_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Copying to {target_folder}"):
        old_path = row['path']
        label = row['label']
        filename = os.path.basename(old_path)
        new_path = os.path.join(target_folder, filename)

        if os.path.exists(old_path):
            shutil.copy2(old_path, new_path)
            updated_rows.append([new_path, label])
        else:
            tqdm.write(f"[Warning] Missing file: {old_path}")

    return pd.DataFrame(updated_rows, columns=['path', 'label'])

# Step 4: Copy files and save updated CSVs
train_final_df = move_files(train_df, train_dir)
test_final_df = move_files(test_df, test_dir)

train_final_df.to_csv(train_csv, index=False)
test_final_df.to_csv(test_csv, index=False)

print(f"\n✅ Finished. Train → {len(train_final_df)} samples. Test → {len(test_final_df)} samples.")
print(f"📂 Train folder: {train_dir} | CSV: {train_csv}")
print(f"📂 Test folder: {test_dir} | CSV: {test_csv}")


Copying to combined_data/UCSD_ped1/ped1_train: 100%|██████████| 5728/5728 [00:17<00:00, 324.72it/s]
Copying to combined_data/UCSD_ped1/ped1_test: 100%|██████████| 2456/2456 [00:15<00:00, 154.15it/s]



✅ Finished. Train → 5728 samples. Test → 2456 samples.
📂 Train folder: combined_data/UCSD_ped1/ped1_train | CSV: combined_data/UCSD_ped1/ped1_train.csv
📂 Test folder: combined_data/UCSD_ped1/ped1_test | CSV: combined_data/UCSD_ped1/ped1_test.csv
