Correcting data to improve models:

Avenue dataset:

In [1]:
import os
import shutil
import pandas as pd
from tqdm import tqdm

# ==== USER: MODIFY THESE ====
csv_file1 = 'data/avenue/test_labels.csv'
csv_file2 = 'data/avenue/train_labels.csv'
combined_folder = 'combined_data/avenue/combined_frames'
output_csv = 'combined_data/avenue/combined_data.csv'
# ============================

# Names of the columns holding the frame path and its label
# (adjust if the label CSVs use different headers).
path_col = 'path'
label_col = 'label'

# The destination folder must exist before any frame is copied into it.
os.makedirs(combined_folder, exist_ok=True)

# Read both label files and stack them; ignore_index=True gives the merged
# frame a fresh 0..N-1 index, which is what makes the new filenames unique.
df1 = pd.read_csv(csv_file1)
df2 = pd.read_csv(csv_file2)
combined_df = pd.concat([df1, df2], ignore_index=True)

# One [new_path, label] record per frame that was actually copied.
new_rows = []

# Walk index, path and label side by side, with a progress bar.
for i, original_path, label in tqdm(
    zip(combined_df.index, combined_df[path_col], combined_df[label_col]),
    total=len(combined_df),
    desc="Processing frames",
):
    if os.path.exists(original_path):
        # Keep the original extension; the row index provides the unique stem
        # (e.g. frame_00001.jpg).
        file_ext = os.path.splitext(original_path)[1]
        new_filename = f"frame_{i:05d}{file_ext}"
        new_path = os.path.join(combined_folder, new_filename)

        shutil.copy2(original_path, new_path)
        new_rows.append([new_path, label])
    else:
        # Rows whose source file is missing are reported and left out of the output CSV.
        tqdm.write(f"[Warning] File not found: {original_path}")

# Persist the relocated paths together with their labels.
output_df = pd.DataFrame(new_rows, columns=[path_col, label_col])
output_df.to_csv(output_csv, index=False)

print(f"\n✅ Done. All frames copied and renamed into '{combined_folder}'")
print(f"📄 Updated CSV saved as '{output_csv}' with new paths.")


Processing frames: 100%|██████████| 30652/30652 [04:41<00:00, 108.73it/s]



✅ Done. All frames copied and renamed into 'combined_data/avenue/combined_frames'
📄 Updated CSV saved as 'combined_data/avenue/combined_data.csv' with new paths.


In [3]:
import pandas as pd

# Location of the combined label CSV
csv_path = 'combined_data/avenue/combined_data.csv'

# Read it and tally how often each value occurs in the 'label' column
df = pd.read_csv(csv_path)
label_counts = df['label'].value_counts()

# Full breakdown first
print("🔢 Label Counts:")
print(label_counts)

# Then each class on its own line; .get(..., 0) reports 0 for a class that is absent
total_zeros = label_counts.get(0, 0)
total_ones = label_counts.get(1, 0)
print(f"\nTotal 0s: {total_zeros}")
print(f"Total 1s: {total_ones}")


🔢 Label Counts:
label
0    26051
1     4601
Name: count, dtype: int64

Total 0s: 26051
Total 1s: 4601


Read combined_data.csv with paths and labels.

1- Undersample label 0:

Keep all label 1 frames.

From label 0, take 1 frame, skip the next 4, repeat.

2- Split the resulting data into:

70% training → copy to folder avenue_train/

30% testing → copy to folder test_avenue/

3- Save new CSV files for both sets: train.csv, test.csv.

In [1]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# === CONFIG ===
input_csv = 'combined_data/avenue/combined_data.csv'
train_dir = 'combined_data/avenue/avenue_train'
test_dir = 'combined_data/avenue/test_avenue'
train_csv = 'combined_data/avenue/train.csv'
test_csv = 'combined_data/avenue/test.csv'
undersample_ratio = 5  # keep 1 out of every 5 label-0 rows
# ==============

# Step 1: Read the combined path/label table
df = pd.read_csv(input_csv)

# Step 2: Keep every label-1 row; thin label-0 rows by taking every
# `undersample_ratio`-th one in CSV order (index reset so the stride starts at 0)
label_1_df = df.loc[df['label'].eq(1)]
label_0_df = df.loc[df['label'].eq(0)].reset_index(drop=True)
label_0_sampled = label_0_df.iloc[::undersample_ratio]

# Step 3: Stack the two classes, then shuffle the rows reproducibly
balanced_df = pd.concat([label_1_df, label_0_sampled], ignore_index=True)
balanced_df = balanced_df.sample(frac=1, random_state=42)

# Step 4: 70/30 split, stratified so both parts keep the same label proportions
train_df, test_df = train_test_split(
    balanced_df,
    test_size=0.3,
    random_state=42,
    stratify=balanced_df['label'],
)

# Step 5: Make sure both output folders exist
for split_dir in (train_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

def move_files(df, target_folder):
    """Copy the files listed in ``df`` into ``target_folder`` and return their new locations.

    Despite the name, files are copied with ``shutil.copy2``; the originals are
    left in place. Each file keeps its basename, so two source paths with the
    same basename would overwrite each other in ``target_folder``.

    Args:
        df: DataFrame with a 'path' column (source file paths) and a 'label' column.
        target_folder: Destination directory; created if it does not exist.

    Returns:
        A DataFrame with columns ['path', 'label'] holding the new path and the
        label of every file that was copied. Rows whose source file is missing
        are reported through ``tqdm.write`` and omitted.
    """
    # Create the destination here so the function also works when the caller
    # has not prepared the folder (copy2 fails on a missing directory).
    os.makedirs(target_folder, exist_ok=True)

    updated_rows = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=f"Copying to {target_folder}"):
        old_path = row['path']
        label = row['label']
        filename = os.path.basename(old_path)
        new_path = os.path.join(target_folder, filename)

        if os.path.exists(old_path):
            shutil.copy2(old_path, new_path)
            updated_rows.append([new_path, label])
        else:
            tqdm.write(f"[Warning] Missing file: {old_path}")

    return pd.DataFrame(updated_rows, columns=['path', 'label'])

# Step 6: Copy each split's files into its folder, then write the CSVs
# that point at the copied locations
train_final_df = move_files(train_df, train_dir)
test_final_df = move_files(test_df, test_dir)

for split_df, split_csv in ((train_final_df, train_csv), (test_final_df, test_csv)):
    split_df.to_csv(split_csv, index=False)

# Summary: sample counts, then where each split's files and CSV live
print(f"\n✅ Finished. Train → {len(train_final_df)} samples. Test → {len(test_final_df)} samples.")
print(f"📂 Train folder: {train_dir} | CSV: {train_csv}")
print(f"📂 Test folder: {test_dir} | CSV: {test_csv}")


Copying to combined_data/avenue/avenue_train: 100%|██████████| 6868/6868 [04:41<00:00, 24.44it/s]
Copying to combined_data/avenue/test_avenue: 100%|██████████| 2944/2944 [02:11<00:00, 22.32it/s]



✅ Finished. Train → 6868 samples. Test → 2944 samples.
📂 Train folder: combined_data/avenue/avenue_train | CSV: combined_data/avenue/train.csv
📂 Test folder: combined_data/avenue/test_avenue | CSV: combined_data/avenue/test.csv


In [2]:
import pandas as pd

# Locations of the train and test CSVs written by the split step
csv_path1 = 'combined_data/avenue/train.csv'
csv_path2 = 'combined_data/avenue/test.csv'

# Train split: load, then tally the 'label' column
df1 = pd.read_csv(csv_path1)
label_counts1 = df1['label'].value_counts()

# Test split: same treatment
df2 = pd.read_csv(csv_path2)
label_counts2 = df2['label'].value_counts()

# Report both tallies, train first
print("🔢 train label Counts:")
print(label_counts1)
print("🔢 test label Counts:")
print(label_counts2)

🔢 train label Counts:
label
0    3647
1    3221
Name: count, dtype: int64
🔢 test label Counts:
label
0    1564
1    1380
Name: count, dtype: int64


In [None]:
import pandas as pd

# Location of the Violent-Flows label CSV
csv_path1 = 'data/Violent-Flows/violentflows_labels.csv'

# Load it and tally how often each value occurs in the 'label' column
df1 = pd.read_csv(csv_path1)
label_counts1 = df1['label'].value_counts()

# NOTE(review): the header below says "train" although the file loaded is
# violentflows_labels.csv — presumably carried over from the previous cell; confirm.
print("🔢 train label Counts:")
print(label_counts1)

🔢 train label Counts:
label
1    12530
0     9544
Name: count, dtype: int64


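The Violent-Flows dataset is already reasonably balanced (12,530 rows with label 1 vs. 9,544 with label 0), so it does not need the undersampling applied to Avenue.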