In [None]:
import pandas as pd
import numpy as np
import os

# Root of the local project folder; the annotation paths below are derived from it.
BASE_PATH = r"D:\UK\00. 2024 QMUL\00. Course\SAV-ViolenceDetection"
ANNOTATION_DIR = os.path.join(BASE_PATH, "Annotations_Final")
FINAL_ANNOTATION_PATH = os.path.join(
    ANNOTATION_DIR,
    "2. Violence-label-only dataset_final.csv",
)

# The annotation CSV is decoded as latin1 rather than with pandas' default codec.
df = pd.read_csv(FINAL_ANNOTATION_PATH, encoding="latin1")


In [None]:
# Folder that receives one CSV per split.
output_dir = r"D:\UK\00. 2024 QMUL\00. Course\SAV-ViolenceDetection\Data_split\Violence_Label_Only_split"

# Shuffle at the video level (not the row level) so that every row of a given
# video lands in the same split. The fixed seed makes the shuffle repeatable.
np.random.seed(42)
video_ids = df['Video ID'].unique()
shuffle_ids = np.random.permutation(video_ids)  # new array; video_ids is left in its original order

# The first 70% of the shuffled videos (rounded down) form the training set,
# the remainder the test set.
n_total = len(shuffle_ids)
n_train = int(n_total * 0.70)
train_ids, test_ids = shuffle_ids[:n_train], shuffle_ids[n_train:]

def assign_split(row):
    """Return the split name for a single annotation row.

    Looks up the row's 'Video ID' in the module-level ``train_ids`` and
    ``test_ids`` arrays, checking the training set first.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'Video ID' entry.

    Returns
    -------
    str
        'train' or 'test' according to which array contains the video ID,
        or 'none' when it appears in neither.
    """
    video_id = row['Video ID']
    for split_name, members in (('train', train_ids), ('test', test_ids)):
        if video_id in members:
            return split_name
    return 'none'
    
# Tag every annotation row with the split of its video.
df['Split'] = df.apply(assign_split, axis=1)

# DataFrame.to_csv does not create missing parent folders, so make sure the
# target directory exists before writing (no-op when it is already there).
os.makedirs(output_dir, exist_ok=True)

# Write one CSV per split value, named after the split (e.g. train.csv, test.csv).
for value in df['Split'].unique():
    subset = df[df['Split'] == value]
    filename = os.path.join(output_dir, f'{value}.csv')
    subset.to_csv(filename, index=False)

In [19]:
# Number of annotation rows assigned to each split.
df['Split'].value_counts()

Split
train    669
test     326
Name: count, dtype: int64

In [None]:
# Video-level summary of the train and test splits

import pandas as pd
import numpy as np
import os

# Read the per-split CSVs back from disk so the summaries below work from the saved files.
folder_dir = r"D:\UK\00. 2024 QMUL\00. Course\SAV-ViolenceDetection\Data_split\Violence_Label_Only_split"
train_data, test_data = (
    pd.read_csv(os.path.join(folder_dir, split_file))
    for split_file in ("train.csv", "test.csv")
)


def extract_video_summary(df, split_name):
    """Summarise one split at video, segment (row) and frame level.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'Video ID', 'Violence label(video)' and 'No. Frames'.
        The frame is only read, never modified.
    split_name : str
        Stored under the 'Split' key of the result.

    Returns
    -------
    dict
        'Split' plus six counts:
        - 'Videos - Violent': videos with at least one row labelled 1.
        - 'Videos - Non-Violent': videos with a row labelled 0 and no row labelled 1.
        - 'Segments - Violent' / 'Segments - Non-Violent': rows labelled 1 / 0.
        - 'Frames - Violent' / 'Frames - Non-Violent': sum of 'No. Frames' over those rows.
    """
    labels = df['Violence label(video)']
    violent_rows = labels == 1
    non_violent_rows = labels == 0

    # Distinct label values seen for each video; a video counts as violent as
    # soon as any of its rows carries label 1.
    labels_per_video = df.groupby('Video ID')['Violence label(video)'].unique()
    n_violent_videos = 0
    n_non_violent_videos = 0
    for video_labels in labels_per_video:
        if 1 in video_labels:
            n_violent_videos += 1
        elif 0 in video_labels:
            n_non_violent_videos += 1

    return {
        'Split': split_name,
        'Videos - Violent': n_violent_videos,
        'Videos - Non-Violent': n_non_violent_videos,
        'Segments - Violent': violent_rows.sum(),
        'Segments - Non-Violent': non_violent_rows.sum(),
        'Frames - Violent': df.loc[violent_rows, 'No. Frames'].sum(),
        'Frames - Non-Violent': df.loc[non_violent_rows, 'No. Frames'].sum(),
    }

# One summary row per split.
rows = [
    extract_video_summary(train_data, 'train'),
    extract_video_summary(test_data, 'test'),
]

# Every column except the split name holds a count, so fill any gaps with 0
# and store them as plain integers.
summary_df = pd.DataFrame(rows).fillna(0)
summary_df = summary_df.astype(
    {col: int for col in summary_df.columns if col != 'Split'}
)
summary_df


Unnamed: 0,Split,Videos - Violent,Videos - Non-Violent,Segments - Violent,Segments - Non-Violent,Frames - Violent,Frames - Non-Violent
0,train,92,45,288,381,103235,357744
1,test,43,16,143,183,36715,152307


In [17]:
# Sanity check: list the distinct label values present in the training split.
print(train_data['Violence label(video)'].unique())

[1 0]


In [None]:
# Segment-level summary of the train and test splits

def extract_segment_summary(df, split_name):
    """Count segments (rows) of one split by violence type, modality and video type.

    All comparisons are case-insensitive: the three text columns are
    lower-cased into local Series first. The input frame itself is left
    untouched, so calling this function has no side effect on the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the text columns 'Violence Type (Video)', 'Modality' and
        'Video Type'.
    split_name : str
        Stored under the 'Split' key of the result.

    Returns
    -------
    dict
        'Split', the three 'Segments - ...' counts, the two 'Modality - ...'
        counts, and one 'Video Type - <type>' count per lower-cased video type
        present in ``df`` (most frequent first).

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    row = {'Split': split_name}

    # Work on lower-cased copies instead of assigning back into df, which
    # would silently rewrite the caller's DataFrame.
    violence_type = df['Violence Type (Video)'].str.lower()
    modality = df['Modality'].str.lower()
    video_type = df['Video Type'].str.lower()

    # step 1 - violence type counts (segment-level)
    row['Segments - Violent'] = (violence_type == 'violent').sum()
    row['Segments - Non-Violent'] = (violence_type == 'non-violent').sum()
    # Anything that is neither of the two labels, including missing values.
    row['Segments - None'] = (~violence_type.isin(['violent', 'non-violent'])).sum()

    # step 2 - modality type counts (segment-level)
    row['Modality - Unimodal (Video)'] = (modality == 'unimodal(video)').sum()
    # Defined as the complement of the line above, so rows with a missing or
    # unexpected modality value are counted here as well.
    row['Modality - Multimodal'] = (modality != 'unimodal(video)').sum()

    # step 3 - video type counts (segment-level); missing values are not counted
    for vt, count in video_type.value_counts().items():
        row[f'Video Type - {vt}'] = count

    return row

# One summary row per split. Only train and test are summarised here; no
# validation frame (val_data) is defined in the cells above.
segment_rows = [
    extract_segment_summary(train_data, 'Train'),
    extract_segment_summary(test_data, 'Test'),
]

# A video type missing from one split shows up as NaN after stacking the rows,
# so fill with 0 before storing every count column as plain integers.
segment_summary_df = pd.DataFrame(segment_rows).fillna(0)
segment_summary_df = segment_summary_df.astype(
    {col: int for col in segment_summary_df.columns if col != 'Split'}
)
segment_summary_df

Unnamed: 0,Split,Segments - Violent,Segments - Non-Violent,Segments - None,Modality - Unimodal (Video),Modality - Multimodal,Video Type - news,Video Type - cctv,Video Type - self-filmed,Video Type - combination,Video Type - others,Video Type - bodycam,Video Type - dashcam
0,Train,435,1018,196,1110,539,736,366,287,92,66,55,47
1,Validation,85,193,52,242,88,108,65,61,19,29,29,19
2,Test,71,213,61,171,174,171,53,75,12,10,10,14


In [41]:
# Class ratio (violent : non-violent segments) for each split in segment_summary_df

def format_ratio(v, nv):
    """Format a violent : non-violent count pair as a ratio string.

    Parameters
    ----------
    v : int
        Number of violent items.
    nv : int
        Number of non-violent items.

    Returns
    -------
    str
        '1: <nv / v rounded to 2 decimals>' when ``v`` is non-zero.
        When ``v`` is zero the ratio cannot be normalised to 1, so the raw
        counts are returned as '0: <nv>', keeping the violent count on the
        left like the normalised form.
    """
    if v == 0:
        # Violent side stays first; the previous '<nv> : 0' had the sides swapped.
        return f'0: {nv}'
    ratio = nv / v
    return f'1: {round(ratio, 2)}'

# Pair up the violent and non-violent segment counts of each split and format
# them as a ratio string.
segment_summary_df['Ratio violence : non-violence'] = [
    format_ratio(n_violent, n_non_violent)
    for n_violent, n_non_violent in zip(
        segment_summary_df['Segments - Violent'],
        segment_summary_df['Segments - Non-Violent'],
    )
]
segment_summary_df[['Ratio violence : non-violence']]

Unnamed: 0,Ratio violence : non-violence
0,1: 2.34
1,1: 2.27
2,1: 3.0
