In [1]:
import os
# Work from the project root so the relative "data_abaw/..." paths used below resolve.
# Set the ABAW_PROJECT_ROOT environment variable to run on a machine where the
# project lives elsewhere; the original location remains the default.
os.chdir(os.environ.get('ABAW_PROJECT_ROOT', '/Coding/CVPR2025_abaw_framewise/'))

In [2]:
import pandas as pd
import yaml

data_path = "data_abaw"
annotation_path = os.path.join(data_path, "annotation.yml")

# Parse the annotation file into a plain Python mapping keyed by video.
with open(annotation_path, "r", encoding="utf-8") as f:
    data = yaml.safe_load(f)

# Flatten the mapping into one row per annotated frame. Each entry of a video's
# "frame_annotation" list is unpacked as a (path, class_id) pair; the frame id is
# the last path component with the "frame-" and ".jpg" parts removed (kept as a
# string here). Videos without a "frame_annotation" key contribute no rows.
yml = pd.DataFrame([
    {
        "video_path": video_name,
        "frame_id": frame_path.rsplit("/", 1)[-1].replace("frame-", "").replace(".jpg", ""),
        "class_id": label,
    }
    for video_name, video_info in data.items()
    for frame_path, label in video_info.get("frame_annotation", [])
])

In [3]:
# Sanity check: print the video key of the first row, then end the cell with
# yml.head() so the preview is actually rendered (only the last expression of a
# cell is displayed).
print(yml["video_path"].iloc[0])
yml.head()

82557/Visite 1/82557_Question_1_2024-08-22 14-46-11_Video.mp4


In [4]:
import pandas as pd

def create_segments(df):
    """Collapse per-frame annotations into runs of constant class per video.

    Args:
        df: DataFrame with columns "video_path", "frame_id" (anything castable
            to int) and "class_id". The frame is not modified.

    Returns:
        DataFrame with columns "video_path", "frame_id_start", "frame_id_end"
        and "class_id", one row per run. Both frame bounds are inclusive.
        An empty input yields an empty frame with these columns.

    Note:
        Runs are detected only by a change of "class_id" between consecutive
        rows after sorting by frame id; gaps in the frame ids do not start a
        new segment.
    """
    columns = ["video_path", "frame_id_start", "frame_id_end", "class_id"]

    # Cast on a copy so the caller's frame keeps its original "frame_id" dtype.
    frames = df.assign(frame_id=df["frame_id"].astype(int))

    segments = []
    for video, group in frames.groupby("video_path"):
        group = group.sort_values("frame_id").reset_index(drop=True)

        # A new run starts wherever the class differs from the previous row;
        # the cumulative sum of those change flags numbers the runs.
        run_id = (group["class_id"] != group["class_id"].shift(1)).cumsum().rename("segment")

        for _, sub in group.groupby(run_id):
            segments.append({
                "video_path": video,
                "frame_id_start": sub["frame_id"].iloc[0],
                "frame_id_end": sub["frame_id"].iloc[-1],
                "class_id": sub["class_id"].iloc[0],
            })

    return pd.DataFrame(segments, columns=columns)

# Collapse the per-frame annotations into per-video runs of constant class_id.
# NOTE(review): "clusterted" is a typo for "clustered"; the name is referenced
# again where all_meta_df is built, so rename both places together.
yml_clusterted = create_segments(yml)
yml_clusterted.head()

Unnamed: 0,video_path,frame_id_start,frame_id_end,class_id
0,82557/Visite 1/82557_Question_1_2024-08-22 14-...,0,141,0
1,82557/Visite 1/82557_Question_1_2024-08-22 14-...,142,222,1
2,82557/Visite 1/82557_Question_1_2024-08-22 14-...,223,481,0
3,82557/Visite 1/82557_Question_2_2024-08-22 14-...,0,157,0
4,82557/Visite 1/82557_Question_2_2024-08-22 14-...,158,229,1


In [5]:
import os
import pandas as pd
from tqdm import tqdm
import soundfile as sf
import numpy as np
import math

def read_meta_text(path):
    """Read a split label file with one "video_path,label,text" entry per line.

    Only the first two commas separate fields, so the text may itself contain
    commas. Blank lines are skipped.

    Args:
        path: Path of the UTF-8 encoded label file.

    Returns:
        DataFrame with columns "general_path", "class_id" and "text".
        "general_path" is the video path joined with its own base name minus
        the ".mp4" suffix; "class_id" is kept as the string read from the file.

    Raises:
        ValueError: If a non-blank line has fewer than three fields.
    """
    data = []
    with open(path, 'r', encoding='utf-8') as file:
        for line_no, line in enumerate(file, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines carry no record and would otherwise fail to unpack.
                continue
            parts = stripped.split(',', 2)
            if len(parts) != 3:
                raise ValueError(
                    f"{path}:{line_no}: expected 'video_path,label,text', got {stripped!r}")
            video_path, label, text = parts
            file_path = os.path.join(video_path, os.path.basename(video_path).removesuffix('.mp4'))
            data.append([file_path, label, text])
    return pd.DataFrame(data, columns=['general_path', 'class_id', 'text'])

def generate_train_meta(yml_df, fps=24.0, sr=16 * 1e3):
    """Turn segment rows into training metadata with audio sample offsets.

    Args:
        yml_df: DataFrame with columns "video_path", "frame_id_start",
            "frame_id_end" and "class_id". The frame is not modified.
        fps: Video frame rate used to convert frame indices to seconds.
        sr: Audio sample rate used to convert seconds to sample indices.

    Returns:
        A new DataFrame in which the frame bounds are renamed to
        "frame_start_index" / "frame_end_index", "sound_start_index" /
        "sound_end_index" hold floor(frame * sr / fps) as integers, and
        "video_path" is replaced by "general_path" (the video path joined with
        its own base name minus the ".mp4" suffix).
    """
    # rename() returns a new frame, so the caller's data stays untouched.
    df = yml_df.rename(columns={
        "frame_id_start": "frame_start_index",
        "frame_id_end": "frame_end_index",
    })

    # Multiply before dividing: frame * sr is an exact integer-valued float, so an
    # exactly representable result is not rounded to just below the integer and
    # then floored one sample too low (as (29 / 100) * 100 would be).
    df["sound_start_index"] = np.floor(df["frame_start_index"] * sr / fps).astype(int)
    df["sound_end_index"] = np.floor(df["frame_end_index"] * sr / fps).astype(int)

    df['general_path'] = df['video_path'].apply(
        lambda path: os.path.join(path, os.path.basename(path).removesuffix(".mp4")))

    return df.drop(columns=['video_path'])

def df_filter_label(all_meta_df, label_df):
    """Select the rows of ``all_meta_df`` that belong to a split.

    Keeps only rows whose "general_path" occurs in ``label_df`` and orders
    them by the first appearance of that path in ``label_df``, then by
    "frame_start_index". The original index labels are preserved and neither
    input frame is modified.
    """
    split_paths = label_df['general_path']

    # Rank of each path = position of its first occurrence in the label file.
    rank_of_path = {gp: pos for pos, gp in enumerate(split_paths.drop_duplicates())}

    selected = all_meta_df[all_meta_df['general_path'].isin(split_paths)].copy()
    selected['order'] = selected['general_path'].map(rank_of_path)

    ordered = selected.sort_values(['order', 'frame_start_index'])
    return ordered.drop(columns=['order'])

In [6]:
data_path = "data_abaw"
train_label_file = "data_abaw/default-split/train.txt"
eval_label_file = "data_abaw/default-split/val.txt"

fps = 24
train_output_csv = "data_abaw/splits/train.csv"
eval_output_csv = "data_abaw/splits/val.csv"
train_and_eval_output_csv = "data_abaw/splits/train_and_eval.csv"

# Segment-level metadata for every annotated video, regardless of split.
all_meta_df = generate_train_meta(yml_clusterted, fps=fps, sr=16*1e3)

# Restrict the metadata to the videos listed in each split's label file.
train_label_df = read_meta_text(train_label_file)
train_df = df_filter_label(all_meta_df, train_label_df)

eval_label_df = read_meta_text(eval_label_file)
eval_df = df_filter_label(all_meta_df, eval_label_df)

# to_csv does not create missing directories, so make sure the targets exist.
for output_csv in (train_output_csv, eval_output_csv, train_and_eval_output_csv):
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

train_df.to_csv(train_output_csv, index=False)
eval_df.to_csv(eval_output_csv, index=False)
all_meta_df.to_csv(train_and_eval_output_csv, index=False)

In [7]:
# Preview the first rows of train_df, the frame written to train_output_csv.
train_df.head()

Unnamed: 0,frame_start_index,frame_end_index,class_id,sound_start_index,sound_end_index,general_path
418,0,75,1,0,50000,82622/Visite_1/82622_Question_7_2024-11-05_15-...
419,76,722,0,50666,481333,82622/Visite_1/82622_Question_7_2024-11-05_15-...
420,723,788,1,482000,525333,82622/Visite_1/82622_Question_7_2024-11-05_15-...
421,789,810,0,526000,540000,82622/Visite_1/82622_Question_7_2024-11-05_15-...
422,811,938,1,540666,625333,82622/Visite_1/82622_Question_7_2024-11-05_15-...


In [8]:
# Preview the first rows of eval_df, the frame written to eval_output_csv.
eval_df.head()

Unnamed: 0,frame_start_index,frame_end_index,class_id,sound_start_index,sound_end_index,general_path
1030,0,263,0,0,175333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1031,264,311,1,176000,207333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1032,312,358,0,208000,238666,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1033,359,434,1,239333,289333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1034,435,597,0,290000,398000,82688/Visite_1/82688_Question_1_2024-11-14_13-...


In [9]:
# NOTE(review): repeats the train_df.head() preview from cell In [7]; train_df is
# not reassigned in between, so this cell is a candidate for removal.
train_df.head()

Unnamed: 0,frame_start_index,frame_end_index,class_id,sound_start_index,sound_end_index,general_path
418,0,75,1,0,50000,82622/Visite_1/82622_Question_7_2024-11-05_15-...
419,76,722,0,50666,481333,82622/Visite_1/82622_Question_7_2024-11-05_15-...
420,723,788,1,482000,525333,82622/Visite_1/82622_Question_7_2024-11-05_15-...
421,789,810,0,526000,540000,82622/Visite_1/82622_Question_7_2024-11-05_15-...
422,811,938,1,540666,625333,82622/Visite_1/82622_Question_7_2024-11-05_15-...


In [10]:
# Function used within eval_dataset to "de-cluster" the segment-level eval metadata.
def generate_eval_meta(eval_df, fps, eval_fps, sr): #label_file_csv):
    """Expand segment rows into one row per sampled frame.

    Args:
        eval_df: DataFrame with columns "frame_start_index", "frame_end_index",
            "class_id" and "general_path"; "frame_end_index" is treated as
            inclusive.
        fps: Frame rate of the annotated video.
        eval_fps: Target sampling rate. Every int(fps / eval_fps)-th frame of a
            segment is kept; rates above ``fps`` fall back to every frame.
        sr: Audio sample rate used to compute "sound_index".

    Returns:
        DataFrame with columns "frame_index", "sound_index", "class_id" and
        "general_path". Sampling restarts at the first frame of every segment,
        so the spacing across a segment boundary can be shorter than the step.

    Raises:
        ValueError: If ``fps`` or ``eval_fps`` is not positive.
    """
    if fps <= 0 or eval_fps <= 0:
        raise ValueError(f"fps and eval_fps must be positive, got fps={fps}, eval_fps={eval_fps}")

    columns = ['frame_index', 'sound_index', 'class_id', 'general_path']
    raw_meta_df = eval_df # pd.read_csv(label_file_csv)

    eval_rows = []
    # int() truncates to 0 when eval_fps > fps; a zero step cannot be used to
    # generate frame indices, so sample every frame in that case.
    step = max(1, int(fps / eval_fps))

    for _, row in raw_meta_df.iterrows():
        frame_start = row['frame_start_index']
        frame_end = row['frame_end_index']
        class_id = row['class_id']
        general_path = row['general_path']

        # Generate the sub-sampled frame indices. frame_end_index is assumed to
        # be inclusive here, hence the + 1 on the exclusive arange bound.
        for frame_index in np.arange(frame_start, frame_end + 1, step):
            # Multiply before dividing so an exactly representable result is not
            # rounded to just below the integer and floored one sample too low.
            sound_index = int(np.floor(frame_index * sr / fps))
            eval_rows.append({
                'frame_index': int(frame_index),
                'sound_index': sound_index,
                'class_id': class_id,
                'general_path': general_path
            })

    return pd.DataFrame(eval_rows, columns=columns)

In [11]:
print("clustered: ", len(eval_df))

# Expand the clustered eval segments at several sampling rates and report how
# many per-frame rows each rate yields. After the loop, eval_fps and re_eval_df
# hold the values of the last rate (0.3).
for eval_fps in (24, 12, 6, 0.3):
    re_eval_df = generate_eval_meta(eval_df, fps=24, eval_fps=eval_fps, sr=16 * 1e3)
    print(len(re_eval_df))

eval_df.head()

clustered:  454
89600
44914
22563
1345


Unnamed: 0,frame_start_index,frame_end_index,class_id,sound_start_index,sound_end_index,general_path
1030,0,263,0,0,175333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1031,264,311,1,176000,207333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1032,312,358,0,208000,238666,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1033,359,434,1,239333,289333,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1034,435,597,0,290000,398000,82688/Visite_1/82688_Question_1_2024-11-14_13-...


In [12]:
# Preview re_eval_df as left by the previous cell, i.e. the expansion produced by
# its last generate_eval_meta call (eval_fps = 0.3).
re_eval_df.head()

Unnamed: 0,frame_index,sound_index,class_id,general_path
0,0,0,0,82688/Visite_1/82688_Question_1_2024-11-14_13-...
1,80,53333,0,82688/Visite_1/82688_Question_1_2024-11-14_13-...
2,160,106666,0,82688/Visite_1/82688_Question_1_2024-11-14_13-...
3,240,160000,0,82688/Visite_1/82688_Question_1_2024-11-14_13-...
4,264,176000,1,82688/Visite_1/82688_Question_1_2024-11-14_13-...


In [13]:
import os
import pandas as pd
from tqdm import tqdm
import soundfile as sf
import numpy as np
import math

def generate_train_meta_old(data_folder, eval_df, jump_s=6.0, fps=24.0):
    """Sample evenly spaced time steps from every audio file (legacy variant).

    For each row of ``eval_df`` the file
    ``<data_folder>/sound_16kHz/<general_path>.wav`` is read and
    int(duration_in_seconds / jump_s) sample positions are spread evenly over
    it with ``np.linspace``.

    Args:
        data_folder: Root folder containing the "sound_16kHz" directory.
        eval_df: DataFrame with columns "general_path", "class_id" and "text"
            (the columns returned by ``read_meta_text``).
        jump_s: Approximate spacing between sampled positions, in seconds.
        fps: Video frame rate used to map a sample position to a frame index.

    Returns:
        DataFrame with one row per sampled position. Start and end indices are
        identical because each row describes a single time step. The
        "frame_time_index" column is never populated; it is kept only so the
        column layout matches what this function produced before.
    """
    columns = [
        'general_path', 'class_id', 'text', 'frame_time_index',
        'sound_start_index', 'frame_start_index', 'sound_end_index', 'frame_end_index',
    ]
    # Collect plain records and build the frame once; concatenating a one-row
    # frame per step is quadratic in the number of rows.
    records = []

    # Iterate over each row in eval_df
    for _, row in tqdm(eval_df.iterrows(), total=eval_df.shape[0], desc="Generating frame-wise metadata"):
        audio_file_path = os.path.join(data_folder, "sound_16kHz", row['general_path'] + ".wav")
        audio_data, sr = sf.read(audio_file_path, dtype="float32")
        num_frames_all = len(audio_data)
        num_frames_sample = int((num_frames_all / sr) / jump_s)

        # NOTE(review): the previous code assigned the imported `yaml` module as
        # class_id; the row's own label is presumably what was intended - confirm.
        class_id = row['class_id']

        # Append one record for each timestep
        for sound_index in tqdm(
                np.linspace(0, num_frames_all - 1, num_frames_sample, dtype=int),  # evenly spaced sample indices
                total=num_frames_sample,
                desc="sub - for-loop",
                position=1,
                leave=False
                ):

            # Multiply before dividing so an exactly representable result is not
            # rounded to just below the integer and floored one frame too low.
            frame_index = math.floor(sound_index * fps / sr)

            records.append({
                'general_path': row['general_path'],
                'class_id': class_id,
                'text': row['text'],
                'sound_start_index': sound_index,
                'frame_start_index': frame_index,
                'sound_end_index': sound_index,
                'frame_end_index': frame_index,
            })

    return pd.DataFrame(records, columns=columns)