# Preprocessing: Windowing, Framing, and Splitting

In [1]:
import argparse
import json
import os
import subprocess
import sys
import tempfile
import pandas as pd
import numpy as np
from torchvision import transforms
from PIL import Image
import torch
import librosa
import soundfile as sf
from torch.utils.data import Dataset, DataLoader

try:
    import cv2
except Exception as e:
    print("Error importing cv2 (opencv-python). Install with: pip install opencv-python")
    raise

try:
    import librosa
    import soundfile as sf
except Exception as e:
    print("Error importing librosa/soundfile. Install with: pip install librosa soundfile")
    raise

In [2]:
# Root data directory: the input CSVs are read from here and the combined CSVs,
# window JSON files and split files produced by later cells are written below it.
csv_directory = r'D:\Satria_Data\train'

In [3]:
csv_files = ["train_scrap.csv", "train_audio.csv", "transcription_manifest.csv"]

# Read every manifest that exists on disk and report the ones that do not.
dfs = []
for name in csv_files:
    candidate = os.path.join(csv_directory, name)
    if not os.path.exists(candidate):
        print("Missing", candidate)
        continue
    dfs.append(pd.read_csv(candidate))

if len(dfs) == 0:
    raise RuntimeError("No CSV files found")

# Outer-join all loaded frames on 'id', folding them in from left to right.
merge_df, *remaining = dfs
for frame in remaining:
    merge_df = pd.merge(merge_df, frame, on='id', how='outer')

# Replace the Google Drive prefix at the start of each path column with the local drive root.
for column in ('audio_path', 'path', 'transcript_path'):
    merge_df[column] = merge_df[column].str.replace(
        r'^/content/drive/MyDrive/Satria_Data', "D:/Satria_Data", regex=True
    )

# Persist the merged table next to the input CSVs.
combined_csv_path = os.path.join(csv_directory, "combined_paths.csv")
merge_df.to_csv(combined_csv_path, index=False)
print(f"Combined CSV saved to {combined_csv_path}")

Combined CSV saved to D:\Satria_Data\train\combined_paths.csv


In [4]:
# Inspect the merged frame: column dtypes and non-null counts.
merge_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 802 entries, 0 to 801
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               802 non-null    int64 
 1   path             766 non-null    object
 2   emotion_x        802 non-null    object
 3   audio_path       745 non-null    object
 4   emotion_y        745 non-null    object
 5   transcript_path  745 non-null    object
 6   emotion          745 non-null    object
dtypes: int64(1), object(6)
memory usage: 44.0+ KB


In [5]:
# Keep only the rows for which all three path columns (audio_path, path, transcript_path) are present.
merge_df.dropna(subset=['audio_path', 'path', 'transcript_path'], inplace=True)

In [7]:
# Balance the dataset by undersampling every emotion_x class down to the size
# of the rarest class.
min_count = merge_df['emotion_x'].value_counts().min()

# Sample each class separately and concatenate the results. This selects the same
# rows as groupby('emotion_x').apply(lambda x: x.sample(...)) (each group is sampled
# with its own random_state=42, groups in sorted key order) but avoids the deprecated
# "apply operated on the grouping columns" path, which in newer pandas removes the
# emotion_x column from the result and would break the value_counts() call below.
merge_df = pd.concat(
    [group.sample(n=min_count, random_state=42) for _, group in merge_df.groupby('emotion_x')]
).reset_index(drop=True)

print(merge_df['emotion_x'].value_counts())

emotion_x
Anger       5
Fear        5
Joy         5
Neutral     5
Proud       5
Sadness     5
Surprise    5
Trust       5
Name: count, dtype: int64


  merge_df = merge_df.groupby('emotion_x').apply(lambda x: x.sample(n=min_count, random_state=42)).reset_index(drop=True)


# Framings

In [8]:
def _ffprobe_float(value):
    """Convert an ffprobe field to float; return None when it is missing or non-numeric (e.g. 'N/A')."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _ffprobe_frame_rate(rate):
    """Parse an ffprobe rate such as '30000/1001', '30/1' or '30'; return 0.0 when it cannot be parsed."""
    num, _, den = str(rate).partition("/")
    num_f = _ffprobe_float(num)
    den_f = _ffprobe_float(den) if den else 1.0
    if num_f is None or den_f is None or den_f == 0:
        return 0.0
    return num_f / den_f


def run_ffprobe(path):
    """Return duration (s) and frame rate (fps) of the first video stream using ffprobe.

    Args:
        path: Path of the media file handed to ffprobe.

    Returns:
        A ``(duration, fps)`` tuple of floats. ``duration`` is the stream duration
        when ffprobe reports a numeric one, otherwise the container (format)
        duration, otherwise 0.0. ``fps`` is 0.0 when the average frame rate is
        missing, unparsable or has a zero denominator.

    Raises:
        RuntimeError: If ffprobe exits with a non-zero status or reports no
            video stream.
    """
    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        # Also request the container duration: some containers report no
        # per-stream duration (the field is absent or "N/A").
        "-show_entries", "stream=avg_frame_rate,duration:format=duration",
        "-of", "json",
        path,
    ]
    p = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    if p.returncode != 0:
        raise RuntimeError(f"ffprobe failed: {p.stderr}")
    info = json.loads(p.stdout)
    streams = info.get("streams", [])
    if not streams:
        raise RuntimeError("No video stream found in ffprobe output.")
    stream = streams[0]

    # avg_frame_rate is like "30000/1001" or "30/1"
    fps = _ffprobe_frame_rate(stream.get("avg_frame_rate", "0/1"))

    # Duration: prefer the stream value, fall back to the container value.
    duration = _ffprobe_float(stream.get("duration"))
    if duration is None:
        duration = _ffprobe_float(info.get("format", {}).get("duration"))
    if duration is None:
        duration = 0.0
    return duration, fps


def extract_audio_to_wav(video_path, out_wav):
    """Run ffmpeg to write the audio of video_path to out_wav as mono 16 kHz.

    Returns out_wav. Raises RuntimeError only when ffmpeg exits with a non-zero
    status AND out_wav does not exist afterwards.
    """
    ffmpeg_cmd = ["ffmpeg", "-y", "-i", video_path, "-vn", "-ac", "1", "-ar", "16000", out_wav]
    result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    # A non-zero exit status is tolerated as long as the output file is there.
    produced_nothing = result.returncode != 0 and not os.path.exists(out_wav)
    if produced_nothing:
        raise RuntimeError(f"ffmpeg failed producing wav. stderr: {result.stderr}")
    return out_wav


def frames_between(start_s, end_s, fps, total_frames):
    """List the frame indices covering the time span from start_s to end_s.

    Frame i is taken to have timestamp i / fps. The first index is
    floor(start_s * fps) and the last is ceil(end_s * fps) - 1; both are clamped
    to the range 0 .. total_frames - 1. An empty list is returned when the
    clamped range is empty.
    """
    lowest = max(0, int(np.floor(start_s * fps)))
    highest = min(total_frames - 1, int(np.ceil(end_s * fps)) - 1)
    if highest >= lowest:
        return list(range(lowest, highest + 1))
    return []


def sample_n_frames_in_window(frame_indices, n):
    """Pick at most n entries of frame_indices, evenly spaced by position.

    An empty input yields []. When there are n or fewer entries the input list
    itself is returned. Otherwise n positions are taken from an integer
    np.linspace over 0 .. len(frame_indices) - 1, so the first and last entries
    are always included.
    """
    if not frame_indices:
        return []
    count = len(frame_indices)
    if count <= n:
        return frame_indices
    positions = np.linspace(0, count - 1, num=n, dtype=int)
    picked = []
    for pos in positions:
        picked.append(frame_indices[pos])
    return picked


def load_transcript_json(path):
    """Read a transcript JSON file and normalize it to a list of word-like dicts.

    Two layouts are accepted:
      * a dict with a 'segments' list: each segment becomes one entry built from
        its 'text', 'start' and 'end' fields;
      * a list of dicts carrying 'word' or 'text': times are read from
        'start'/'start_time' and 'end'/'end_time' (end defaults to start).
        List items that are not dicts, or have neither key, are skipped.

    Every returned entry has the keys 'word' (stripped string), 'start' and
    'end' (floats).

    Raises:
        FileNotFoundError: path is empty, not a string, or does not exist.
        ValueError: the JSON is neither of the two layouts above.
    """
    usable = path and isinstance(path, str) and os.path.exists(path)
    if not usable:
        raise FileNotFoundError(path)
    with open(path, "r", encoding="utf8") as handle:
        data = json.load(handle)

    # Whisper-style output: one entry per segment.
    if isinstance(data, dict) and "segments" in data:
        return [
            {
                "word": seg.get("text", "").strip(),
                "start": float(seg.get("start", 0.0)),
                "end": float(seg.get("end", 0.0)),
            }
            for seg in data["segments"]
        ]

    # Flat list of word/text entries.
    if isinstance(data, list):
        normalized = []
        for item in data:
            if not isinstance(item, dict):
                continue
            if "word" not in item and "text" not in item:
                continue
            token = item.get("word", item.get("text", "")).strip()
            begin = float(item.get("start", item.get("start_time", 0.0)))
            finish = float(item.get("end", item.get("end_time", begin)))
            normalized.append({"word": token, "start": begin, "end": finish})
        return normalized

    raise ValueError("Transcript JSON format not recognized. Expected list or dict with 'segments'.")


def words_in_window(words, start, end):
    """Return the words whose time span overlaps the half-open window [start, end).

    A word is selected when it begins before ``end`` and either begins at or
    after ``start`` (it starts inside the window, which also covers zero-length
    words) or is still running after ``start`` (it started earlier and ends
    later than ``start``). A word that begins exactly at ``end`` belongs to the
    next window, and a positive-length word that ends exactly at ``start``
    belongs to the previous one.

    Args:
        words: Iterable of dicts with numeric 'start' and 'end' keys.
        start: Window start time in seconds (inclusive).
        end: Window end time in seconds (exclusive).

    Returns:
        List of the selected word dicts, in input order.
    """
    return [
        w for w in words
        if w["start"] < end and (w["start"] >= start or w["end"] > start)
    ]


def main(video, audio, transcript, emotion=None, window=1.0, stride=0.5, frames_per_window=4, out_json=""):
    """Cut one video/audio/transcript triple into sliding time windows.

    For every window this records the sampled video frame indices and their
    timestamps, the matching audio sample range, the transcript entries that
    fall into the window and the given emotion label. A summary of the first
    windows is printed and, when ``out_json`` is non-empty, the whole mapping
    is written there as JSON.

    Args:
        video: Path of the video file (must exist).
        audio: Path of the audio file (must exist); loaded at 16 kHz.
        transcript: Optional transcript JSON path. An invalid or unreadable
            transcript is reported and the run continues without words.
        emotion: Label stored in every window and in the output JSON.
        window: Window length in seconds (must be > 0).
        stride: Step between window starts in seconds (must be > 0).
        frames_per_window: Maximum number of frames sampled per window.
        out_json: Output path for the JSON mapping; nothing is written when empty.

    Raises:
        FileNotFoundError: ``video`` or ``audio`` is not an existing path.
        ValueError: ``window`` or ``stride`` is not positive.
        RuntimeError: The video cannot be opened with cv2.VideoCapture
            (run_ffprobe may raise RuntimeError as well).
    """
    # Improved validation
    if not isinstance(video, str) or not os.path.exists(video):
        raise FileNotFoundError(f"Video not found or invalid path: {video}")
    if not isinstance(audio, str) or not os.path.exists(audio):
        raise FileNotFoundError(f"Audio not found or invalid path: {audio}")
    if transcript and (not isinstance(transcript, str) or not os.path.exists(transcript)):
        print(f"Transcript path invalid or missing: {transcript}; continuing without transcript.")
        transcript = None
    # A non-positive stride would never advance the window loop below.
    if window <= 0 or stride <= 0:
        raise ValueError(f"window and stride must be positive (window={window}, stride={stride})")

    duration, fps = run_ffprobe(video)
    print(f"Video duration: {duration:.3f}s, fps: {fps:.3f}")

    # The capture is only needed for the frame count; release it right away,
    # also when opening fails.
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            raise RuntimeError("Cannot open video with cv2.VideoCapture")
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        cap.release()
    if total_frames <= 0:
        total_frames = int(np.floor(duration * fps))

    print(f"Total frames (reported/estimated): {total_frames}")

    y, sr = librosa.load(audio, sr=16000)
    n_samples = len(y)
    audio_len = y.shape[0] / sr
    print(f"Loaded audio: {y.shape[0]} samples, {sr} Hz, duration {audio_len:.3f}s")

    # Without a usable video duration no window would be produced at all;
    # fall back to the audio length.
    if duration <= 0:
        duration = audio_len
        print(f"Video duration unavailable; using audio duration {duration:.3f}s instead.")
        if total_frames <= 0:
            total_frames = int(np.floor(duration * fps))

    words = []
    if transcript:
        try:
            words = load_transcript_json(transcript)
            print(f"Loaded transcript with {len(words)} entries")
        except Exception as te:
            print(f"Failed loading transcript {transcript}: {te}")
            words = []

    windows = []
    i = 0
    while True:
        # Derive the start from the window index instead of accumulating
        # t += stride, so rounding errors do not build up over long files.
        start = i * stride
        if start >= duration:
            break
        end = min(duration, start + window)
        frame_inds = frames_between(start, end, fps, total_frames)
        sampled = sample_n_frames_in_window(frame_inds, frames_per_window)
        frame_times = [fi / fps for fi in sampled]
        # Clamp both ends to the audio length so that a video longer than its
        # audio yields an empty range rather than a negative sample count.
        a_start = int(min(n_samples, max(0, np.floor(start * sr))))
        a_end = int(min(n_samples, np.ceil(end * sr)))
        # y_segment is not stored; if needed later, compute from audio_samples
        w_sel = words_in_window(words, start, end) if words else []
        windows.append({
            "index": i,
            "start": float(start),
            "end": float(end),
            "frame_indices": sampled,
            "frame_times": frame_times,
            "audio_samples": [int(a_start), int(a_end)],
            "num_audio_samples": int(a_end - a_start),
            "transcript_words": w_sel,
            "emotion": emotion
        })
        i += 1

    print("\nWindows summary (first 8 windows):")
    for w in windows[:8]:
        print(f"Window {w['index']}: {w['start']:.3f}s - {w['end']:.3f}s | frames {w['frame_indices']} times {['{:.3f}'.format(x) for x in w['frame_times']]} | audio_samples {w['audio_samples']} | words {len(w['transcript_words'])}")
        if w['transcript_words']:
            for we in w['transcript_words']:
                print(f"  * '{we['word']}' ({we['start']:.3f}-{we['end']:.3f})")
    print(f"\nTotal windows: {len(windows)} (window={window}s stride={stride}s)")

    if out_json:
        out = {
            "video": video,
            "audio": audio,
            "duration": duration,
            "fps": fps,
            "windows": windows,
            "emotion": emotion
        }
        with open(out_json, "w", encoding="utf8") as f:
            json.dump(out, f, indent=2)
        print("Saved windows mapping to:", out_json)

In [9]:
# --- create windows JSON for each video and record the json path ---
import math
# Column-wise accumulator with one entry per video: the id, the written JSON path
# (None when nothing was written), a status string and a reason text.
json_rows = {"id": [], "window_path": [], "status": [], "reason": []}
# All per-video window JSON files go into <csv_directory>/windows.
WINDOWS_OUT_DIR = os.path.join(csv_directory, "windows")
os.makedirs(WINDOWS_OUT_DIR, exist_ok=True)

# Counters for the summary printed after the loop.
processed = 0
skipped = 0
failed = 0

def _clean_path(p):
    if p is None:
        return None
    if isinstance(p, float) and math.isnan(p):
        return None
    if not isinstance(p, str):
        return None
    p = p.strip().strip('"').strip("'")
    if not p:
        return None
    return p

def _first_text(row, keys):
    """Return the first value among row[keys] that is a non-blank string (stripped), else ''."""
    for key in keys:
        value = row.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return ""


def _record_result(vid, window_path, status, reason):
    """Append the outcome for one video to the json_rows accumulator."""
    json_rows['id'].append(vid)
    json_rows['window_path'].append(window_path)
    json_rows['status'].append(status)
    json_rows['reason'].append(reason)


for _, row in merge_df.iterrows():
    vid = row['id']
    # NaN is truthy, so chaining the label columns with `or` could hand a float
    # to .strip(); pick the first column that holds a real, non-blank string.
    emotion = _first_text(row, ('emotion', 'label', 'emotion_x', 'emotion_y'))

    video_path = _clean_path(row.get('path'))
    audio_path = _clean_path(row.get('audio_path'))
    transcription_path = _clean_path(row.get('transcript_path'))  # None when missing / NaN

    if not video_path or not audio_path:
        print(f"Skipping {vid}: missing video or audio path (video={video_path}, audio={audio_path})")
        _record_result(vid, None, 'skipped', 'missing_video_or_audio')
        skipped += 1
        continue

    out_json = os.path.join(WINDOWS_OUT_DIR, f"{vid}_windows.json")
    try:
        main(video_path, audio_path, transcription_path, emotion=emotion, window=1.0, stride=0.5, frames_per_window=4, out_json=out_json)
    except Exception as e:
        # Best effort: one broken video must not stop the whole run.
        print(f"Error processing {vid}: {e}")
        _record_result(vid, None, 'error', str(e)[:300])
        failed += 1
    else:
        _record_result(vid, out_json, 'ok', '')
        processed += 1

print(f"Summary -> processed: {processed}, skipped: {skipped}, failed: {failed}")

# One result row per id; without this, repeated ids would multiply rows in the merge.
json_df = pd.DataFrame(json_rows).drop_duplicates(subset='id', keep='last')

# Attach the json paths to merge_df with a left join on 'id'. Result columns left
# over from an earlier run of this cell are dropped first so the merge does not
# create *_x / *_y duplicates.
merge_df = merge_df.drop(columns=['window_path', 'status', 'reason'], errors='ignore')
merge_df = pd.merge(merge_df, json_df, on='id', how='left')
window_combined_path = os.path.join(csv_directory, "window_combined_paths.csv")
merge_df.to_csv(window_combined_path, index=False)
print(f"Saved merged combined CSV with window paths to {window_combined_path}")

Video duration: 136.433s, fps: 30.000
Total frames (reported/estimated): 4093
Loaded audio: 2184480 samples, 16000 Hz, duration 136.530s
Loaded transcript with 25 entries

Windows summary (first 8 windows):
Window 0: 0.000s - 1.000s | frames [0, 9, 19, 29] times ['0.000', '0.300', '0.633', '0.967'] | audio_samples [0, 16000] | words 1
  * 'Hai ini dia yang katanya teks editor pengganti VS Code yang udah built-in AI nya gratis tanpa' (0.000-5.360)
Window 1: 0.500s - 1.500s | frames [15, 24, 34, 44] times ['0.500', '0.800', '1.133', '1.467'] | audio_samples [8000, 24000] | words 1
  * 'Hai ini dia yang katanya teks editor pengganti VS Code yang udah built-in AI nya gratis tanpa' (0.000-5.360)
Window 2: 1.000s - 2.000s | frames [30, 39, 49, 59] times ['1.000', '1.300', '1.633', '1.967'] | audio_samples [16000, 32000] | words 1
  * 'Hai ini dia yang katanya teks editor pengganti VS Code yang udah built-in AI nya gratis tanpa' (0.000-5.360)
Window 3: 1.500s - 2.500s | frames [45, 54, 64, 74

In [10]:
# Reload the table from the CSV written by the previous cell.
merge_df = pd.read_csv(window_combined_path)
# Drop only rows where the window JSON wasn't produced
# (window_path is empty for skipped and failed videos).
before = len(merge_df)
merge_df = merge_df.dropna(subset=['window_path']).reset_index(drop=True)
after = len(merge_df)
print(f"Filtered merge_df from {before} to {after} rows keeping successful windows.")

Filtered merge_df from 40 to 40 rows keeping successful windows.


In [11]:
# Train/validation split at video level.
# Step 1: make sure a canonical 'label' column exists, derived from the emotion columns.
if 'label' not in merge_df.columns:
    available = set(merge_df.columns)
    if 'emotion' in available:
        merge_df['label'] = merge_df['emotion']
    elif 'emotion_x' in available:
        # Gaps in emotion_x are filled from emotion_y when that column exists, else with "".
        fallback = merge_df.get('emotion_y', "")
        merge_df['label'] = merge_df['emotion_x'].fillna(fallback)
    else:
        raise RuntimeError("No label/emotion column found in merged dataframe.")

# Step 2: one row per video id, then a stratified 80/20 split of the ids so that
# all rows of a video end up in the same split.
from sklearn.model_selection import train_test_split
vids = merge_df.drop_duplicates(subset=['id'])[['id', 'label']].reset_index(drop=True)
video_ids = vids['id'].tolist()
video_labels = vids['label'].tolist()
train_vids, val_vids = train_test_split(
    video_ids, test_size=0.2, random_state=42, stratify=video_labels
)

in_train = merge_df['id'].isin(train_vids)
in_val = merge_df['id'].isin(val_vids)
train_df = merge_df[in_train].reset_index(drop=True)
val_df = merge_df[in_val].reset_index(drop=True)
print(f"Train videos: {len(train_vids)}, Val videos: {len(val_vids)}")

# Step 3: persist both splits next to the other CSVs.
for split_name, split_frame in (('train_df.csv', train_df), ('val_df.csv', val_df)):
    split_frame.to_csv(os.path.join(csv_directory, split_name), index=False)

Train videos: 32, Val videos: 8


In [12]:
# Reload both splits from the CSV files written by the splitting cell.
train_df = pd.read_csv(os.path.join(csv_directory, 'train_df.csv'))
val_df = pd.read_csv(os.path.join(csv_directory, 'val_df.csv'))

In [13]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,id,path,emotion_x,audio_path,emotion_y,transcript_path,emotion,window_path,status,reason,label
0,802,D:/Satria_Data/train/videos/802_Anger.mp4,Anger,D:/Satria_Data/train/preprocess/audio/802_Ange...,Anger,D:/Satria_Data/train/preprocess/transcripts/80...,Anger,D:\Satria_Data\train\windows\802_windows.json,ok,,Anger
1,343,D:/Satria_Data/train/videos/343_Anger.mp4,Anger,D:/Satria_Data/train/preprocess/audio/343_Ange...,Anger,D:/Satria_Data/train/preprocess/transcripts/34...,Anger,D:\Satria_Data\train\windows\343_windows.json,ok,,Anger
2,734,D:/Satria_Data/train/videos/734_Anger.mp4,Anger,D:/Satria_Data/train/preprocess/audio/734_Ange...,Anger,D:/Satria_Data/train/preprocess/transcripts/73...,Anger,D:\Satria_Data\train\windows\734_windows.json,ok,,Anger
3,481,D:/Satria_Data/train/videos/481_Anger.mp4,Anger,D:/Satria_Data/train/preprocess/audio/481_Ange...,Anger,D:/Satria_Data/train/preprocess/transcripts/48...,Anger,D:\Satria_Data\train\windows\481_windows.json,ok,,Anger
4,67,D:/Satria_Data/train/videos/67_Fear.mp4,Fear,D:/Satria_Data/train/preprocess/audio/67_Fear.wav,Fear,D:/Satria_Data/train/preprocess/transcripts/67...,Fear,D:\Satria_Data\train\windows\67_windows.json,ok,,Fear


# video extraction

In [14]:
# Video preprocessing: per-frame pipeline applied to each PIL image before stacking.
# Resize to 224x224, convert to a tensor, then normalize each channel with the
# mean/std below (the values commonly used with ImageNet-pretrained models).
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225])
])

def extract_video_samples(video_path, windows_data, out_dir="frames_dataset"):
    """Save one stacked frame tensor per window of a video.

    For every entry of ``windows_data['windows']`` the frames listed in its
    'frame_indices' are read, converted from BGR to RGB, passed through the
    module-level ``transform`` and stacked into one tensor that is saved as
    ``<out_dir>/video_<index>.pt``. Frames that cannot be read are left out;
    a window with no readable frame is reported and produces no file.

    Args:
        video_path: Path of the video to read.
        windows_data: Dict with a 'windows' list; each window needs 'index'
            and 'frame_indices'.
        out_dir: Output directory; created when it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            # Report instead of silently producing nothing for every window.
            print(f"Warning: cannot open video, no frames extracted: {video_path}")
            return
        for win in windows_data['windows']:
            frames = []
            for idx in win["frame_indices"]:
                cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
                ret, frame = cap.read()
                if not ret:
                    continue
                rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(transform(Image.fromarray(rgb)))
            if not frames:
                print(f"Warning: no readable frames for window {win['index']} of {video_path}; nothing saved")
                continue
            tensor = torch.stack(frames)   # one entry per successfully read frame
            output_path = os.path.join(out_dir, f"video_{win['index']}.pt")
            torch.save(tensor, output_path)
            print(f"Successfully saved video tensor to: {output_path}") # Debug print
    finally:
        # Release the capture even when reading, transforming or saving fails.
        cap.release()

In [15]:
# audio preprocessing
def extract_audio_samples(audio_path, windows_data, out_dir="audio_dataset"):
    """Write one peak-normalized WAV clip per window of an audio file.

    The audio is loaded at 16 kHz. For every entry of
    ``windows_data['windows']`` the sample range in its 'audio_samples' pair is
    sliced out, divided by its peak absolute value (floored at 1e-8) and
    written to ``<out_dir>/audio_<index>.wav``. A window whose range holds no
    samples is reported and written as an empty clip without normalization.

    Args:
        audio_path: Path of the audio file to load.
        windows_data: Dict with a 'windows' list; each window needs 'index'
            and 'audio_samples' ([start, end] sample positions).
        out_dir: Output directory; created when it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    y, sr = librosa.load(audio_path, sr=16000)  # mono, 16kHz
    for win in windows_data['windows']:
        start, end = win["audio_samples"]
        clip = y[start:end]
        if clip.size:
            # optional: normalize volume
            clip = clip / max(abs(clip).max(), 1e-8)
        else:
            # max() of an empty array raises; this happens when the window lies
            # beyond the end of the audio.
            print(f"Warning: window {win['index']} of {audio_path} has no audio samples")
        output_path = os.path.join(out_dir, f"audio_{win['index']}.wav")
        sf.write(output_path, clip, sr)
        print(f"Successfully wrote audio to: {output_path}") # Debug print

In [16]:
def extract_text_samples(windows_data, out_file="text_dataset.txt"):
    """Write the transcript text of every window to a tab-separated text file.

    Each line is ``<window index>\\t<sentence>``, where the sentence is the
    space-joined 'word' values of the window's 'transcript_words', or
    ``[NO_SPEECH]`` when the window has none.

    Args:
        windows_data: Dict with a 'windows' list; each window needs 'index'
            and may carry 'transcript_words' (list of dicts with 'word').
        out_file: Path of the text file to (over)write.
    """
    print(f"Attempting to write text to: {out_file}") # Debug print
    # Write UTF-8 explicitly: the platform default encoding (e.g. cp1252 on
    # Windows) cannot represent every transcript character.
    with open(out_file, "w", encoding="utf8") as f:
        for win in windows_data['windows']:
            words = [w["word"] for w in win.get("transcript_words", [])]
            sentence = " ".join(words) if words else "[NO_SPEECH]"
            f.write(f"{win['index']}\t{sentence}\n")
    print(f"Successfully wrote text to: {out_file}") # Debug print

In [17]:
def extract_data(df, output_base=csv_directory, out_audio_dir=None, out_video_dir=None, out_text_dir=None, out_file=None, batch_size=140):
    """Extract per-window video tensors, audio clips and text for every video in df.

    For each row the window JSON named in 'window_path' is loaded, the per-window
    video tensors and audio clips are written into per-video sub-directories,
    one text file per video is written, and a manifest with one row per window
    is saved as CSV.

    Args:
        df: Frame with at least 'id' and 'window_path'; the emotion is taken
            from 'emotion_x', 'emotion_y' or 'label' (first non-blank string).
        output_base: Directory for the manifest and the default output folders.
        out_audio_dir / out_video_dir / out_text_dir: Output folders. The
            defaults live under ``output_base/preprocess/frames`` and their
            names end in "_train" regardless of which split is passed in.
        out_file: Manifest file name relative to output_base; defaults to
            "frame_manifest.csv" when omitted.
        batch_size: Rows are iterated in chunks of this size (None = one chunk);
            this only affects iteration, not the result.

    Returns:
        The manifest DataFrame with columns id, window_index, video_path,
        audio_path, text_path, emotion. Windows whose tensor or clip file was
        not produced are left out of the manifest.
    """
    def _first_text(row, keys):
        # NaN is truthy, so an `or` chain over the columns would pick NaN;
        # return the first value that is a real, non-blank string instead.
        for key in keys:
            value = row.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
        return ""

    # Resolve the manifest path up front so a missing out_file cannot fail
    # only after all the extraction work has been done.
    if out_file is None:
        out_file = "frame_manifest.csv"
    manifest_path = os.path.join(output_base, out_file)

    out_audio_dir = out_audio_dir or os.path.join(output_base, "preprocess/frames/audio_frames_train")
    out_video_dir = out_video_dir or os.path.join(output_base, "preprocess/frames/video_frames_train")
    out_text_dir  = out_text_dir  or os.path.join(output_base, "preprocess/frames/text_frames_train")
    os.makedirs(out_audio_dir, exist_ok=True)
    os.makedirs(out_video_dir, exist_ok=True)
    os.makedirs(out_text_dir, exist_ok=True)

    framing_rows = {'id':[], 'window_index':[], 'video_path':[], 'audio_path':[], 'text_path':[], 'emotion':[]}

    if batch_size is None:
        batches = [df]
    else:
        batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]

    for batch_df in batches:
        for _, row in batch_df.iterrows():
            file_id = row['id']
            emotion = _first_text(row, ('emotion_x', 'emotion_y', 'label'))
            window_json_path = row.get('window_path')
            # A missing cell is NaN (a float), which os.path.exists cannot take.
            if not isinstance(window_json_path, str) or not os.path.exists(window_json_path):
                print(f"Warning: missing window json for {file_id}")
                continue
            with open(window_json_path, 'r', encoding="utf8") as f:
                windows_data = json.load(f)
            windows = windows_data.get("windows", [])
            # media paths (prefer paths inside JSON if present); the video
            # column of the merged frame is called 'path'.
            video_path = windows_data.get('video') or _first_text(row, ('video_path', 'path'))
            audio_path = windows_data.get('audio') or _first_text(row, ('audio_path',))
            if not video_path or not audio_path:
                print(f"Warning: missing video or audio path for {file_id}")
                continue
            # output subdirs
            vid_audio_dir = os.path.join(out_audio_dir, f"a_{file_id}_{emotion}")
            vid_video_dir = os.path.join(out_video_dir,  f"v_{file_id}_{emotion}")
            os.makedirs(vid_audio_dir, exist_ok=True)
            os.makedirs(vid_video_dir, exist_ok=True)
            # extract per-window files
            extract_video_samples(video_path, windows_data, out_dir=vid_video_dir)
            extract_audio_samples(audio_path, windows_data, out_dir=vid_audio_dir)
            # write text per-window file
            text_file = os.path.join(out_text_dir, f"t_{file_id}_{emotion}.txt")
            with open(text_file, "w", encoding="utf8") as tf:
                for win in windows:
                    words = [w["word"] for w in win.get("transcript_words", [])]
                    sentence = " ".join(words) if words else "[NO_SPEECH]"
                    tf.write(f"{win['index']}\t{sentence}\n")
            # populate CSV rows
            for win in windows:
                idx = win['index']
                video_out = os.path.join(vid_video_dir, f"video_{idx}.pt")
                audio_out = os.path.join(vid_audio_dir, f"audio_{idx}.wav")
                # Only list windows whose files exist, so the manifest never
                # points at a tensor or clip that was not written.
                if not (os.path.exists(video_out) and os.path.exists(audio_out)):
                    print(f"Warning: window {idx} of {file_id} has no extracted video/audio; left out of manifest")
                    continue
                framing_rows['id'].append(file_id)
                framing_rows['window_index'].append(idx)
                framing_rows['video_path'].append(video_out)
                framing_rows['audio_path'].append(audio_out)
                framing_rows['text_path'].append(text_file)
                framing_rows['emotion'].append(emotion)

    out_df = pd.DataFrame(framing_rows)
    out_df.to_csv(manifest_path, index=False)
    print("Saved manifest to", manifest_path)
    return out_df

In [18]:
# Extract per-window video/audio/text files for both splits; each call writes its own manifest CSV.
# NOTE(review): neither call overrides the output directories, so the validation files
# also land in extract_data's default "*_frames_train" folders.
extract_data(train_df, out_file='frame_train_manifest.csv')
extract_data(val_df, out_file='frame_val_manifest.csv')

Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_0.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_1.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_2.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_3.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_4.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_5.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_6.pt
Successfully saved video tensor to: D:\Satria_Data\train\preprocess/frames/video_frames_train\v_802_Anger/video_7.pt
Successfully saved video tensor to: D:\Satria_Data\train\preproc

Unnamed: 0,id,window_index,video_path,audio_path,text_path,emotion
0,739,0,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Anger
1,739,1,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Anger
2,739,2,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Anger
3,739,3,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Anger
4,739,4,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Anger
...,...,...,...,...,...,...
699,325,103,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Trust
700,325,104,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Trust
701,325,105,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Trust
702,325,106,D:\Satria_Data\train\preprocess/frames/video_f...,D:\Satria_Data\train\preprocess/frames/audio_f...,D:\Satria_Data\train\preprocess/frames/text_fr...,Trust


In [19]:
# Sanity check: reload the validation manifest and count missing values per column.
coba_df = pd.read_csv(r"D:\Satria_Data\train\frame_val_manifest.csv")
coba_df.isna().sum()

id              0
window_index    0
video_path      0
audio_path      0
text_path       0
emotion         0
dtype: int64

# K-fold data split

In [20]:
# make_stratified_folds.py
import pandas as pd
import json
from sklearn.model_selection import StratifiedKFold

def make_stratified_folds(metadata_csv, n_splits=5, seed=42, out_path="folds.json"):
    """
    Split the videos listed in a metadata CSV into stratified folds and save them as JSON.

    metadata_csv must contain at least the columns: id, label
    Produces out_path with entries: [{fold:0, train:[id...], test:[id...]}, ...]

    Args:
        metadata_csv: CSV path; when an id occurs in several rows only its
            first row is used.
        n_splits: Number of folds handed to StratifiedKFold.
        seed: random_state of the shuffled StratifiedKFold.
        out_path: Path of the JSON file to write.

    Returns:
        The list of fold dicts that was written to out_path.

    Raises:
        ValueError: If the CSV lacks the 'id' or 'label' column.
    """
    df = pd.read_csv(metadata_csv)
    # Validate with an explicit exception: assert statements are removed under -O.
    missing_cols = {"id", "label"} - set(df.columns)
    if missing_cols:
        raise ValueError(f"CSV must have id,label columns; missing: {sorted(missing_cols)}")

    # dedupe ids (in case multiple rows per video)
    df_vid = df.drop_duplicates(subset=["id"]).reset_index(drop=True)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    y = df_vid["label"].values
    vids = df_vid["id"].values

    # .tolist() converts the numpy values to plain Python values that json can serialize.
    folds = [
        {"fold": fold_idx, "train": vids[train_idx].tolist(), "test": vids[test_idx].tolist()}
        for fold_idx, (train_idx, test_idx) in enumerate(skf.split(vids, y))
    ]

    with open(out_path, "w", encoding="utf8") as f:
        json.dump(folds, f, indent=2)
    print(f"Saved {n_splits}-fold splits to {out_path}")
    return folds

In [23]:
# Build 4 stratified folds over the training videos and save them as train_folds.json.
make_stratified_folds("D:/Satria_Data/train/train_df.csv", n_splits=4, seed=42, out_path="D:/Satria_Data/train/train_folds.json")
# Disabled: fold generation for the validation split.
#make_stratified_folds("D:/Satria_Data/train/val_df.csv", n_splits=1,seed=42, out_path='D:/Satria_Data/train/val_folds.json')

Saved 4-fold splits to D:/Satria_Data/train/train_folds.json


[{'fold': 0,
  'train': [802,
   343,
   481,
   67,
   90,
   740,
   112,
   732,
   257,
   91,
   287,
   79,
   604,
   288,
   109,
   18,
   661,
   145,
   378,
   327,
   235,
   242,
   439,
   504],
  'test': [734, 669, 747, 659, 745, 613, 137, 763]},
 {'fold': 1,
  'train': [343,
   734,
   481,
   90,
   669,
   740,
   112,
   747,
   257,
   659,
   287,
   79,
   604,
   745,
   109,
   613,
   661,
   145,
   378,
   327,
   137,
   763,
   242,
   504],
  'test': [802, 67, 732, 91, 288, 18, 235, 439]},
 {'fold': 2,
  'train': [802,
   343,
   734,
   67,
   90,
   669,
   112,
   747,
   732,
   91,
   659,
   287,
   288,
   745,
   109,
   613,
   18,
   145,
   378,
   235,
   137,
   763,
   439,
   504],
  'test': [481, 740, 257, 79, 604, 661, 327, 242]},
 {'fold': 3,
  'train': [802,
   734,
   481,
   67,
   669,
   740,
   747,
   732,
   257,
   91,
   659,
   79,
   604,
   288,
   745,
   613,
   18,
   661,
   327,
   235,
   137,
   763,
   242,
   439],


In [24]:
# validate_folds.py
import json, pandas as pd, collections

def validate_folds(folds_json, metadata_csv):
    """Print the per-class video counts of every fold split and flag problems.

    For each fold and each of its 'train' / 'test' lists this prints the number
    of videos and the class counts, warns about ids that do not occur in the
    metadata, and warns about classes of the metadata that have no video in
    the split.

    Args:
        folds_json: JSON file with a list of {"fold", "train", "test"} dicts.
        metadata_csv: CSV with 'id' and 'label' columns; when an id occurs in
            several rows only its first row is used.
    """
    with open(folds_json, encoding="utf8") as f:
        folds = json.load(f)
    df = pd.read_csv(metadata_csv).drop_duplicates(subset=["id"]).set_index("id")
    label_by_id = df["label"].to_dict()
    # The full label set does not change between splits; compute it once.
    all_labels = set(df["label"].unique())

    for fd in folds:
        fold_idx = fd["fold"]
        for split in ("train", "test"):
            vids = fd[split]
            # Report unknown ids instead of failing with a KeyError on lookup.
            unknown = [v for v in vids if v not in label_by_id]
            counts = collections.Counter(label_by_id[v] for v in vids if v in label_by_id)
            print(f"Fold {fold_idx} {split}: {len(vids)} videos; class counts: {dict(counts)}")
            if unknown:
                print(f"  WARNING: ids in fold {fold_idx} {split} not found in metadata: {unknown}")
            # find missing classes
            missing = all_labels - set(counts)
            if missing:
                print(f"  WARNING: missing classes in fold {fold_idx} {split}: {missing}")
    print("Validation done.")

In [25]:
# Print per-fold class counts for the training folds to check the stratification.
validate_folds("D:/Satria_Data/train/train_folds.json", "D:/Satria_Data/train/train_df.csv")
# Disabled: the same check for the validation folds.
#validate_folds("D:/Satria_Data/train/val_folds.json", "D:/Satria_Data/train/val_df.csv")

Fold 0 train: 24 videos; class counts: {'Anger': 3, 'Fear': 3, 'Joy': 3, 'Neutral': 3, 'Proud': 3, 'Sadness': 3, 'Surprise': 3, 'Trust': 3}
Fold 0 test: 8 videos; class counts: {'Anger': 1, 'Fear': 1, 'Joy': 1, 'Neutral': 1, 'Proud': 1, 'Sadness': 1, 'Surprise': 1, 'Trust': 1}
Fold 1 train: 24 videos; class counts: {'Anger': 3, 'Fear': 3, 'Joy': 3, 'Neutral': 3, 'Proud': 3, 'Sadness': 3, 'Surprise': 3, 'Trust': 3}
Fold 1 test: 8 videos; class counts: {'Anger': 1, 'Fear': 1, 'Joy': 1, 'Neutral': 1, 'Proud': 1, 'Sadness': 1, 'Surprise': 1, 'Trust': 1}
Fold 2 train: 24 videos; class counts: {'Anger': 3, 'Fear': 3, 'Joy': 3, 'Neutral': 3, 'Proud': 3, 'Sadness': 3, 'Surprise': 3, 'Trust': 3}
Fold 2 test: 8 videos; class counts: {'Anger': 1, 'Fear': 1, 'Joy': 1, 'Neutral': 1, 'Proud': 1, 'Sadness': 1, 'Surprise': 1, 'Trust': 1}
Fold 3 train: 24 videos; class counts: {'Anger': 3, 'Fear': 3, 'Joy': 3, 'Neutral': 3, 'Proud': 3, 'Sadness': 3, 'Surprise': 3, 'Trust': 3}
Fold 3 test: 8 videos; cla

In [28]:
import csv
def build_manifest_for_fold(label_file, fold_file, frame_manifest_file, target_fold, data_dir="data", out_dir="artifacts", output_prefix=""):
    """Build and write the per-window manifest CSV for one fold.

    All input files are resolved relative to the notebook-level
    ``csv_directory``. For every labelled video that belongs to the fold and
    has a ``windows/<id>_windows.json`` file, one row is emitted per entry of
    that file's ``"windows"`` list.

    Args:
        label_file: CSV with ``id`` and ``label`` columns.
        fold_file: JSON list of fold records carrying ``"fold"``, ``"train"``
            and ``"test"`` keys.
        frame_manifest_file: CSV with ``id``, ``window_index``, ``video_path``
            and ``audio_path`` columns.
        target_fold: Value of the ``"fold"`` key selecting the fold to build.
        data_dir: Unused; kept so existing callers keep working.
        out_dir: Directory that receives the CSV (created when missing).
        output_prefix: Prefix prepended to the output file name.

    Returns:
        The list of row dicts that were written. An empty list is returned
        when the fold does not exist or when no rows could be generated.
    """
    # paths
    labels_csv = os.path.join(csv_directory, label_file)
    folds_json = os.path.join(csv_directory, fold_file)
    frame_manifest_csv = os.path.join(csv_directory, frame_manifest_file)
    windows_dir = os.path.join(csv_directory, "windows")
    os.makedirs(out_dir, exist_ok=True)

    # --- load labels ---
    df_labels = pd.read_csv(labels_csv)
    # Cast before de-duplicating so the index is unique after normalisation.
    df_labels["id"] = df_labels["id"].astype(str)
    df_labels = df_labels.drop_duplicates(subset=["id"]).set_index("id")
    valid_video_ids = set(df_labels.index)

    # --- load target fold ---
    with open(folds_json, "r") as f:
        folds = json.load(f)

    target_fold_data = next((fd for fd in folds if fd["fold"] == target_fold), None)
    if target_fold_data is None:
        print(f"Warning: fold {target_fold} not found in {folds_json}")
        # Same "nothing built" value as the no-rows case, so callers can
        # always extend/iterate the result.
        return []

    # Map every video of this fold to its split; "test" wins on overlap.
    video_to_split = {str(v): "train" for v in target_fold_data["train"]}
    video_to_split.update((str(v), "test") for v in target_fold_data["test"])

    # --- load frame manifest ---
    df_frames = pd.read_csv(frame_manifest_csv)
    df_frames["id"] = df_frames["id"].astype(str)
    df_frames = df_frames[df_frames["id"].isin(valid_video_ids)]

    frame_path_lookup = {}
    audio_path_lookup = {}
    for vid, window_index, video_path, audio_path in zip(
        df_frames["id"],
        df_frames["window_index"],
        df_frames["video_path"],
        df_frames["audio_path"],
    ):
        key = (vid, int(window_index))
        frame_path_lookup[key] = video_path
        audio_path_lookup[key] = audio_path

    rows = []
    missing_window_files = []

    # Walk the label file in its own order (not a set) so the manifest row
    # order is reproducible from run to run.
    for vid_str in df_labels.index:
        split = video_to_split.get(vid_str)
        if split is None:
            continue  # Skip videos not in this fold

        window_file = os.path.join(windows_dir, f"{vid_str}_windows.json")
        if not os.path.exists(window_file):
            missing_window_files.append(vid_str)
            continue

        label = df_labels.loc[vid_str, "label"]

        with open(window_file, "r", encoding="utf8") as f:
            windows_data = json.load(f)

        for w in windows_data.get("windows", []):
            idx = w.get("index", None)
            key = (vid_str, idx)
            words = w.get("transcript_words", [])
            snippet = " ".join(it.get("word", "") for it in words) if words else ""

            rows.append({
                "video_id": vid_str,
                "window_idx": idx,
                "start": w.get("start", None),
                "end": w.get("end", None),
                "frame_indices": ",".join(map(str, w.get("frame_indices", []))),
                "frames_path": frame_path_lookup.get(key),
                "audio_path": audio_path_lookup.get(key),
                "text_snippet": snippet,
                "label": label,
                "label_idx": None,  # filled in below once the label order is known
                "fold": target_fold,
                "split": split,
                "speech_ratio": w.get("speech_ratio", None),
                "has_face": w.get("has_face", None),
            })

    if missing_window_files:
        print(
            f"Warning: fold {target_fold}: skipped {len(missing_window_files)} "
            f"video(s) without a windows file"
        )

    if not rows:
        print(f"No rows generated for fold {target_fold}")
        return []

    # --- map label->index ---
    # Indices follow the sorted order of all labels in the label file, so
    # they are identical for every fold built from the same file.
    labels = sorted(df_labels["label"].unique())
    label_to_idx = {name: i for i, name in enumerate(labels)}
    for r in rows:
        r["label_idx"] = label_to_idx[r["label"]]

    # Write fold-specific manifest
    path = os.path.join(out_dir, f"{output_prefix}manifest_fold{target_fold}.csv")
    with open(path, "w", newline="", encoding="utf8") as f:
        writer = csv.DictWriter(f, list(rows[0].keys()))
        writer.writeheader()
        writer.writerows(rows)
    print(f"Wrote {path}, rows: {len(rows)}")
    return rows

def build_all_manifests(label_file, fold_file, frame_manifest_file, output_prefix="", out_dir="artifacts"):
    """Build the manifest of every fold plus one combined manifest.

    Calls ``build_manifest_for_fold`` once per record of the folds JSON
    (resolved relative to the notebook-level ``csv_directory``) and then
    writes the concatenation of all returned rows to
    ``<out_dir>/<output_prefix>manifest_all.csv``. Nothing is written when no
    fold produced any row.

    Args:
        label_file: Label CSV name, forwarded to ``build_manifest_for_fold``.
        fold_file: Folds JSON name; also read here to enumerate the folds.
        frame_manifest_file: Frame manifest CSV name, forwarded as well.
        output_prefix: Prefix prepended to every output file name.
        out_dir: Directory receiving both the per-fold and the combined CSVs.
            Defaults to ``"artifacts"``, the location used previously.

    Returns:
        None. The rows are deliberately not returned so that calling this as
        the last statement of a notebook cell does not echo them.
    """
    # Get fold count
    folds_json = os.path.join(csv_directory, fold_file)
    with open(folds_json, "r") as f:
        folds = json.load(f)

    all_rows = []

    # Build each fold separately
    for fold_data in folds:
        rows = build_manifest_for_fold(
            label_file,
            fold_file,
            frame_manifest_file,
            fold_data["fold"],
            out_dir=out_dir,
            output_prefix=output_prefix,
        )
        # Tolerate a falsy/None result from the per-fold builder instead of
        # failing with a TypeError in extend().
        all_rows.extend(rows or [])

    if not all_rows:
        return

    # Create combined manifest_all.csv next to the per-fold manifests
    os.makedirs(out_dir, exist_ok=True)
    manifest_all = os.path.join(out_dir, f"{output_prefix}manifest_all.csv")
    with open(manifest_all, "w", newline="", encoding="utf8") as f:
        writer = csv.DictWriter(f, list(all_rows[0].keys()))
        writer.writeheader()
        writer.writerows(all_rows)
    print(f"Wrote {manifest_all}, total rows: {len(all_rows)}")

In [29]:
# Build the per-fold and combined manifests for the training set; outputs are
# prefixed with "train_". The validation-set build is kept below, disabled.
build_all_manifests("train_df.csv", "train_folds.json", "frame_train_manifest.csv", output_prefix="train_")
#build_all_manifests("val_df.csv", "val_folds.json", "frame_val_manifest.csv", output_prefix="val_")

Wrote artifacts\train_manifest_fold0.csv, rows: 4271
Wrote artifacts\train_manifest_fold1.csv, rows: 4271
Wrote artifacts\train_manifest_fold2.csv, rows: 4271
Wrote artifacts\train_manifest_fold3.csv, rows: 4271
Wrote artifacts\train_manifest_all.csv, total rows: 17084


# Datasets & Data Loader

In [30]:
import pandas as pd, json
from datasets import build_label_encoder

# Load the combined training manifest and build the label <-> index mappings
# from the distinct labels it contains.
# NOTE(review): build_label_encoder comes from the project's `datasets` module;
# whether its indices agree with the manifest's own `label_idx` column cannot
# be seen from here - confirm before mixing the two.
manifest = pd.read_csv("artifacts/train_manifest_all.csv")
label2idx, idx2label = build_label_encoder(manifest["label"].unique())

# Persist the label -> index mapping; the training cells reload it from this file.
with open("artifacts/label2idx.json","w") as f: json.dump(label2idx,f)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Run this in the specialist-training code: reload the saved
# label -> index mapping from disk.
import json
with open("artifacts/label2idx.json") as f:
    label2idx = json.load(f)

In [None]:
import os

import torch.nn as nn  # needed for nn.Linear below; not imported by any earlier cell shown
from torchvision import models

# Pretrained ResNet-18 with its final layer replaced by an 8-way classifier.
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13 in
# favour of the `weights=` argument; left unchanged because the installed
# torchvision version is not visible here.
resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Linear(resnet.fc.in_features, 8)  # 8 emotions

# NOTE(review): FaceDataset and train_specialist must already be defined or
# imported when this cell runs. The validation manifest read below is only
# produced if the (currently commented-out) "val_" manifest build is executed.
train_ds = FaceDataset("artifacts/train_manifest_all.csv", split="train", fold=0, label2idx=label2idx)
val_ds   = FaceDataset("artifacts/val_manifest_all.csv", split="test", fold=0, label2idx=label2idx)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=4)

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before spending time on training.
os.makedirs("checkpoints", exist_ok=True)

model = train_specialist(resnet, train_loader, val_loader, num_epochs=10, lr=1e-4)
torch.save(model.state_dict(), "checkpoints/face_fold0.pth")


In [None]:
# If you want to build your own model:
# 1. Start with this import and load the saved label -> index mapping
# (run this in the specialist-training code)
import json
with open("artifacts/label2idx.json") as f:
    label2idx = json.load(f)
    

# 2. This part feeds the input into the model.
# It plays a role similar to model compilation in Keras/TensorFlow.
# It is written as a function with these defaults so that feeding data into the model is easy.
# It is set up generically so you have more flexibility in how you supply the data.
# You can also simply adjust the hyperparameters, e.g. num_epochs, the learning rate, etc.
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

def train_specialist(model, train_loader, val_loader, num_epochs=5, lr=1e-4):
    """Train ``model`` with AdamW and cross-entropy, validating every epoch.

    Every batch yielded by either loader must unpack into three items; only
    the first two (inputs and integer targets) are used. After each epoch the
    mean training loss and the macro-averaged F1 on ``val_loader`` are
    printed.

    Args:
        model: Module producing class logits for a batch of inputs.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to AdamW.

    Returns:
        The model (moved to the module-level ``device``) as it stands after
        the last epoch.
    """
    model = model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        # ---- Training pass ----
        model.train()
        running_loss = 0
        for inputs, targets, _ in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            opt.zero_grad()
            batch_loss = loss_fn(model(inputs), targets)
            batch_loss.backward()
            opt.step()
            running_loss += batch_loss.item()
        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch} train loss: {mean_loss:.4f}")

        # ---- Validation pass (no gradients) ----
        model.eval()
        predicted = []
        expected = []
        with torch.no_grad():
            for inputs, targets, _ in val_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                batch_pred = model(inputs).argmax(dim=1)
                predicted.extend(batch_pred.cpu().numpy())
                expected.extend(targets.cpu().numpy())
        macro_f1 = f1_score(expected, predicted, average="macro")
        print(f"Epoch {epoch} val macro-F1: {macro_f1:.4f}")

    return model

# 3. Build the model: either use a pretrained model or write your own.
# ResNet-18 is used here as the example.
# As usual, keep the model in a variable (`model`, or any name that stands for it).
import os

from torchvision import models
from datasets import TextDataset

# Pretrained ResNet
# Prepare the pretrained model and replace its head with an 8-way classifier.
# NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13 in
# favour of the `weights=` argument; left unchanged because the installed
# torchvision version is not visible here.
resnet = models.resnet18(pretrained=True)
resnet.fc = nn.Linear(resnet.fc.in_features, 8)  # 8 emotions

# Prepare the dataset and dataloader that feed the model; these are required.
# NOTE(review): FaceDataset is used here but only TextDataset is imported
# above - presumably FaceDataset lives in the same `datasets` module; confirm
# and import it. The validation manifest is only produced if the (currently
# commented-out) "val_" manifest build is executed.
train_ds = FaceDataset("artifacts/train_manifest_all.csv", split="train", fold=0, label2idx=label2idx)
val_ds   = FaceDataset("artifacts/val_manifest_all.csv", split="test", fold=0, label2idx=label2idx)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4)
val_loader   = DataLoader(val_ds, batch_size=32, shuffle=False, num_workers=4)


# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before spending time on training.
os.makedirs("checkpoints", exist_ok=True)

# 4. Finally, call train_specialist with the loaders built above and the model you prepared.
model = train_specialist(resnet, train_loader, val_loader, num_epochs=10, lr=1e-4)
torch.save(model.state_dict(), "checkpoints/face_fold0.pth")