In [2]:
# %% [markdown]
# # Multimodal Harmful Video Detection — Preprocess & Train
# This notebook processes three datasets (MultiHateClip EN, HateMM, HateClipSeg),
# unifies metadata, extracts audio/frames/text, and trains simple baselines.
# Runtime: Kaggle (GPU T4). Make sure "Accelerator: GPU" is ON.

import sys, platform, subprocess, os, shutil

print("Python:", sys.version)
print("Platform:", platform.platform())
print("CUDA visible:", os.environ.get("CUDA_VISIBLE_DEVICES"))
!nvidia-smi -L || true
!ffmpeg -version | head -n 1 || true

Python: 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
Platform: Linux-6.6.56+-x86_64-with-glibc2.35
CUDA visible: None
GPU 0: Tesla T4 (UUID: GPU-5925937c-09a8-6d89-19b9-f71168e72bb6)
GPU 1: Tesla T4 (UUID: GPU-11f0a912-56f6-8367-9cea-d9a49160448c)
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers


In [3]:
# %% [markdown]
# ## Install dependencies
# We use PyTorch + transformers + librosa + faster-whisper, and some utils.

!pip -q install --upgrade pip
!pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip -q install transformers==4.43.3 librosa==0.10.2.post1 soundfile==0.12.1 \
                     opencv-python==4.10.0.84 faster-whisper==1.0.3 \
                     pandas pyarrow numpy tqdm joblib scikit-learn ffmpeg-python==0.2.0

In [4]:
# %% [markdown]
# ## Directory config (EDIT: set RAW_BASE to your Kaggle mounted dataset name)
from pathlib import Path

ROOT = Path("/kaggle/working")
DATA = ROOT / "data"
RAW = DATA / "raw"
PROC = DATA / "processed"
META = DATA / "metadata"
for p in [RAW, PROC, META]:
    p.mkdir(parents=True, exist_ok=True)

In [5]:
# >>> IMPORTANT <<<
# Mount your whole local tree as a Kaggle dataset (e.g. "preprocess_lab").
# Then set RAW_BASE to that input path:
RAW_BASE = Path("/kaggle/input/dataset-sample/preprocess_lab")  # <-- EDIT to your actual mount name

# Your real structure inside RAW_BASE (mirrors what you showed):
# HateClipSeg:
HCS_ROOT   = RAW_BASE / "data" / "raw" / "HateClipSeg"
HCS_VLEVEL = HCS_ROOT / "data" / "video_level"    # full videos
HCS_SLEVEL = HCS_ROOT / "data" / "segment_level"  # may contain same videos or pre-cut clips
HCS_SEGCSV = HCS_ROOT / "segment_level_annotation.csv"
HCS_VIDCSV = HCS_ROOT / "video_level_annotation.csv"

# HateMM:
HMM_ROOT   = RAW_BASE / "data" / "raw" / "HateMM"
HMM_ANN    = HMM_ROOT / "HateMM_annotation.csv"
HMM_HATE   = HMM_ROOT / "data" / "hate_videos"
HMM_NHATE  = HMM_ROOT / "data" / "non_hate_videos"

# MultiHateClip:
MHC_ROOT   = RAW_BASE / "data" / "raw" / "MultiHateClip"
MHC_DATA   = MHC_ROOT / "data"
MHC_TRAIN  = MHC_DATA / "train"
MHC_VALID  = MHC_DATA / "valid"
MHC_TEST   = MHC_DATA / "test"
MHC_TRAIN_TSV = MHC_ROOT / "train.tsv"
MHC_VALID_TSV = MHC_ROOT / "valid.tsv"
MHC_TEST_TSV  = MHC_ROOT / "test.tsv"

# Processing knobs
FPS = 1
FRAME_SIZE = (224, 224)
SAMPLE_RATE = 16000
ASR_MODEL = "small.en"
MIN_DUR = 1.5

EXTRACT_FRAMES = True
EXTRACT_AUDIO  = True
EXTRACT_MELS   = True
RUN_ASR        = True
CUT_SEGMENTS   = True  # set False if HCS segment_level already provides pre-cut segments

In [13]:
# %% [markdown]
# ## Utilities: ffmpeg, mel, ASR
import subprocess, librosa, numpy as np, soundfile as sf
from tqdm import tqdm

def run(cmd): subprocess.run(cmd, check=True)

def hhmmss(seconds: float) -> str:
    ms = int(round(seconds * 1000))
    s = ms/1000.0
    hh = int(s//3600); s -= hh*3600
    mm = int(s//60); s -= mm*60
    return f"{hh:02d}:{mm:02d}:{s:06.3f}"

def ffmpeg_extract_audio(in_mp4: Path, out_wav: Path, sr=16000):
    out_wav.parent.mkdir(parents=True, exist_ok=True)
    run(["ffmpeg","-hide_banner","-loglevel","error","-i",str(in_mp4),
         "-ac","1","-ar",str(sr),"-vn","-y",str(out_wav)])

def ffmpeg_extract_frames(in_mp4: Path, out_dir: Path, fps=1, size=(224,224)):
    out_dir.mkdir(parents=True, exist_ok=True)
    w,h = size
    run(["ffmpeg","-hide_banner","-loglevel","error","-i",str(in_mp4),
         "-r",str(fps),"-vf",f"scale={w}:{h}",str(out_dir / "%06d.jpg")])

def ffmpeg_cut_segment(in_mp4: Path, start_s: float, dur_s: float, out_mp4: Path):
    out_mp4.parent.mkdir(parents=True, exist_ok=True)
    run(["ffmpeg","-hide_banner","-loglevel","error","-ss",hhmmss(start_s),
         "-i",str(in_mp4),"-t",f"{dur_s:.3f}",
         "-c:v","libx264","-preset","veryfast","-crf","23","-c:a","aac","-y",str(out_mp4)])

def save_mel(wav_path: Path, out_npy: Path, sr=16000, n_mels=64):
    y, sr_ = librosa.load(wav_path, sr=sr, mono=True)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    S_db = librosa.power_to_db(S, ref=np.max)
    out_npy.parent.mkdir(parents=True, exist_ok=True)
    np.save(out_npy, S_db)

from faster_whisper import WhisperModel
class ASR:
    def __init__(self, model_size="small.en", device="auto", compute_type="float16"):
        self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
    def transcribe(self, audio_path: Path) -> str:
        segs, _ = self.model.transcribe(str(audio_path), beam_size=5, vad_filter=True)
        return " ".join([s.text.strip() for s in segs]).strip()

In [14]:
# %% [markdown]
# ## Label mapping helpers
import pandas as pd
from pathlib import Path

def map_mhc_bin(lbl: str) -> int:
    return 0 if str(lbl).strip().lower()=="normal" else 1

def map_hmm_bin(lbl: str) -> int:
    return 1 if str(lbl).strip().lower() in ["hate","hateful","offensive"] else 0

def map_hcs_multiclass(lbl: str) -> str:
    L = str(lbl).strip().lower()
    if L in ["normal","hateful","insulting","sexual","violence","self-harm"]:
        return L.capitalize()
    return "Normal"

def map_hcs_bin(lbl: str) -> int:
    return 0 if str(lbl).strip().lower()=="normal" else 1

In [15]:
# %% [markdown]
# ## HateClipSeg: robust parser for ['Video Id', 'Segment-Level Label', 'Segment Timestamp']
# Handles many formats, including nested-list strings and missing commas.

import re, ast, json
import pandas as pd
from pathlib import Path

assert HCS_SEGCSV.exists(), f"Missing {HCS_SEGCSV}"
df_raw = pd.read_csv(HCS_SEGCSV)

REQ_COLS = ["Video Id", "Segment-Level Label", "Segment Timestamp"]
missing = [c for c in REQ_COLS if c not in df_raw.columns]
assert not missing, f"CSV missing columns: {missing} | Found: {list(df_raw.columns)}"

DEFAULT_HALF_WIN = 1.5  # seconds for single-timestamp cases

# ---------- time parsers ----------
def time_to_seconds(token: str) -> float:
    """Accepts 'hh:mm:ss(.ms)', 'mm:ss(.ms)', or plain float seconds."""
    t = token.strip()
    # HH:MM:SS(.ms)
    if re.match(r"^\d{1,2}:\d{1,2}:\d{1,2}(\.\d+)?$", t):
        h, m, s = t.split(":")
        return int(h) * 3600 + int(m) * 60 + float(s)
    # MM:SS(.ms)
    if re.match(r"^\d{1,2}:\d{1,2}(\.\d+)?$", t):
        m, s = t.split(":")
        return int(m) * 60 + float(s)
    # float seconds
    return float(t)

def normalize_list_string(s: str) -> str:
    """
    Normalize weird list-like strings:
      - Remove double brackets [[...]] -> [...]
      - Ensure commas between items if only whitespace separates numbers/ times
    """
    x = s.strip()
    # Strip outer brackets multiple levels
    x = re.sub(r"^\s*\[+\s*", "[", x)
    x = re.sub(r"\s*\]+\s*$", "]", x)

    # If there are quotes-separated tokens without comma, insert comma
    # e.g., "['0.00' '3.00']" -> "['0.00','3.00']"
    x = re.sub(r"'\s+'", "','", x)
    x = re.sub(r'"\s+"', '","', x)

    # Also handle bare numbers/times separated by whitespace inside brackets: [0.0 3.0] -> [0.0,3.0]
    # Only apply inside [...] to avoid affecting normal strings
    def _insert_commas_inside_brackets(m):
        inner = m.group(1)
        # if already has comma, keep
        if "," in inner:
            return "[" + inner + "]"
        # put comma between number/time tokens separated by whitespace
        tokens = inner.strip().split()
        return "[" + ",".join(tokens) + "]"

    x = re.sub(r"\[\s*([^\[\]]+?)\s*\]", lambda m: _insert_commas_inside_brackets(m), x)
    return x

def try_parse_as_list(ts: str):
    """
    Try to parse ts as JSON or Python literal list.
    Returns list[str] or None.
    """
    s = normalize_list_string(ts)
    # Try JSON first
    try:
        val = json.loads(s)
        # flatten 2D lists like [["0.00","3.00"]]
        while isinstance(val, list) and len(val)==1 and isinstance(val[0], list):
            val = val[0]
        if isinstance(val, list):
            return [str(v) for v in val]
    except Exception:
        pass
    # Try Python literal
    try:
        val = ast.literal_eval(s)
        while isinstance(val, list) and len(val)==1 and isinstance(val[0], list):
            val = val[0]
        if isinstance(val, list):
            return [str(v) for v in val]
    except Exception:
        pass
    return None

def parse_segment_timestamp(ts) -> tuple:
    """
    Return (start_s, end_s) floats.
    Strategy:
      1) If list-like -> take first two entries.
      2) Else split by separators (-, –, —, 'to', comma) and parse first two parts.
      3) Else single timestamp -> ±DEFAULT_HALF_WIN.
    """
    if pd.isna(ts) or str(ts).strip()=="":
        return None, None
    s = str(ts).strip()

    # (1) list-like?
    as_list = try_parse_as_list(s)
    if as_list:
        if len(as_list) >= 2:
            a, b = as_list[0], as_list[1]
            a_s, b_s = time_to_seconds(a), time_to_seconds(b)
            if b_s < a_s: a_s, b_s = b_s, a_s
            return a_s, b_s
        elif len(as_list) == 1:
            t = time_to_seconds(as_list[0])
            return max(0.0, t-DEFAULT_HALF_WIN), t+DEFAULT_HALF_WIN

    # (2) split by common separators
    s_norm = re.sub(r"\s*(–|—|to)\s*", "-", s, flags=re.IGNORECASE)
    parts = [p for chunk in s_norm.split(",") for p in chunk.split("-")]
    parts = [p.strip() for p in parts if p.strip()!=""]
    if len(parts) >= 2:
        a_s, b_s = time_to_seconds(parts[0]), time_to_seconds(parts[1])
        if b_s < a_s: a_s, b_s = b_s, a_s
        return a_s, b_s
    elif len(parts) == 1:
        t = time_to_seconds(parts[0])
        return max(0.0, t-DEFAULT_HALF_WIN), t+DEFAULT_HALF_WIN

    return None, None

def resolve_hcs_video_path(video_id: str) -> Path:
    """
    Map 'Video Id' to actual .mp4 file.
    Prefer video_level/, fallback to segment_level/. Try explicit 'bit_' prefix too.
    """
    v = str(video_id).strip()
    candidates = []
    if v.endswith(".mp4"):
        candidates.append(v)
    else:
        candidates += [f"{v}.mp4", f"bit_{v}.mp4"]

    for name in candidates:
        p1 = HCS_VLEVEL / name
        if p1.exists(): return p1
        p2 = HCS_SLEVEL / name
        if p2.exists(): return p2
    # last-shot: as-is in video_level
    return HCS_VLEVEL / (v if v.endswith(".mp4") else f"{v}.mp4")

# ---------- build output & collect rejects ----------
rows, rejects = [], []
for i, r in df_raw.iterrows():
    vid = r["Video Id"]
    label = r["Segment-Level Label"]
    ts = r["Segment Timestamp"]

    try:
        start_s, end_s = parse_segment_timestamp(ts)
        if start_s is None or end_s is None:
            raise ValueError(f"Unparsed ts: {ts}")
        vpath = resolve_hcs_video_path(vid)
        stem = Path(vpath).stem
        seg_id = f"{stem}_{int(round(start_s*1000))}_{int(round(end_s*1000))}"
        rows.append({
            "seg_id": seg_id,
            "video_path": str(vpath),
            "start": float(start_s),
            "end": float(end_s),
            "label": str(label),
            "target_group": ""
        })
    except Exception as e:
        rejects.append({"idx": i, "Video Id": vid, "Segment-Level Label": label, "Segment Timestamp": ts, "error": str(e)})

hcs_seg_df = pd.DataFrame(rows)
print("HateClipSeg segments parsed:", hcs_seg_df.shape)
display(hcs_seg_df.head(5))

# Save rejects for inspection, if any
rej_path = META / "hcs_timestamp_rejects.csv"
if rejects:
    pd.DataFrame(rejects).to_csv(rej_path, index=False)
    print(f"Saved rejects -> {rej_path}  (count={len(rejects)})")
else:
    print("No rejects 🎉")


HateClipSeg segments parsed: (2, 6)


Unnamed: 0,seg_id,video_path,start,end,label,target_group
0,bit_tD1tyOy1HOJH_0_201160,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,201.16,"[[1, 0, 0, 0, 0, 0]]",
1,bit_ty1ffKOFCEnl_0_290550,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,290.55,"[[1, 0, 0, 0, 0, 0]]",


Saved rejects -> /kaggle/working/data/metadata/hcs_timestamp_rejects.csv  (count=433)


In [17]:
# %% [markdown]
# ## HateMM: parser for ['video_file_name', 'label', 'hate_snippet', 'target']
# - Resolves video path under data/{hate_videos, non_hate_videos}
# - Parses optional timestamps from 'hate_snippet' if present; else uses full segment.

import re, ast, json
import pandas as pd
from pathlib import Path

assert HMM_ANN.exists(), f"Missing {HMM_ANN}"
hmm_raw = pd.read_csv(HMM_ANN)

REQ = ["video_file_name", "label"]
missing = [c for c in REQ if c not in hmm_raw.columns]
assert not missing, f"CSV missing columns: {missing} | Found: {list(hmm_raw.columns)}"

# ---------- time helpers (reuse logic from HCS robust parser) ----------
DEFAULT_HALF_WIN = 1.5  # seconds

def time_to_seconds(token: str) -> float:
    t = str(token).strip()
    if re.match(r"^\d{1,2}:\d{1,2}:\d{1,2}(\.\d+)?$", t):
        h, m, s = t.split(":"); return int(h)*3600 + int(m)*60 + float(s)
    if re.match(r"^\d{1,2}:\d{1,2}(\.\d+)?$", t):
        m, s = t.split(":"); return int(m)*60 + float(s)
    return float(t)

def normalize_list_string(s: str) -> str:
    x = s.strip()
    x = re.sub(r"^\s*\[+\s*", "[", x)
    x = re.sub(r"\s*\]+\s*$", "]", x)
    x = re.sub(r"'\s+'", "','", x)
    x = re.sub(r'"\s+"', '","', x)
    def _inside(m):
        inner = m.group(1)
        if "," in inner: return "[" + inner + "]"
        toks = inner.strip().split()
        return "[" + ",".join(toks) + "]"
    x = re.sub(r"\[\s*([^\[\]]+?)\s*\]", lambda m: _inside(m), x)
    return x

def try_parse_as_list(ts: str):
    s = normalize_list_string(ts)
    try:
        val = json.loads(s)
        while isinstance(val, list) and len(val)==1 and isinstance(val[0], list):
            val = val[0]
        if isinstance(val, list): return [str(v) for v in val]
    except Exception:
        pass
    try:
        val = ast.literal_eval(s)
        while isinstance(val, list) and len(val)==1 and isinstance(val[0], list):
            val = val[0]
        if isinstance(val, list): return [str(v) for v in val]
    except Exception:
        pass
    return None

def parse_timestamp_flexible(ts) -> tuple:
    """
    Try to extract (start,end) from hate_snippet if it looks time-like.
    Returns (start_s, end_s) or (None, None) if not parseable.
    """
    if ts is None or (isinstance(ts, float) and pd.isna(ts)): 
        return None, None
    s = str(ts).strip()
    if s == "": return None, None

    # 1) list-like?
    as_list = try_parse_as_list(s)
    if as_list:
        if len(as_list) >= 2:
            a, b = as_list[0], as_list[1]
            a_s, b_s = time_to_seconds(a), time_to_seconds(b)
            if b_s < a_s: a_s, b_s = b_s, a_s
            return a_s, b_s
        elif len(as_list) == 1:
            t = time_to_seconds(as_list[0])
            return max(0.0, t-DEFAULT_HALF_WIN), t+DEFAULT_HALF_WIN

    # 2) split by separators: '-', en-dash, 'to', comma
    s_norm = re.sub(r"\s*(–|—|to)\s*", "-", s, flags=re.IGNORECASE)
    parts = [p for chunk in s_norm.split(",") for p in chunk.split("-")]
    parts = [p.strip() for p in parts if p.strip()!=""]
    # Accept only if parts look like time tokens (avoid random text)
    def looks_time(x): 
        return bool(re.match(r"^\d+(\.\d+)?$", x) or re.match(r"^\d{1,2}:\d{1,2}(:\d{1,2}(\.\d+)?)?$", x))
    parts_time = [p for p in parts if looks_time(p)]
    if len(parts_time) >= 2:
        a_s, b_s = time_to_seconds(parts_time[0]), time_to_seconds(parts_time[1])
        if b_s < a_s: a_s, b_s = b_s, a_s
        return a_s, b_s
    if len(parts_time) == 1:
        t = time_to_seconds(parts_time[0])
        return max(0.0, t-DEFAULT_HALF_WIN), t+DEFAULT_HALF_WIN

    return None, None

# ---------- path resolver ----------
def resolve_hmm_path(basename: str) -> Path:
    """
    Try both hate and non_hate dirs; handle presence/absence of .mp4.
    """
    v = str(basename).strip()
    cands = [v, f"{v}.mp4"] if not v.endswith(".mp4") else [v]
    for name in cands:
        p = HMM_HATE / name
        if p.exists(): return p
        p2 = HMM_NHATE / name
        if p2.exists(): return p2
    # last fallback
    return HMM_HATE / (v if v.endswith(".mp4") else f"{v}.mp4")

# ---------- build rows ----------
rows, rejects = [], []
has_target = "target" in hmm_raw.columns
has_snip   = "hate_snippet" in hmm_raw.columns

for i, r in hmm_raw.iterrows():
    vid   = r["video_file_name"]
    label = r["label"]
    tgt   = (str(r["target"]) if has_target and pd.notna(r["target"]) else "")
    snip  = (r["hate_snippet"] if has_snip else None)

    vpath = resolve_hmm_path(vid)

    # Try to parse timestamps from snippet; else full segment
    start_s, end_s = parse_timestamp_flexible(snip)
    if start_s is None or end_s is None:
        # mark as full; preprocess step will copy full video (dur=None sentinel)
        seg_id = f"{Path(vpath).stem}_full"
        rows.append({
            "seg_id": seg_id,
            "video_path": str(vpath),
            "start": 0.0,
            "end": 1e9,  # sentinel for full
            "label": str(label),
            "target": tgt
        })
    else:
        seg_id = f"{Path(vpath).stem}_{int(round(start_s*1000))}_{int(round(end_s*1000))}"
        rows.append({
            "seg_id": seg_id,
            "video_path": str(vpath),
            "start": float(start_s),
            "end": float(end_s),
            "label": str(label),
            "target": tgt
        })

hmm_seg_df = pd.DataFrame(rows)
print("HateMM segments parsed:", hmm_seg_df.shape)
display(hmm_seg_df.head(5))

# Save lines where snippet looked time-like but failed (none in this pass since we fall back).
rej_path = META / "hatemm_timestamp_rejects.csv"
if rejects:
    pd.DataFrame(rejects).to_csv(rej_path, index=False)
    print(f"Saved rejects -> {rej_path}")
else:
    print("No rejects 🎉")


HateMM segments parsed: (1083, 6)


Unnamed: 0,seg_id,video_path,start,end,label,target
0,hate_video_1_34000_94000,/kaggle/input/dataset-sample/preprocess_lab/da...,34.0,94.0,Hate,Blacks
1,hate_video_2_6000_126000,/kaggle/input/dataset-sample/preprocess_lab/da...,6.0,126.0,Hate,Blacks
2,non_hate_video_1_full,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,1000000000.0,Non Hate,Others
3,hate_video_3_full,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,1000000000.0,Hate,Blacks
4,non_hate_video_2_full,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,1000000000.0,Non Hate,Blacks


No rejects 🎉


In [18]:
# %% [markdown]
# ## MultiHateClip: parse `train.tsv`, `valid.tsv`, `test.tsv`
# Expected per line: <video_id>\t<label>. Map to data/{train,valid,test}.

import csv  # <-- bổ sung để dùng csv.reader
import pandas as pd
from pathlib import Path

def read_tsv(tsv_path: Path):
    rows=[]
    with open(tsv_path, newline='', encoding='utf-8') as f:
        rd = csv.reader(f, delimiter='\t')
        for row in rd:
            if not row: 
                continue
            # bỏ header nếu có
            if len(row) >= 2 and row[0].lower() in ["video_id","id"] and row[1].lower() in ["label","class"]:
                continue
            # một số tệp thực chất là csv chứ không phải tsv
            if len(row) < 2 and ',' in row[0]:
                parts = row[0].split(',')
                if len(parts) >= 2:
                    rows.append({"video_id": parts[0].strip(), "label": parts[1].strip()})
                continue
            if len(row) >= 2:
                rows.append({"video_id": row[0].strip(), "label": row[1].strip()})
    return pd.DataFrame(rows)

def mhc_map_path(vid: str, split_dir: Path) -> Path:
    # xử lý các trường hợp id có tiền tố '-' hoặc '_' trong tên file
    p = split_dir / vid
    if p.exists(): return p
    if not vid.endswith(".mp4"):
        p2 = split_dir / f"{vid}.mp4"
        if p2.exists(): return p2
    cand = vid.lstrip("-_")
    p3 = split_dir / cand
    if p3.exists(): return p3
    if not cand.endswith(".mp4"):
        p4 = split_dir / f"{cand}.mp4"
        if p4.exists(): return p4
    matches = list(split_dir.glob(f"*{Path(vid).stem}*.mp4"))
    return matches[0] if matches else split_dir / (vid if vid.endswith(".mp4") else f"{vid}.mp4")

dfs=[]
if MHC_TRAIN_TSV.exists():
    d = read_tsv(MHC_TRAIN_TSV); d["split"]="train"; dfs.append(d)
if MHC_VALID_TSV.exists():
    d = read_tsv(MHC_VALID_TSV); d["split"]="valid"; dfs.append(d)
if MHC_TEST_TSV.exists():
    d = read_tsv(MHC_TEST_TSV);  d["split"]="test";  dfs.append(d)

mhc_lbl = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Loaded MHC rows:", mhc_lbl.shape)

mhc_rows=[]
for _, r in mhc_lbl.iterrows():
    sp = r["split"]
    split_dir = {"train": MHC_TRAIN, "valid": MHC_VALID, "test": MHC_TEST}[sp]
    path = mhc_map_path(str(r["video_id"]), split_dir)
    mhc_rows.append({"video_id": str(r["video_id"]), "video_path": str(path), "label": str(r["label"]), "split": sp})
mhc_df = pd.DataFrame(mhc_rows)
print("MultiHateClip mapped:", mhc_df.shape)
display(mhc_df.head(3))


Loaded MHC rows: (1004, 3)
MultiHateClip mapped: (1004, 4)


Unnamed: 0,video_id,video_path,label,split
0,Video_ID,/kaggle/input/dataset-sample/preprocess_lab/da...,Majority_Voting,train
1,4V0KGql_fUI,/kaggle/input/dataset-sample/preprocess_lab/da...,Normal,train
2,5snzFreG79c,/kaggle/input/dataset-sample/preprocess_lab/da...,Offensive,train


In [19]:
# %% [markdown]
# ## Save normalized label CSVs for the preprocessing stage
labs_dir = META / "labels_raw"
labs_dir.mkdir(parents=True, exist_ok=True)

LAB_HC = labs_dir / "hateclipseg_segments.csv"       # seg_id, video_path, start, end, label, target_group
LAB_MM = labs_dir / "hatemm_spans.csv"               # seg_id, video_path, start, end, label
LAB_MH = labs_dir / "multihateclip_en.csv"           # video_id, video_path, label, split

hcs_seg_df.to_csv(LAB_HC, index=False)
hmm_seg_df.to_csv(LAB_MM, index=False)
mhc_df.to_csv(LAB_MH, index=False)

print("Wrote:")
print("-", LAB_HC)
print("-", LAB_MM)
print("-", LAB_MH)


Wrote:
- /kaggle/working/data/metadata/labels_raw/hateclipseg_segments.csv
- /kaggle/working/data/metadata/labels_raw/hatemm_spans.csv
- /kaggle/working/data/metadata/labels_raw/multihateclip_en.csv


In [20]:
# %% [markdown]
# ## Force-override ASR class (safe on CPU/GPU) + quick probe
# This cell deletes any previous ASR definitions, defines a safe wrapper, and prints the chosen config.

import os
from faster_whisper import WhisperModel

# 1) Remove any previous ASR definitions
try:
    del ASR  # old class that used compute_type="float16"
except NameError:
    pass

def pick_fw_config(prefer_size="small.en"):
    """Pick a safe (device, compute_type, model_size) for faster-whisper."""
    # If Kaggle gives us a GPU, CUDA_VISIBLE_DEVICES is typically set; otherwise fall back to CPU.
    has_cuda = os.environ.get("CUDA_VISIBLE_DEVICES") not in (None, "", "-1")

    if has_cuda:
        # 'int8_float16' is fast & memory-friendly on most Kaggle GPUs.
        # If it still fails, our class below will fall back to 'int8'.
        return ("cuda", "int8_float16", prefer_size)
    else:
        # Pure CPU: 'int8' keeps memory and speed reasonable.
        return ("cpu", "int8", prefer_size)

class ASR:
    """Safe ASR wrapper that never hard-codes float16; includes graceful fallbacks."""
    def __init__(self, model_size="small.en"):
        device, compute_type, msize = pick_fw_config(prefer_size=model_size)
        self._cfg = (device, compute_type, msize)
        # try preferred config
        try:
            self.model = WhisperModel(msize, device=device, compute_type=compute_type)
            self._ok = True
        except ValueError as e:
            # If float16 path or chosen type not supported, fall back stepwise.
            self._ok = False
            # 1) try int8 on same device
            try:
                self.model = WhisperModel(msize, device=device, compute_type="int8")
                self._ok = True
                self._cfg = (device, "int8", msize)
            except Exception:
                # 2) final fallback: CPU + int8
                self.model = WhisperModel(msize, device="cpu", compute_type="int8")
                self._ok = True
                self._cfg = ("cpu", "int8", msize)

    def transcribe(self, audio_path):
        segments, _ = self.model.transcribe(str(audio_path), beam_size=5, vad_filter=True)
        return " ".join([s.text.strip() for s in segments]).strip()

# Optional: force a lighter model to speed up CPU runs
ASR_MODEL = "base.en"  # or "tiny.en" if CPU is slow

# Quick probe (no audio I/O; just show picked config)
print("ASR will use (device, compute_type, model):", ASR(ASR_MODEL)._cfg)

ASR will use (device, compute_type, model): ('cpu', 'int8', 'base.en')


In [21]:
# %% [markdown]
# ## Quick audit: which HateClipSeg video files are missing?
from pathlib import Path
import pandas as pd

LAB_HC = META / "labels_raw" / "hateclipseg_segments.csv"
hc_df = pd.read_csv(LAB_HC)

def exists(p): 
    try: return Path(p).exists()
    except: return False

hc_df["exists"] = hc_df["video_path"].apply(exists)
missing = hc_df[~hc_df["exists"]].copy()
print(f"Total HCS rows: {len(hc_df)} | missing files: {len(missing)}")
display(missing.head(10))

Total HCS rows: 2 | missing files: 2


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,seg_id,video_path,start,end,label,target_group,exists
0,bit_tD1tyOy1HOJH_0_201160,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,201.16,"[[1, 0, 0, 0, 0, 0]]",,False
1,bit_ty1ffKOFCEnl_0_290550,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,290.55,"[[1, 0, 0, 0, 0, 0]]",,False


In [22]:
# %% [markdown]
# ## Smarter resolver for HateClipSeg file paths + remap
from pathlib import Path
import pandas as pd
import re

# Đảm bảo bạn đã set đúng RAW_BASE trước đó.
# Ở log lỗi mình thấy: /kaggle/input/dataset-sample/preprocess_lab/...
# Vậy RAW_BASE nên là:
RAW_BASE = Path("/kaggle/input/dataset-sample/preprocess_lab")  # chỉnh nếu khác
HCS_ROOT   = RAW_BASE / "data" / "raw" / "HateClipSeg"
HCS_VLEVEL = HCS_ROOT / "data" / "video_level"
HCS_SLEVEL = HCS_ROOT / "data" / "segment_level"

def smart_glob(stem: str):
    """
    Tìm file bằng wildcard theo stem (không phần đuôi). Thử cả 2 thư mục.
    Ví dụ stem='bit_abc123' -> tìm *abc123*.mp4
    """
    pats = [f"*{stem}*.mp4", f"*{stem}*"]
    cands = []
    for d in [HCS_VLEVEL, HCS_SLEVEL]:
        for pat in pats:
            cands += list(d.glob(pat))
    return cands

def resolve_hcs_video_path_smart(original_path: str):
    """
    - Nếu original_path tồn tại -> trả về luôn
    - Thử các biến thể tên file: có/không .mp4, thêm/bớt 'bit_' và 'yt_'
    - Nếu chưa ra -> wildcard theo stem
    - Nếu vẫn chưa -> trả về None
    """
    p = Path(original_path)
    if p.exists():
        return p

    name = p.name
    stem = Path(name).stem
    trials = []

    # thư mục ưu tiên: video_level -> segment_level
    # 1) nguyên bản
    trials += [HCS_VLEVEL / name, HCS_SLEVEL / name]
    # 2) thêm .mp4
    if not name.endswith(".mp4"):
        trials += [HCS_VLEVEL / f"{name}.mp4", HCS_SLEVEL / f"{name}.mp4"]

    # 3) thêm/bớt tiền tố
    prefixes = ["", "bit_", "yt_"]
    base = stem
    # nếu stem đã có bit_/yt_ thì thử bỏ đi
    base_noprefix = re.sub(r"^(bit_|yt_)", "", stem)
    variants = set()
    for pf in prefixes:
        variants.add(pf + base)
        variants.add(pf + base_noprefix)

    for v in variants:
        trials += [HCS_VLEVEL / f"{v}.mp4", HCS_SLEVEL / f"{v}.mp4",
                   HCS_VLEVEL / v,         HCS_SLEVEL / v]

    # 4) wildcard
    if not any(t.exists() for t in trials):
        # tìm theo cả stem và base_noprefix
        globs = smart_glob(stem) + smart_glob(base_noprefix)
        if globs:
            return globs[0]
        return None

    for t in trials:
        if t.exists():
            return t
    return None

# Remap toàn bộ HCS video_path với resolver mới
hc_df = pd.read_csv(LAB_HC)
new_paths = []
for vp in hc_df["video_path"].tolist():
    rp = resolve_hcs_video_path_smart(vp)
    new_paths.append("" if rp is None else str(rp))

hc_df["video_path_resolved"] = new_paths
hc_df["exists"] = hc_df["video_path_resolved"].apply(lambda x: Path(x).exists() if isinstance(x, str) and x else False)
print("Missing after smart resolve:", int((~hc_df["exists"]).sum()))
display(hc_df[~hc_df["exists"]].head(10))

# Ghi đè file labels để preprocess dùng đường dẫn đã resolve
hc_df_use = hc_df.copy()
hc_df_use["video_path"] = hc_df_use["video_path_resolved"]
hc_df_use = hc_df_use.drop(columns=["video_path_resolved","exists"])
hc_df_use.to_csv(LAB_HC, index=False)
print("Rewrote:", LAB_HC)

Missing after smart resolve: 2


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,seg_id,video_path,start,end,label,target_group,video_path_resolved,exists
0,bit_tD1tyOy1HOJH_0_201160,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,201.16,"[[1, 0, 0, 0, 0, 0]]",,,False
1,bit_ty1ffKOFCEnl_0_290550,/kaggle/input/dataset-sample/preprocess_lab/da...,0.0,290.55,"[[1, 0, 0, 0, 0, 0]]",,,False


Rewrote: /kaggle/working/data/metadata/labels_raw/hateclipseg_segments.csv


In [23]:
# %% [markdown]
# ## Safer preprocess_hateclipseg: skip-missing + dur clamp via ffprobe
import json, subprocess
from pathlib import Path
import pandas as pd
from tqdm import tqdm

def ffprobe_duration_seconds(path: Path) -> float:
    """
    Return duration in seconds using ffprobe. Returns None if probing fails.
    """
    try:
        cmd = [
            "ffprobe", "-v", "error", "-show_entries", "format=duration",
            "-of", "json", str(path)
        ]
        out = subprocess.check_output(cmd)
        j = json.loads(out.decode("utf-8"))
        dur = float(j["format"]["duration"])
        return dur
    except Exception:
        return None

def preprocess_hateclipseg(df: pd.DataFrame) -> pd.DataFrame:
    rec=[]; missing_rows=[]
    asr = ASR(ASR_MODEL) if RUN_ASR else None

    for r in tqdm(df.to_dict(orient="records"), desc="HateClipSeg"):
        in_mp4 = Path(r["video_path"])
        if not in_mp4.exists():
            # log missing and continue
            missing_rows.append(r)
            continue

        start, end = float(r["start"]), float(r["end"])
        raw_dur = max(0.0, end - start)
        if raw_dur < MIN_DUR:
            continue

        # Clamp duration by actual file duration if available
        vid_dur = ffprobe_duration_seconds(in_mp4)
        dur = raw_dur
        if vid_dur is not None:
            # đôi khi annotation end vượt file length 1–2s; clamp lại
            max_dur = max(0.0, vid_dur - start)
            dur = min(raw_dur, max_dur) if max_dur > 0 else raw_dur
            if dur < MIN_DUR:
                continue

        seg_id = str(r["seg_id"])
        out_seg = PROC / "segments" / "hateclipseg" / f"{seg_id}.mp4"
        src = out_seg
        if CUT_SEGMENTS:
            if not out_seg.exists():
                ffmpeg_cut_segment(in_mp4, start, dur, out_seg)
        else:
            src = in_mp4

        audio = PROC / "audio" / "hateclipseg" / f"{seg_id}.wav"
        frames_dir = PROC / "frames" / "hateclipseg" / seg_id
        mel = PROC / "mels" / "hateclipseg" / f"{seg_id}.npy"
        txt = PROC / "text" / "hateclipseg" / f"{seg_id}.txt"

        if EXTRACT_AUDIO and not audio.exists(): ffmpeg_extract_audio(src, audio, SAMPLE_RATE)
        if EXTRACT_FRAMES and not frames_dir.exists(): ffmpeg_extract_frames(src, frames_dir, FPS, FRAME_SIZE)
        if EXTRACT_MELS and not mel.exists(): save_mel(audio, mel, SAMPLE_RATE)
        if RUN_ASR and not txt.exists():
            txt.parent.mkdir(parents=True, exist_ok=True)
            txt.write_text(asr.transcribe(audio), encoding="utf-8")

        rec.append({
            "dataset":"HateClipSeg","sample_id":seg_id,
            "video_path":str(in_mp4),"segment_path":str(out_seg),
            "audio_path":str(audio),"frames_dir":str(frames_dir),
            "mel_path":str(mel),"text_path":str(txt),
            "start":start,"end":end,"duration":dur,
            "label_multiclass": r.get("label_multiclass", r.get("label", "")),
            "label_bin": 0 if str(r.get("label","")).strip().lower()=="normal" else 1,
            "target_group": r.get("target_group","")
        })

    # Save missing for review
    if missing_rows:
        miss_path = META / "hcs_missing_files.csv"
        pd.DataFrame(missing_rows).to_csv(miss_path, index=False)
        print(f"[HateClipSeg] Missing files logged -> {miss_path} (n={len(missing_rows)})")

    return pd.DataFrame.from_records(rec)

In [24]:
dfs=[]
hc_df = pd.read_csv(LAB_HC)
mm_df = pd.read_csv(META / "labels_raw" / "hatemm_spans.csv")
mh_df = pd.read_csv(META / "labels_raw" / "multihateclip_en.csv")

if len(hc_df): dfs.append(preprocess_hateclipseg(hc_df))
if len(mm_df): dfs.append(preprocess_hatemm(mm_df))
if len(mh_df): dfs.append(preprocess_multihateclip(mh_df))

meta = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Total processed samples:", len(meta))
display(meta.head(3))

HateClipSeg:   0%|          | 0/2 [00:00<?, ?it/s]


TypeError: expected str, bytes or os.PathLike object, not float

In [25]:
# %% [markdown]
# ## Preprocess: cut/extract features for HateClipSeg, HateMM, MultiHateClip
# Assumes you already defined:
# - PROC, META
# - functions: ffmpeg_cut_segment, ffmpeg_extract_audio, ffmpeg_extract_frames, save_mel, ASR
# - toggles: CUT_SEGMENTS, EXTRACT_AUDIO, EXTRACT_FRAMES, EXTRACT_MELS, RUN_ASR
# - constants: SAMPLE_RATE, FPS, FRAME_SIZE, MIN_DUR, ASR_MODEL

import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Load the normalized label CSVs you just wrote
LAB_HC = META / "labels_raw" / "hateclipseg_segments.csv"
LAB_MM = META / "labels_raw" / "hatemm_spans.csv"
LAB_MH = META / "labels_raw" / "multihateclip_en.csv"

hc_df = pd.read_csv(LAB_HC) if LAB_HC.exists() else pd.DataFrame()
mm_df = pd.read_csv(LAB_MM) if LAB_MM.exists() else pd.DataFrame()
mh_df = pd.read_csv(LAB_MH) if LAB_MH.exists() else pd.DataFrame()

print("HateClipSeg rows:", len(hc_df))
print("HateMM rows:", len(mm_df))
print("MultiHateClip rows:", len(mh_df))

# ---------- dataset-specific preprocess ----------

def preprocess_hateclipseg(df: pd.DataFrame) -> pd.DataFrame:
    rec=[]; asr = ASR(ASR_MODEL) if RUN_ASR else None
    for r in tqdm(df.to_dict(orient="records"), desc="HateClipSeg"):
        in_mp4 = Path(r["video_path"])
        start, end = float(r["start"]), float(r["end"])
        dur = max(0.0, end - start)
        if dur < MIN_DUR: continue

        seg_id = str(r["seg_id"])
        out_seg = PROC / "segments" / "hateclipseg" / f"{seg_id}.mp4"
        src = out_seg
        if CUT_SEGMENTS:
            if not out_seg.exists():
                ffmpeg_cut_segment(in_mp4, start, dur, out_seg)
        else:
            src = in_mp4

        audio = PROC / "audio" / "hateclipseg" / f"{seg_id}.wav"
        frames_dir = PROC / "frames" / "hateclipseg" / seg_id
        mel = PROC / "mels" / "hateclipseg" / f"{seg_id}.npy"
        txt = PROC / "text" / "hateclipseg" / f"{seg_id}.txt"

        if EXTRACT_AUDIO and not audio.exists(): ffmpeg_extract_audio(src, audio, SAMPLE_RATE)
        if EXTRACT_FRAMES and not frames_dir.exists(): ffmpeg_extract_frames(src, frames_dir, FPS, FRAME_SIZE)
        if EXTRACT_MELS and not mel.exists(): save_mel(audio, mel, SAMPLE_RATE)
        if RUN_ASR and not txt.exists():
            txt.parent.mkdir(parents=True, exist_ok=True)
            txt.write_text(asr.transcribe(audio), encoding="utf-8")

        rec.append({
            "dataset":"HateClipSeg","sample_id":seg_id,
            "video_path":str(in_mp4),"segment_path":str(out_seg),
            "audio_path":str(audio),"frames_dir":str(frames_dir),
            "mel_path":str(mel),"text_path":str(txt),
            "start":start,"end":end,"duration":dur,
            "label_multiclass": r.get("label_multiclass", r.get("label", "")),
            "label_bin": 0 if str(r.get("label","")).strip().lower()=="normal" else 1,
            "target_group": r.get("target_group","")
        })
    return pd.DataFrame.from_records(rec)

def preprocess_hatemm(df: pd.DataFrame) -> pd.DataFrame:
    rec=[]; asr = ASR(ASR_MODEL) if RUN_ASR else None
    for r in tqdm(df.to_dict(orient="records"), desc="HateMM"):
        in_mp4 = Path(r["video_path"])
        start, end = float(r["start"]), float(r["end"])
        full = (end > 9e8)  # sentinel for "full video"
        if not full:
            dur = max(0.0, end - start)
            if dur < MIN_DUR: continue

        seg_id = str(r["seg_id"])
        out_seg = PROC / "segments" / "hatemm" / f"{seg_id}.mp4"
        if CUT_SEGMENTS:
            if full:
                if not out_seg.exists():
                    run(["ffmpeg","-hide_banner","-loglevel","error","-i",str(in_mp4),
                         "-c:v","libx264","-preset","veryfast","-crf","23","-c:a","aac","-y",str(out_seg)])
            else:
                if not out_seg.exists(): ffmpeg_cut_segment(in_mp4, start, dur, out_seg)
        src = out_seg if CUT_SEGMENTS else in_mp4

        audio = PROC / "audio" / "hatemm" / f"{seg_id}.wav"
        frames_dir = PROC / "frames" / "hatemm" / seg_id
        mel = PROC / "mels" / "hatemm" / f"{seg_id}.npy"
        txt = PROC / "text" / "hatemm" / f"{seg_id}.txt"

        if EXTRACT_AUDIO and not audio.exists(): ffmpeg_extract_audio(src, audio, SAMPLE_RATE)
        if EXTRACT_FRAMES and not frames_dir.exists(): ffmpeg_extract_frames(src, frames_dir, FPS, FRAME_SIZE)
        if EXTRACT_MELS and not mel.exists(): save_mel(audio, mel, SAMPLE_RATE)
        if RUN_ASR and not txt.exists():
            txt.parent.mkdir(parents=True, exist_ok=True)
            txt.write_text(asr.transcribe(audio), encoding="utf-8")

        rec.append({
            "dataset":"HateMM","sample_id":seg_id,
            "video_path":str(in_mp4),"segment_path":str(out_seg),
            "audio_path":str(audio),"frames_dir":str(frames_dir),
            "mel_path":str(mel),"text_path":str(txt),
            "start":start,"end":(None if full else end),
            "duration":(None if full else dur),
            "label_bin": 1 if str(r.get("label","")).strip().lower() in ["hate","hateful","offensive"] else 0,
            "target": r.get("target","")
        })
    return pd.DataFrame.from_records(rec)

def preprocess_multihateclip(df: pd.DataFrame) -> pd.DataFrame:
    rec=[]; asr = ASR(ASR_MODEL) if RUN_ASR else None
    for r in tqdm(df.to_dict(orient="records"), desc="MultiHateClip"):
        vid = str(r["video_id"]); in_mp4 = Path(r["video_path"])
        stem = Path(vid).stem

        audio = PROC / "audio" / "multihateclip_en" / f"{stem}.wav"
        frames_dir = PROC / "frames" / "multihateclip_en" / stem
        mel = PROC / "mels" / "multihateclip_en" / f"{stem}.npy"
        txt = PROC / "text" / "multihateclip_en" / f"{stem}.txt"

        if EXTRACT_AUDIO and not audio.exists(): ffmpeg_extract_audio(in_mp4, audio, SAMPLE_RATE)
        if EXTRACT_FRAMES and not frames_dir.exists(): ffmpeg_extract_frames(in_mp4, frames_dir, FPS, FRAME_SIZE)
        if EXTRACT_MELS and not mel.exists(): save_mel(audio, mel, SAMPLE_RATE)
        if RUN_ASR and not txt.exists():
            txt.parent.mkdir(parents=True, exist_ok=True)
            txt.write_text(asr.transcribe(audio), encoding="utf-8")

        rec.append({
            "dataset":"MultiHateClip","sample_id":stem,
            "video_path":str(in_mp4),"segment_path":"",
            "audio_path":str(audio),"frames_dir":str(frames_dir),
            "mel_path":str(mel),"text_path":str(txt),
            "start":0.0,"end":None,"duration":None,
            "label_bin": 0 if str(r["label"]).strip().lower()=="normal" else 1,
            "split": r.get("split","")
        })
    return pd.DataFrame.from_records(rec)

dfs=[]
if len(hc_df): dfs.append(preprocess_hateclipseg(hc_df))
if len(mm_df): dfs.append(preprocess_hatemm(mm_df))
if len(mh_df): dfs.append(preprocess_multihateclip(mh_df))

meta = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()
print("Total processed samples:", len(meta))
display(meta.head(3))

HateClipSeg rows: 2
HateMM rows: 1083
MultiHateClip rows: 1004


HateClipSeg:   0%|          | 0/2 [00:00<?, ?it/s]


TypeError: expected str, bytes or os.PathLike object, not float

In [26]:
# %% [markdown]
# ## Save unified metadata parquet
meta_path = META / "metadata_master.parquet"
meta.to_parquet(meta_path, index=False)
print("Saved:", meta_path)

NameError: name 'meta' is not defined

In [27]:
# %% [markdown]
# ## Create stratified splits for items without an explicit split
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

if "split" not in meta.columns:
    meta["split"] = ""

needs = meta["split"].isin(["","unsplit","nan",np.nan])
to_split = meta[needs & meta["label_bin"].notna()].copy()

if len(to_split):
    y = to_split["label_bin"].astype(int)
    sss_test = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    tr_idx, te_idx = next(sss_test.split(to_split, y))
    idx = to_split.index
    meta.loc[idx[te_idx], "split"] = "test"

    remain = to_split.loc[idx[tr_idx]]
    y2 = remain["label_bin"].astype(int)
    sss_val = StratifiedShuffleSplit(n_splits=1, test_size=0.111, random_state=42)  # ~10% total
    tr2, va2 = next(sss_val.split(remain, y2))
    meta.loc[remain.index[va2], "split"] = "val"
    meta.loc[remain.index[tr2], "split"] = "train"

print(meta["split"].value_counts(dropna=False))
meta.to_parquet(meta_path, index=False)

NameError: name 'meta' is not defined

In [28]:
# %% [markdown]
# ## Sanity checks: counts & quick peek
print(meta.groupby(["dataset","split","label_bin"]).size().reset_index(name="count").head(20))

# Peek one sample per dataset (paths exist?)
for ds in ["HateClipSeg","HateMM","MultiHateClip"]:
    sub = meta[meta["dataset"]==ds].head(1)
    if len(sub):
        row = sub.iloc[0].to_dict()
        print(f"\n[{ds}] sample_id={row['sample_id']}")
        print("segment_path:", row.get("segment_path"))
        print("audio_path  :", row.get("audio_path"))
        print("frames_dir  :", row.get("frames_dir"))
        print("text_path   :", row.get("text_path"))
        # show transcript head if available
        tp = row.get("text_path")
        if tp and Path(tp).exists():
            print("Transcript:", Path(tp).read_text(encoding="utf-8")[:200], "...")

NameError: name 'meta' is not defined

In [29]:
# %% [markdown]
# ## Text-only baseline (BERT) — smoke test for the pipeline
import torch, numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score
from pathlib import Path

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tok = BertTokenizerFast.from_pretrained("bert-base-uncased")

def subset(df, split):
    sub = df[df["split"]==split].copy()
    sub = sub[~sub["text_path"].isna()]
    return sub

train_df = subset(meta, "train")
val_df   = subset(meta, "val")
test_df  = subset(meta, "test")
print("Sizes:", len(train_df), len(val_df), len(test_df))

class TextDS(Dataset):
    def __init__(self, df, tok, max_len=256):
        self.df = df.reset_index(drop=True); self.tok=tok; self.max_len=max_len
        self.txts = []
        for p in self.df["text_path"].fillna(""):
            try: self.txts.append(Path(p).read_text(encoding="utf-8"))
            except: self.txts.append("")
        self.labels = self.df["label_bin"].astype(int).tolist()
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        enc = self.tok(self.txts[i], truncation=True, padding="max_length",
                       max_length=self.max_len, return_tensors="pt")
        item = {k:v.squeeze(0) for k,v in enc.items()}
        item["labels"] = torch.tensor(self.labels[i]).long()
        return item

train_loader = DataLoader(TextDS(train_df, tok), batch_size=8, shuffle=True, num_workers=2)
val_loader   = DataLoader(TextDS(val_df, tok),   batch_size=8, shuffle=False, num_workers=2)
test_loader  = DataLoader(TextDS(test_df, tok),  batch_size=8, shuffle=False, num_workers=2)

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)
opt = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
epochs = 2
steps = max(1, epochs*len(train_loader))
sched = get_linear_schedule_with_warmup(opt, int(0.1*steps), steps)

def eval_loader(loader):
    model.eval(); preds=[]; gts=[]
    with torch.no_grad():
        for batch in loader:
            batch = {k:v.to(device) for k,v in batch.items()}
            out = model(**batch)
            p = out.logits.argmax(-1).detach().cpu().numpy()
            y = batch["labels"].detach().cpu().numpy()
            preds.append(p); gts.append(y)
    import numpy as np
    preds=np.concatenate(preds) if preds else np.array([])
    gts=np.concatenate(gts) if gts else np.array([])
    if len(gts)==0:
        return {"acc":None,"prec":None,"rec":None,"f1":None,"f1_macro":None}
    acc = accuracy_score(gts,preds)
    p,r,f,_ = precision_recall_fscore_support(gts,preds,average="binary",zero_division=0)
    f1m = f1_score(gts,preds,average="macro")
    return {"acc":acc,"prec":p,"rec":r,"f1":f,"f1_macro":f1m}

for ep in range(1, epochs+1):
    model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {ep}"):
        batch = {k:v.to(device) for k,v in batch.items()}
        out = model(**batch); out.loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step(); sched.step(); opt.zero_grad()
    print("Val:", eval_loader(val_loader))

print("Test:", eval_loader(test_loader))

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

NameError: name 'meta' is not defined