# Data Normalization and Description

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json, ast, re, math, subprocess, shlex

# === Adjust this if your Kaggle dataset slug/path differs ===
# Root folder of the attached dataset; the annotation CSV is expected directly inside it.
DATASET_ROOT = Path("/kaggle/input/thesis-dataset/Thesis_dataset")
ANN_PATH     = DATASET_ROOT / "HateMM_annotation.csv"

# Expected media layout (as you showed): /hate_videos, /non_hate_videos
# The media tree shares the dataset root, so videos are searched from here.
MEDIA_ROOT   = DATASET_ROOT

# Folder that receives the manifests and statistics written by the cells below.
OUT_DIR = Path("/kaggle/working/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)  # safe to re-run: existing folder is not an error

# Canonical label set (unified across your thesis)
CANON_LABELS = ["normal","offensive","hateful","insulting","sexual","violence","self-harm"]

# Sanity check: show whether the expected inputs exist before any processing starts.
print("Annotation:", ANN_PATH.exists(), ANN_PATH)
print("Media root:", MEDIA_ROOT.exists(), MEDIA_ROOT)
print("Output dir:", OUT_DIR)


In [3]:
# ---------- Column utilities ----------

def _norm_key(s: str) -> str:
    """Return a comparison key for a column name.

    The value is converted to ``str`` if needed, trimmed, lower-cased, and
    stripped of spaces, underscores and hyphens, so that e.g. ``"Video_ID"``
    and ``"video id"`` produce the same key.
    """
    text = s if isinstance(s, str) else str(s)
    key = text.strip().lower()
    for separator in (" ", "_", "-"):
        key = key.replace(separator, "")
    return key

def find_col(df: pd.DataFrame, *candidates):
    """Return the actual column name matching the first candidate that hits.

    Matching is done on the normalized key of each name (see ``_norm_key``),
    so case, surrounding whitespace, spaces, underscores and hyphens are
    ignored. Candidates are tried in the order given; ``None`` is returned
    when none of them matches a column of ``df``.
    """
    by_key = {}
    for column in df.columns:
        # Later columns overwrite earlier ones that share the same key.
        by_key[_norm_key(column)] = column
    for wanted in candidates:
        try:
            return by_key[_norm_key(wanted)]
        except KeyError:
            continue
    return None

def safe_parse_list(x):
    """Parse a list-like value and always return a ``list``.

    Accepted inputs:
    - a ``list`` or ``tuple`` (returned as a new list);
    - a string holding a JSON array or a Python literal list/tuple/set.

    JSON is tried first, then ``ast.literal_eval``. Anything that fails to
    parse, or that parses to something that is not list-like (a number,
    ``null``/``None``, a plain string, a dict), yields ``[]`` so callers can
    safely call ``len()``, slice, or iterate the result.
    """
    if isinstance(x, (list, tuple)):
        return list(x)
    if not isinstance(x, str):
        return []
    text = x.strip()
    if not text:
        return []

    # Errors json.loads / ast.literal_eval are known to raise on bad input.
    parse_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)
    try:
        parsed = json.loads(text)
    except parse_errors:
        try:
            parsed = ast.literal_eval(text)
        except parse_errors:
            return []

    # A successful parse may still be a scalar or mapping; only sequences and
    # sets count as "list-like" here.
    if isinstance(parsed, (list, tuple, set, frozenset)):
        return list(parsed)
    return []

# ---------- Label normalization ----------

# Alias table keyed by the lower-cased label with whitespace, underscores and
# hyphens removed, so "Non Hate", "non_hate" and "non-hate" share one entry.
_LABEL_ALIASES = {
    "nonhate": "normal", "none": "normal", "neutral": "normal",
    "benign": "normal", "normal": "normal",
    "hate": "hateful", "hateful": "hateful",
    "offensive": "offensive",
    "insult": "insulting", "insulting": "insulting",
    "sexual": "sexual",
    "violence": "violence", "violent": "violence",
    "selfharm": "self-harm", "harm": "self-harm",
}


def normalize_label_string(x: str) -> str:
    """Map a raw label spelling onto the canonical label vocabulary.

    - ``None`` and float NaN (how pandas represents an empty cell) are
      treated as a missing label and become ``"normal"``.
    - Known aliases are matched case-insensitively and independently of
      spaces, underscores and hyphens (``"Non Hate"``, ``"self harm"`` ...).
    - Unknown labels are returned trimmed and lower-cased, unchanged
      otherwise.
    """
    if x is None or (isinstance(x, float) and math.isnan(x)):
        return "normal"
    k = str(x).strip().lower()
    # Separator-free form used only for the alias lookup; the fallback keeps
    # the original separators.
    compact = "".join(k.split()).replace("_", "").replace("-", "")
    return _LABEL_ALIASES.get(compact, k)

# ---------- Sample id ----------

def build_sample_id(dataset, video_id, start, end, label):
    """Compose the sample identifier ``dataset__video__start_end__label``.

    A missing start (``None``/NaN) is written as ``0`` and a missing end as
    ``end``; present values are formatted with three decimals.
    """
    def _tag(value, missing):
        # Missing bounds get a fixed placeholder instead of a number.
        if value is None or pd.isna(value):
            return missing
        return f"{float(value):.3f}"

    start_tag = _tag(start, "0")
    end_tag = _tag(end, "end")
    span = f"{start_tag}_{end_tag}"
    return "__".join((f"{dataset}", f"{video_id}", span, f"{label}"))

# ---------- Timestamp parsing ----------

def parse_timestamp_pair(v):
    """Parse one segment timestamp into ``(start, end)`` floats.

    Accepted forms:
    - ``[start, end]`` / ``(start, end)`` (extra items are ignored);
    - a dict with ``start`` and ``end`` keys, matched case-insensitively;
    - a string ``"start-end"`` (``-``, ``,`` or ``:`` as separator).

    Returns ``(None, None)`` whenever the value cannot be interpreted.
    """
    invalid = (None, None)
    convert_errors = (TypeError, ValueError, OverflowError)

    if isinstance(v, (list, tuple)) and len(v) >= 2:
        try:
            return float(v[0]), float(v[1])
        except convert_errors:
            return invalid

    if isinstance(v, dict):
        # Build a case-insensitive view of the keys. Values are looked up by
        # key presence (not truthiness) so a legitimate start of 0 survives.
        lowered = {}
        for key, value in v.items():
            lowered.setdefault(str(key).lower(), value)
        # An exactly lower-cased key wins over a differently-cased duplicate.
        for exact in ("start", "end"):
            if exact in v:
                lowered[exact] = v[exact]
        if "start" not in lowered or "end" not in lowered:
            return invalid
        try:
            return float(lowered["start"]), float(lowered["end"])
        except convert_errors:
            return invalid

    if isinstance(v, str):
        m = re.match(r"^\s*([0-9.]+)\s*[-,:]\s*([0-9.]+)\s*$", v)
        if m:
            try:
                return float(m.group(1)), float(m.group(2))
            except convert_errors:
                return invalid

    return invalid

# ---------- Video path resolver ----------

def index_media_files(media_root: Path):
    """Index every MP4 file below ``media_root`` for fast lookup.

    The extension is matched case-insensitively (``.mp4``, ``.MP4`` ...), and
    files are visited in sorted path order so results are reproducible
    between runs.

    Returns:
        name2path: filename (with extension) -> path string. When several
            files share a name, the first one in sorted order is kept, which
            is the same file that comes first in ``stem2paths``.
        stem2paths: filename stem -> list of all matching path strings, in
            sorted order.
    """
    files = sorted(
        f for f in media_root.rglob("*")
        if f.is_file() and f.suffix.lower() == ".mp4"
    )
    name2path = {}
    stem2paths = {}
    for f in files:
        name2path.setdefault(f.name, str(f))
        stem2paths.setdefault(f.stem, []).append(str(f))
    return name2path, stem2paths

NAME2PATH, STEM2PATHS = index_media_files(MEDIA_ROOT)


def resolve_video_path(video_id_or_file: str, label_norm: str | None = None) -> str | None:
    """Map an annotation video id or filename to a path in the media tree.

    Lookup order:
    1) exact filename in the index, then the same name with ``.mp4`` appended
       when the extension is missing;
    2) stem match in the index (the first candidate is used);
    3) if a label is given, the conventional sub-folder
       (``hate_videos`` for ``"hateful"``, ``non_hate_videos`` otherwise).

    Returns the path as a string, or ``None`` when nothing matches.
    """
    name = str(video_id_or_file).strip()
    has_mp4_suffix = name.lower().endswith(".mp4")

    # 1) filename lookup, completing a missing extension as a second attempt
    attempts = [name] if has_mp4_suffix else [name, name + ".mp4"]
    for attempt in attempts:
        if attempt in NAME2PATH:
            return NAME2PATH[attempt]

    # 2) stem lookup
    stem = name[:-4] if has_mp4_suffix else name
    matches = STEM2PATHS.get(stem)
    if matches:
        return matches[0]

    # 3) label-based sub-folder guess
    if not label_norm:
        return None
    folder = "hate_videos" if label_norm == "hateful" else "non_hate_videos"
    guess = MEDIA_ROOT / folder / (stem + ".mp4")
    return str(guess) if guess.exists() else None


In [4]:
# Load annotation
df_raw = pd.read_csv(ANN_PATH)

# Resolve key columns (flexible names)
# Each call returns the real column name for the first candidate that matches,
# or None when the CSV has no such column.
col_vid  = find_col(df_raw, "video_id", "id", "video", "video_file_name", "filename", "file")
col_lab  = find_col(df_raw, "label", "class", "category")
col_t0   = find_col(df_raw, "start", "start_sec", "segment_start", "begin")
col_t1   = find_col(df_raw, "end", "end_sec", "segment_end", "finish")
col_segL = find_col(df_raw, "Segment-Level Label", "segmentlevellabel", "segmentlabel")
col_segT = find_col(df_raw, "Segment Timestamp", "segmenttimestamp", "timestamps")
col_target = find_col(df_raw, "target", "targets", "target_group", "targetgroup")

# Only the video identifier and the label are mandatory; every other column is optional.
if col_vid is None or col_lab is None:
    raise ValueError(f"Missing essential columns in HateMM_annotation.csv. Found: {list(df_raw.columns)}")

# Normalize base columns
# Work on a copy so df_raw keeps the annotation exactly as loaded.
df = df_raw.copy()
df["video_id_raw"] = df[col_vid].astype(str)
df["label"] = df[col_lab].apply(normalize_label_string)

# Parse target group (optional)
def parse_targets(x):
    """Turn a raw target-group cell into a list of strings.

    - Non-string input gives ``[]``.
    - Strings containing ``[`` or ``{`` are parsed with ``safe_parse_list``
      and each item is converted to ``str``.
    - Otherwise the string is split on ``;`` if present, else on ``,``;
      pieces are trimmed and empty pieces are dropped.
    - A string without a separator becomes a one-element list (or ``[]`` if
      it is blank).
    """
    if not isinstance(x, str):
        return []

    if "[" in x or "{" in x:
        parsed = safe_parse_list(x)
        if not parsed:
            return []
        return [str(item) for item in parsed]

    # ";" takes precedence over "," when both occur.
    delimiter = next((d for d in (";", ",") if d in x), None)
    if delimiter is not None:
        trimmed = (piece.strip() for piece in x.split(delimiter))
        return [piece for piece in trimmed if piece]

    whole = x.strip()
    return [whole] if whole else []
# Without a target column every row gets its own empty list.
df["target_group"] = df[col_target].apply(parse_targets) if col_target else [[] for _ in range(len(df))]

# Build records
# One record per sample: a segment when segment columns exist, otherwise one per annotation row.
records = []

if col_segL and col_segT:
    # Case 1: segment-level labels & timestamps as lists (preferred if present)
    for _, row in df.iterrows():
        vid_key = row["video_id_raw"]
        labs_list = safe_parse_list(row[col_segL])
        ts_list   = safe_parse_list(row[col_segT])
        # Pair labels and timestamps positionally; surplus entries on either side are ignored.
        n = min(len(labs_list), len(ts_list))
        for lb, ts in zip(labs_list[:n], ts_list[:n]):
            # segment label could be string or list; normalize
            if isinstance(lb, (list, tuple)):
                # If HateMM uses only binary, take any non-normal as hateful
                lbls = [normalize_label_string("hateful" if any(bool(x) for x in lb) else "normal")]
            else:
                lbls = [normalize_label_string(lb)]
            # (None, None) when the timestamp cannot be parsed.
            t0, t1 = parse_timestamp_pair(ts)
            records.append({
                "dataset": "HateMM",
                "video_id": vid_key,
                "labels": lbls,
                "start_sec": t0, "end_sec": t1,
                # NOTE(review): the same list object is shared by all segments of a row.
                "target_group": row["target_group"],
            })
else:
    # Case 2: video-level or per-row start/end present
    for _, row in df.iterrows():
        vid_key = row["video_id_raw"]
        # timestamps if exist
        # None means "no bound"; non-numeric cell values would raise in float().
        t0 = float(row[col_t0]) if col_t0 and pd.notna(row[col_t0]) else None
        t1 = float(row[col_t1]) if col_t1 and pd.notna(row[col_t1]) else None
        records.append({
            "dataset": "HateMM",
            "video_id": vid_key,
            "labels": [row["label"]],
            "start_sec": t0, "end_sec": t1,
            "target_group": row["target_group"],
        })

# Multi-label frame: one row per sample with a list in the "labels" column.
hmm_multi = pd.DataFrame.from_records(records)
print("Raw samples (pre-path):", len(hmm_multi))
hmm_multi.head(3)


Raw samples (pre-path): 1083


Unnamed: 0,dataset,video_id,labels,start_sec,end_sec,target_group
0,HateMM,hate_video_1.mp4,[hateful],,,[Blacks]
1,HateMM,hate_video_2.mp4,[hateful],,,[Blacks]
2,HateMM,non_hate_video_1.mp4,[normal],,,[Others]


In [5]:
# Resolve video_path using filename or stem + optional label hint
def guess_filename(vid_str: str) -> str:
    """Return ``vid_str`` with a ``.mp4`` extension guaranteed.

    Many HateMM files are named like 'hate_video_4.mp4' or
    'non_hate_video_18.mp4'. A value that already ends in ``.mp4`` (any
    letter case) is returned unchanged; otherwise ``.mp4`` is appended.
    """
    if vid_str.lower().endswith(".mp4"):
        return vid_str
    return vid_str + ".mp4"

def _resolve_row_path(row):
    """Resolve one manifest row to a media path, or None when nothing matches."""
    labels = row["labels"]
    # Use first label as hint (for subfolder hate/non_hate)
    if isinstance(labels, list) and len(labels):
        hint = normalize_label_string(labels[0])
    else:
        hint = None
    # Try the id with a guaranteed .mp4 suffix first, then the raw id itself.
    found = resolve_video_path(guess_filename(row["video_id"]), hint)
    if found is None:
        found = resolve_video_path(row["video_id"], hint)
    return found


paths = [_resolve_row_path(row) for _, row in hmm_multi.iterrows()]

hmm_multi["video_path"] = paths

# Explode to single-label rows
def explode_single_label_rows(df, labels_col="labels", keep_cols=None):
    """Expand a multi-label frame into one row per (sample, label).

    Args:
        df: frame whose ``labels_col`` holds a list of labels (or a string
            that ``safe_parse_list`` can parse into one).
        labels_col: name of the multi-label column.
        keep_cols: columns copied onto every output row; defaults to all
            columns except ``labels_col``.

    Returns:
        A new DataFrame with the kept columns plus a normalized ``label``
        column. Rows without any label yield a single ``"normal"`` row.
    """
    if keep_cols is None:
        carried = [name for name in df.columns if name != labels_col]
    else:
        carried = keep_cols

    exploded = []
    for _, record in df.iterrows():
        raw = record[labels_col]
        labels = raw if isinstance(raw, list) else safe_parse_list(raw)
        if not labels:
            labels = ["normal"]
        shared = {name: record[name] for name in carried}
        for tag in labels:
            exploded.append({**shared, "label": normalize_label_string(tag)})
    return pd.DataFrame(exploded)

# One row per (sample, label); only the listed columns are carried over.
hmm_single = explode_single_label_rows(
    hmm_multi,
    labels_col="labels",
    keep_cols=["dataset","video_id","video_path","start_sec","end_sec","target_group"]
)

# Binary indicator (any non-normal considered offensive=1)
hmm_single["binary_offensive"] = hmm_single["label"].apply(lambda s: int(s != "normal"))

# sample_id
# Built from dataset, video id, time span and label (see build_sample_id).
hmm_single["sample_id"] = hmm_single.apply(
    lambda r: build_sample_id(r["dataset"], r["video_id"], r["start_sec"], r["end_sec"], r["label"]), axis=1
)

# Save raw manifest (before cleaning)
RAW_MANIFEST = OUT_DIR / "hatemm_samples.csv"
hmm_single.to_csv(RAW_MANIFEST, index=False)
print("Saved raw HateMM manifest:", RAW_MANIFEST, "| rows:", len(hmm_single))
hmm_single.head(5)


Saved raw HateMM manifest: /kaggle/working/processed/hatemm_samples.csv | rows: 1083


Unnamed: 0,dataset,video_id,video_path,start_sec,end_sec,target_group,label,binary_offensive,sample_id
0,HateMM,hate_video_1.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/ha...,,,[Blacks],hateful,1,HateMM__hate_video_1.mp4__0_end__hateful
1,HateMM,hate_video_2.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/ha...,,,[Blacks],hateful,1,HateMM__hate_video_2.mp4__0_end__hateful
2,HateMM,non_hate_video_1.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/no...,,,[Others],normal,0,HateMM__non_hate_video_1.mp4__0_end__normal
3,HateMM,hate_video_3.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/ha...,,,[Blacks],hateful,1,HateMM__hate_video_3.mp4__0_end__hateful
4,HateMM,non_hate_video_2.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/no...,,,[Blacks],normal,0,HateMM__non_hate_video_2.mp4__0_end__normal


In [6]:
# Robust duration probe
def ffprobe_duration(path: str) -> float | None:
    cmd = f"ffprobe -v error -show_entries format=duration -of default=nw=1:nk=1 {shlex.quote(str(path))}"
    try:
        out = subprocess.check_output(shlex.split(cmd), stderr=subprocess.DEVNULL, text=True).strip()
        dur = float(out)
        return dur if (dur and dur > 0) else None
    except Exception:
        return None

def seg_len(row):
    """Return the segment length ``end_sec - start_sec`` of a row.

    The result is clamped at 0.0. ``NaN`` is returned when either bound is
    missing or when the values cannot be converted to floats.
    """
    try:
        start, end = row.get("start_sec"), row.get("end_sec")
        if pd.isna(start) or pd.isna(end):
            return np.nan
        return max(0.0, float(row["end_sec"]) - float(row["start_sec"]))
    except Exception:
        return np.nan

# Re-read the raw manifest from disk; from here on `df` is the manifest, not the annotation table.
df = pd.read_csv(RAW_MANIFEST)

# BEFORE stats
before_rows  = len(df)
before_paths = df["video_path"].notna().sum()  # computed but not used further in this cell
before_unique= df["video_path"].nunique()

# Probe duration
# Each distinct file is probed once; rows sharing a path reuse the result.
uniq_paths = df["video_path"].dropna().unique().tolist()
dur_map = {p: ffprobe_duration(p) for p in uniq_paths}
df["video_seconds"] = df["video_path"].map(dur_map)
df["segment_seconds"] = df.apply(seg_len, axis=1)

# Mark unusable
# A row is broken when no duration was obtained: unresolved path or failed probe.
df["is_broken"] = df["video_seconds"].isna()

# Split kept/dropped
dropped = df[df["is_broken"]].copy()
kept    = df[~df["is_broken"]].copy()

# AFTER stats
after_rows   = len(kept)
after_unique = kept["video_path"].nunique()
drop_rows    = before_rows - after_rows
drop_rate    = (drop_rows / before_rows * 100.0) if before_rows else 0.0  # guard against an empty manifest

print(f"BEFORE: rows={before_rows}, unique paths={before_unique}")
print(f"AFTER : rows={after_rows}, unique paths={after_unique}, dropped={drop_rows} ({drop_rate:.2f}%)")

# Save cleaned + dropped lists
CLEAN_MANIFEST = OUT_DIR / "hatemm_samples.cleaned.csv"
DROP_LIST      = OUT_DIR / "hatemm_samples.cleaned.dropped.csv"

kept.to_csv(CLEAN_MANIFEST, index=False)
dropped.to_csv(DROP_LIST, index=False)
print("Saved:", CLEAN_MANIFEST)
print("Dropped list:", DROP_LIST)

# Per-class tables (before/after)
per_class_before = df.groupby("label").size().rename("samples_before").reset_index()
per_class_after  = kept.groupby("label").size().rename("samples_kept").reset_index()
# Outer merge keeps classes that lost every sample; their missing count becomes 0.
per_class = per_class_before.merge(per_class_after, on="label", how="outer").fillna(0)
per_class["samples_dropped"] = per_class["samples_before"] - per_class["samples_kept"]
per_class["drop_rate_%"] = per_class.apply(
    lambda r: (r["samples_dropped"] / r["samples_before"] * 100.0) if r["samples_before"] else 0.0, axis=1
)
PER_CLASS_CSV = OUT_DIR / "hatemm_cleaning_per_class.csv"
per_class.to_csv(PER_CLASS_CSV, index=False)
print("Saved per-class cleaning stats:", PER_CLASS_CSV)
per_class


BEFORE: rows=1083, unique paths=1083
AFTER : rows=1083, unique paths=1083, dropped=0 (0.00%)
Saved: /kaggle/working/processed/hatemm_samples.cleaned.csv
Dropped list: /kaggle/working/processed/hatemm_samples.cleaned.dropped.csv
Saved per-class cleaning stats: /kaggle/working/processed/hatemm_cleaning_per_class.csv


Unnamed: 0,label,samples_before,samples_kept,samples_dropped,drop_rate_%
0,hateful,431,431,0,0.0
1,normal,652,652,0,0.0


In [7]:
# Statistics are computed from the cleaned manifest written by the previous cell.
hmm_clean = pd.read_csv(CLEAN_MANIFEST)

# Per-class stats (cleaned only)
# dropna=False keeps a group for rows whose label is missing.
stats = (
    hmm_clean
    .groupby("label", dropna=False)
    .agg(
        samples=("sample_id","count"),
        avg_video_seconds=("video_seconds", "mean"),
        avg_segment_seconds=("segment_seconds", "mean"),
    )
    .reset_index()
    .sort_values("label")
)

# Overall (for the table "ALL" row)
overall = pd.DataFrame([{
    "label": "ALL",
    "samples": len(hmm_clean),
    "avg_video_seconds": hmm_clean["video_seconds"].mean(),
    "avg_segment_seconds": hmm_clean["segment_seconds"].mean()
}])

# "ALL" row first, followed by the per-class rows.
stats_all = pd.concat([overall, stats], ignore_index=True)

# Save for thesis
STATS_CSV = OUT_DIR / "hatemm_statistics_cleaned.csv"
stats_all.to_csv(STATS_CSV, index=False)
print("Saved HateMM stats:", STATS_CSV)
stats_all


Saved HateMM stats: /kaggle/working/processed/hatemm_statistics_cleaned.csv


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,label,samples,avg_video_seconds,avg_segment_seconds
0,ALL,1083,144.470157,
1,hateful,431,154.51758,
2,normal,652,137.828378,


# Data Processing

In [8]:
from pathlib import Path
import pandas as pd
import numpy as np

OUT_DIR = Path("/kaggle/working/processed")
MANIFEST = OUT_DIR / "hatemm_samples.cleaned.csv"  # from the previous step
assert MANIFEST.exists(), "Missing cleaned manifest. Run the cleaning step first."

# Caching dirs
CACHE_DIR = Path("/kaggle/working/cache_hatemm")
V_FEAT_DIR = CACHE_DIR / "vision"  # .npy per sample
A_FEAT_DIR = CACHE_DIR / "audio"   # .npy per sample
V_FRAME_DIR = CACHE_DIR / "frames" # extracted frames (intermediate)
WAV_DIR = CACHE_DIR / "wav"        # extracted wav (intermediate)

# Create every cache folder up front so the caching cells can write immediately.
for p in [V_FEAT_DIR, A_FEAT_DIR, V_FRAME_DIR, WAV_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Parameters
FPS = 1.0               # frame sampling rate (frames per second)
IMG_SIZE = 224          # input size for CLIP/ResNet
MAX_FRAMES = 64         # cap frames per sample to avoid memory blow-up
SR = 16000              # audio sample rate
SEGMENT_PADDING = 0.0   # optional padding seconds around segments

# `df` is rebound to the cleaned manifest for the feature-caching cells.
df = pd.read_csv(MANIFEST)
print("Loaded cleaned manifest:", len(df))
df.head(3)

Loaded cleaned manifest: 1083


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,dataset,video_id,video_path,start_sec,end_sec,target_group,label,binary_offensive,sample_id,video_seconds,segment_seconds,is_broken
0,HateMM,hate_video_1.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/ha...,,,['Blacks'],hateful,1,HateMM__hate_video_1.mp4__0_end__hateful,94.998,,False
1,HateMM,hate_video_2.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/ha...,,,['Blacks'],hateful,1,HateMM__hate_video_2.mp4__0_end__hateful,129.16,,False
2,HateMM,non_hate_video_1.mp4,/kaggle/input/thesis-dataset/Thesis_dataset/no...,,,['Others'],normal,0,HateMM__non_hate_video_1.mp4__0_end__normal,108.832,,False


## Vision feature caching

In [9]:
# Extract frames with ffmpeg, then encode with OpenCLIP ViT-B/32 (if available) or torchvision ResNet50.
import subprocess, shlex, math, os, glob, random
from PIL import Image
import torch
import numpy as np

# Try import open_clip; else fallback to torchvision
# BACKEND records which encoder was set up: "open_clip" or "resnet50".
BACKEND = None
try:
    import open_clip
    model_clip, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
    model_clip.eval().to("cuda" if torch.cuda.is_available() else "cpu")
    tok = open_clip.get_tokenizer("ViT-B-32")  # not used by the vision caching code in this cell
    BACKEND = "open_clip"
except Exception:
    # Any failure above (import or model setup) switches to the torchvision ResNet-50 fallback.
    from torchvision import models, transforms
    resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    resnet.fc = torch.nn.Identity()  # drop the classifier head so the model outputs features
    resnet.eval().to("cuda" if torch.cuda.is_available() else "cpu")
    preprocess = transforms.Compose([
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ])
    BACKEND = "resnet50"
print("Vision backend:", BACKEND)

# Device that image batches are moved to in encode_images; same rule as used for the models above.
device = "cuda" if torch.cuda.is_available() else "cpu"

def ffmpeg_extract_frames(video_path: str, out_dir: Path, start=None, end=None, fps=1.0, size=IMG_SIZE):
    """Dump frames of ``video_path`` into ``out_dir`` as numbered JPEGs.

    Args:
        video_path: source video file.
        out_dir: destination folder; created if needed, and any ``*.jpg``
            already in it is removed first.
        start: optional segment start in seconds (negative values are
            treated as 0).
        end: optional segment end in seconds, on the source timeline.
        fps: sampling rate in frames per second.
        size: frames are scaled to fit ``size`` x ``size`` (aspect ratio
            kept) and padded to exactly that size.

    Returns:
        Sorted list of the written frame paths; empty when ffmpeg is not
        available or produced nothing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # Clean previous
    for stale in out_dir.glob("*.jpg"):
        stale.unlink(missing_ok=True)

    # Arguments are passed as a list (no shell), so paths need no quoting.
    cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
    seek = None if start is None else max(0.0, float(start))
    if seek is not None:
        cmd += ["-ss", f"{seek:.3f}"]
    if end is not None:
        # With -ss given before -i the timestamps restart at zero, so an
        # output-side "-to END" would keep END seconds counted from the seek
        # point. Express the end as an input-side duration instead.
        span = float(end) - (seek or 0.0)
        if span > 0:
            cmd += ["-t", f"{span:.3f}"]
    cmd += ["-i", str(video_path)]

    vf = f"fps={fps},scale={size}:{size}:force_original_aspect_ratio=decrease,pad={size}:{size}:(ow-iw)/2:(oh-ih)/2"
    cmd += ["-vf", vf, "-qscale:v", "2", str(out_dir / "%05d.jpg")]

    try:
        # Best effort: decode errors are reported by ffmpeg on stderr and
        # simply result in fewer (or no) frames.
        subprocess.run(cmd, check=False)
    except OSError:
        # ffmpeg is not installed or could not be started.
        return []
    return sorted(str(p) for p in out_dir.glob("*.jpg"))

@torch.no_grad()
def encode_images(img_paths):
    """Encode a list of image paths into a single feature vector (temporal mean).

    At most ``MAX_FRAMES`` images are used; longer lists are subsampled at
    evenly spaced positions. Images that cannot be opened or preprocessed
    are skipped. With the ``open_clip`` backend the per-image features are
    L2-normalized before averaging.

    Returns:
        A NumPy array holding the mean feature over the encoded images, or
        ``None`` when the list is empty or no image could be loaded.
    """
    if not img_paths:
        return None
    if len(img_paths) > MAX_FRAMES:
        # uniform subsample to MAX_FRAMES
        idx = np.linspace(0, len(img_paths) - 1, num=MAX_FRAMES).astype(int)
        img_paths = [img_paths[i] for i in idx]

    imgs = []
    for p in img_paths:
        try:
            # The context manager closes the file handle as soon as the
            # pixels have been converted, instead of waiting for GC.
            with Image.open(p) as im:
                imgs.append(preprocess(im.convert("RGB")))
        except Exception:
            # Best effort by design: one unreadable frame must not discard
            # the whole sample.
            continue
    if not imgs:
        return None

    batch = torch.stack(imgs, dim=0).to(device)
    if BACKEND == "open_clip":
        feats = model_clip.encode_image(batch)
        feats = torch.nn.functional.normalize(feats, dim=-1)  # cosine-friendly
    else:
        feats = resnet(batch)
    # Average over the image (batch) axis.
    feat = feats.mean(dim=0).detach().cpu().numpy()
    return feat

def vision_cache_one(sample_id, video_path, start_sec, end_sec):
    """Compute and cache the vision feature of one sample.

    The feature is stored as ``V_FEAT_DIR/<sample_id>.npy``. An existing
    file is reused without recomputation.

    Args:
        sample_id: unique id; used for the output file and the temporary
            frame folder.
        video_path: source video; a missing value (``None``/NaN) yields
            ``None``.
        start_sec, end_sec: optional segment bounds in seconds (NaN/None =
            unbounded). ``SEGMENT_PADDING`` is applied on both sides; an end
            that does not lie after the start is ignored.

    Returns:
        Path of the cached ``.npy`` file as a string, or ``None`` when no
        feature could be produced.
    """
    out_path = V_FEAT_DIR / f"{sample_id}.npy"
    if out_path.exists():
        return str(out_path)
    if video_path is None or pd.isna(video_path):
        # Unresolved media path: nothing to extract.
        return None

    # extract frames
    s = None if pd.isna(start_sec) else float(start_sec) - SEGMENT_PADDING
    e = None if pd.isna(end_sec) else float(end_sec) + SEGMENT_PADDING
    if s is not None and e is not None and e <= s:  # guard
        e = None

    frame_dir = V_FRAME_DIR / sample_id
    try:
        frames = ffmpeg_extract_frames(video_path, frame_dir, s, e, FPS, IMG_SIZE)
        feat = encode_images(frames)
    finally:
        # cleanup frames to save disk, also when extraction/encoding raised
        for f in frame_dir.glob("*.jpg"):
            f.unlink(missing_ok=True)
        if frame_dir.exists() and not any(frame_dir.iterdir()):
            frame_dir.rmdir()

    if feat is None:
        return None
    np.save(out_path, feat)
    return str(out_path)

# Run over manifest
# v_paths collects one entry per row: the cached .npy path, or None when caching failed.
v_paths = []
for i, r in df.iterrows():
    fp = vision_cache_one(r["sample_id"], r["video_path"], r.get("start_sec"), r.get("end_sec"))
    v_paths.append(fp)
    # `i` is the index label; the progress message assumes the default 0..N-1 index.
    if (i+1) % 100 == 0:
        print(f"[Vision] {i+1}/{len(df)} cached")
df["vision_feat_path"] = v_paths
print("Vision feature cached. Examples:", df["vision_feat_path"].head(3).tolist())


Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /root/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 197MB/s]


Vision backend: resnet50
[Vision] 100/1083 cached
[Vision] 200/1083 cached


[mov,mp4,m4a,3gp,3g2,mj2 @ 0x58fd4fd3c540] stream 0, offset 0x23cc8c9: partial file
/kaggle/input/thesis-dataset/Thesis_dataset/hate_videos/hate_video_95.mp4: Invalid data found when processing input
[h264 @ 0x58fd4fe3a200] Invalid NAL unit size (1472 > 1168).
[h264 @ 0x58fd4fe3a200] Error splitting the input into NAL units.
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x58fd4fd3c540] stream 0, offset 0x23cce16: partial file
/kaggle/input/thesis-dataset/Thesis_dataset/hate_videos/hate_video_95.mp4: Invalid data found when processing input
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x58fd4fd3c540] stream 0, offset 0x23cd31e: partial file
/kaggle/input/thesis-dataset/Thesis_dataset/hate_videos/hate_video_95.mp4: Invalid data found when processing input
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x58fd4fd3c540] stream 0, offset 0x23cd881: partial file
/kaggle/input/thesis-dataset/Thesis_dataset/hate_videos/hate_video_95.mp4: Invalid data found when processing input
Error while decoding stream #0:0: Invalid data found when processing inpu

[Vision] 300/1083 cached


Output file #0 does not contain any stream


[Vision] 400/1083 cached
[Vision] 500/1083 cached
[Vision] 600/1083 cached


Output file #0 does not contain any stream


[Vision] 700/1083 cached
[Vision] 800/1083 cached
[Vision] 900/1083 cached
[Vision] 1000/1083 cached
Vision feature cached. Examples: ['/kaggle/working/cache_hatemm/vision/HateMM__hate_video_1.mp4__0_end__hateful.npy', '/kaggle/working/cache_hatemm/vision/HateMM__hate_video_2.mp4__0_end__hateful.npy', '/kaggle/working/cache_hatemm/vision/HateMM__non_hate_video_1.mp4__0_end__normal.npy']


## Audio feature caching (log-mel mean using librosa)

In [10]:
# Extract audio (wav) with ffmpeg, then compute log-mel and mean-pool over time.
import librosa, soundfile as sf

def ffmpeg_extract_wav(video_path: str, out_wav: Path, start=None, end=None, sr=SR):
    """Extract the audio track of ``video_path`` as a mono WAV file.

    Args:
        video_path: source video file.
        out_wav: destination file; its parent folder is created if needed.
        start: optional segment start in seconds (negative values are
            treated as 0).
        end: optional segment end in seconds, on the source timeline.
        sr: output sample rate in Hz.

    Returns:
        ``True`` when ffmpeg exited successfully and a non-empty file was
        written, ``False`` otherwise (including ffmpeg not being available).
    """
    out_wav.parent.mkdir(parents=True, exist_ok=True)

    # Arguments are passed as a list (no shell), so paths need no quoting.
    cmd = ["ffmpeg", "-hide_banner", "-loglevel", "error", "-y"]
    seek = None if start is None else max(0.0, float(start))
    if seek is not None:
        cmd += ["-ss", f"{seek:.3f}"]
    if end is not None:
        # With -ss given before -i the timestamps restart at zero, so an
        # output-side "-to END" would keep END seconds counted from the seek
        # point. Express the end as an input-side duration instead.
        span = float(end) - (seek or 0.0)
        if span > 0:
            cmd += ["-t", f"{span:.3f}"]
    cmd += ["-i", str(video_path), "-ac", "1", "-ar", str(sr), "-vn", str(out_wav)]

    try:
        finished = subprocess.run(cmd, check=False)
    except OSError:
        # ffmpeg is not installed or could not be started.
        return False
    return finished.returncode == 0 and out_wav.exists() and out_wav.stat().st_size > 0

def audio_cache_one(sample_id, video_path, start_sec, end_sec, n_mels=64, hop_length=320, win_length=1024):
    """Compute and cache the audio feature (time-averaged log-mel) of one sample.

    The feature is stored as ``A_FEAT_DIR/<sample_id>.npy``. An existing
    file is reused without recomputation. The intermediate WAV file is
    removed on every exit path.

    Args:
        sample_id: unique id; used for the output and temporary file names.
        video_path: source video; a missing value (``None``/NaN) yields
            ``None``.
        start_sec, end_sec: optional segment bounds in seconds (NaN/None =
            unbounded). ``SEGMENT_PADDING`` is applied on both sides; an end
            that does not lie after the start is ignored.
        n_mels, hop_length, win_length: mel-spectrogram parameters passed to
            librosa.

    Returns:
        Path of the cached ``.npy`` file as a string, or ``None`` when the
        audio could not be extracted, is empty, or feature computation
        failed.
    """
    out_path = A_FEAT_DIR / f"{sample_id}.npy"
    if out_path.exists():
        return str(out_path)
    if video_path is None or pd.isna(video_path):
        # Unresolved media path: nothing to extract.
        return None

    s = None if pd.isna(start_sec) else float(start_sec) - SEGMENT_PADDING
    e = None if pd.isna(end_sec) else float(end_sec) + SEGMENT_PADDING
    if s is not None and e is not None and e <= s:
        e = None

    wav_path = WAV_DIR / f"{sample_id}.wav"
    try:
        if not ffmpeg_extract_wav(video_path, wav_path, s, e, SR):
            return None
        try:
            y, sr = librosa.load(str(wav_path), sr=SR, mono=True)
            if y.size == 0:
                return None
            S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels,
                                               hop_length=hop_length, win_length=win_length, power=2.0)
            logS = librosa.power_to_db(S, ref=np.max)
            feat = logS.mean(axis=1).astype(np.float32)  # mean over time -> (n_mels,)
            np.save(out_path, feat)
            return str(out_path)
        except Exception:
            # Best effort by design: a sample whose audio cannot be decoded
            # or processed simply gets no audio feature.
            return None
    finally:
        # cleanup wav to save disk (also after a failed or empty extraction)
        try:
            wav_path.unlink(missing_ok=True)
        except OSError:
            pass

# a_paths collects one entry per row: the cached .npy path, or None when caching failed.
a_paths = []
for i, r in df.iterrows():
    fp = audio_cache_one(r["sample_id"], r["video_path"], r.get("start_sec"), r.get("end_sec"))
    a_paths.append(fp)
    # `i` is the index label; the progress message assumes the default 0..N-1 index.
    if (i+1) % 100 == 0:
        print(f"[Audio] {i+1}/{len(df)} cached")
df["audio_feat_path"] = a_paths
print("Audio feature cached. Examples:", df["audio_feat_path"].head(3).tolist())

Output file #0 does not contain any stream
Output file #0 does not contain any stream
Output file #0 does not contain any stream


[Audio] 100/1083 cached


Output file #0 does not contain any stream


[Audio] 200/1083 cached


Output file #0 does not contain any stream
[mov,mp4,m4a,3gp,3g2,mj2 @ 0x5bfcdb518500] stream 1, offset 0x23ce62e: partial file
/kaggle/input/thesis-dataset/Thesis_dataset/hate_videos/hate_video_95.mp4: Invalid data found when processing input
Output file #0 does not contain any stream


[Audio] 300/1083 cached


Output file #0 does not contain any stream


[Audio] 400/1083 cached


Output file #0 does not contain any stream
Output file #0 does not contain any stream
Output file #0 does not contain any stream


[Audio] 500/1083 cached
[Audio] 600/1083 cached


Output file #0 does not contain any stream


[Audio] 700/1083 cached


Output file #0 does not contain any stream
Output file #0 does not contain any stream


[Audio] 800/1083 cached
[Audio] 900/1083 cached


Output file #0 does not contain any stream
Output file #0 does not contain any stream


[Audio] 1000/1083 cached
Audio feature cached. Examples: ['/kaggle/working/cache_hatemm/audio/HateMM__hate_video_1.mp4__0_end__hateful.npy', '/kaggle/working/cache_hatemm/audio/HateMM__hate_video_2.mp4__0_end__hateful.npy', '/kaggle/working/cache_hatemm/audio/HateMM__non_hate_video_1.mp4__0_end__normal.npy']


## Training manifest and export

In [11]:
# Keep only rows that have at least one feature (vision or audio)
vision_ok = df["vision_feat_path"].notna()
audio_ok = df["audio_feat_path"].notna()
usable = df.loc[vision_ok | audio_ok].copy()

# Per-modality availability flags for downstream filtering.
usable["has_vision"] = usable["vision_feat_path"].notna()
usable["has_audio"] = usable["audio_feat_path"].notna()

TRAIN_MANIFEST = OUT_DIR / "hatemm_train_manifest.csv"
usable.to_csv(TRAIN_MANIFEST, index=False)
print("Saved training manifest:", TRAIN_MANIFEST, "| rows:", len(usable))

preview_cols = ["sample_id","label","vision_feat_path","audio_feat_path"]
usable[preview_cols].head(5)


Saved training manifest: /kaggle/working/processed/hatemm_train_manifest.csv | rows: 1083


Unnamed: 0,sample_id,label,vision_feat_path,audio_feat_path
0,HateMM__hate_video_1.mp4__0_end__hateful,hateful,/kaggle/working/cache_hatemm/vision/HateMM__ha...,/kaggle/working/cache_hatemm/audio/HateMM__hat...
1,HateMM__hate_video_2.mp4__0_end__hateful,hateful,/kaggle/working/cache_hatemm/vision/HateMM__ha...,/kaggle/working/cache_hatemm/audio/HateMM__hat...
2,HateMM__non_hate_video_1.mp4__0_end__normal,normal,/kaggle/working/cache_hatemm/vision/HateMM__no...,/kaggle/working/cache_hatemm/audio/HateMM__non...
3,HateMM__hate_video_3.mp4__0_end__hateful,hateful,/kaggle/working/cache_hatemm/vision/HateMM__ha...,/kaggle/working/cache_hatemm/audio/HateMM__hat...
4,HateMM__non_hate_video_2.mp4__0_end__normal,normal,/kaggle/working/cache_hatemm/vision/HateMM__no...,


In [18]:
from pathlib import Path
import shutil, json

ROOT = Path("/kaggle/working/processed")
assert (ROOT/"hatemm_samples.cleaned.csv").exists(), "Run the cleaning steps first."

# --- export options ---
EXPORT_FEATURES = True  # True to bundle the cached .npy features (large, but ready to train)

# Minimal file list needed by the follow-up notebook; files that do not
# exist (e.g. the optional label2id.json) are skipped.
to_copy = [
    ROOT/"hatemm_samples.cleaned.csv",
    ROOT/"hatemm_train_manifest.csv",       # written by the training-manifest cell
    ROOT/"hatemm_statistics_cleaned.csv",   # statistics table
    ROOT/"hatemm_cleaning_per_class.csv",   # if present
    ROOT/"hatemm_samples.csv",              # if present
    ROOT/"label2id.json",                   # if present
]
to_copy = [p for p in to_copy if p.exists()]

# Add the feature folders when requested
extra_dirs = []
if EXPORT_FEATURES:
    extra_dirs += [
        Path("/kaggle/working/cache_hatemm/vision"),
        Path("/kaggle/working/cache_hatemm/audio"),
        # The intermediate frame folder is named "frames" (see V_FRAME_DIR).
        Path("/kaggle/working/cache_hatemm/frames"),
        Path("/kaggle/working/cache_hatemm/wav")
    ]


In [19]:
# /kaggle/working/cache_hatemm
from pathlib import Path
import zipfile
from datetime import datetime

EXPORT_DIR = Path("/kaggle/working/export_hatemm")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Minute-resolution timestamp used as the package version and in the file name.
version = datetime.now().strftime("%Y%m%d_%H%M")
zip_path = EXPORT_DIR / f"hatemm_processed_{version}.zip"

# Only advertise cached features in the README when they are actually packaged.
features_line = (
    "- features/vision/*.npy and features/audio/*.npy : cached features (optional)"
    if EXPORT_FEATURES else ""
)

readme = f"""HateMM processed package
Version: {version}

Contents:
- hatemm_samples.cleaned.csv : cleaned manifest (one row = one label/segment)
- hatemm_train_manifest.csv  : training manifest with feature paths (if features included)
- hatemm_statistics_cleaned.csv : dataset statistics (per-class, avg lengths)
- hatemm_cleaning_per_class.csv : per-class drop stats before/after cleaning
- hatemm_samples.csv : raw manifest before cleaning
- label2id.json : label mapping used in training (if present)
{features_line}

How to use in a new notebook:
1) Unzip to a working folder
2) If no features inside, re-run feature caching (or reuse your Cell 9–12)
3) Load manifests, build X/Y matrices, and train baseline.
"""

# build zip
with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf:
    # files: stored flat at the archive root
    for p in to_copy:
        zf.write(p, arcname=p.name)
    # features (optional)
    if EXPORT_FEATURES:
        for d in extra_dirs:
            if d.exists():
                # Only .npy files are packaged, so folders holding just
                # intermediate frames/wavs contribute nothing.
                for npy in d.rglob("*.npy"):
                    # put under features/vision or features/audio
                    rel = Path("features") / d.name / npy.name
                    zf.write(npy, arcname=str(rel))
    # readme
    zf.writestr("README.txt", readme)

print("Created:", zip_path)

Created: /kaggle/working/export_hatemm/hatemm_processed_20250919_0501.zip


In [None]:
# Display the feature directories that were considered for the export.
extra_dirs