
# Thesis Chapters 4–5 — Data Processing Notebook
**Datasets**: MultiHateClip, HateMM, HateClipSeg  
**Environment**: Kaggle

This notebook standardizes the three datasets into CSV manifests that point to the **actual video paths** on Kaggle.

**Input root (read-only):** `/kaggle/input/dataset-sample`  
**Output root (writable):** `/kaggle/working/processed`

> Comments are in English as requested.


In [3]:

# === Imports & Global Paths ===
from pathlib import Path
import pandas as pd

# Kaggle dataset root (read-only)
DATASET_ROOT = Path("/kaggle/input/dataset-sample/preprocess_lab/data/raw")

# Output directory for processed manifests (writable on Kaggle)
OUT_DIR = Path("/kaggle/working/processed")
OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Input root:", DATASET_ROOT)
print("Output root:", OUT_DIR)


Input root: /kaggle/input/dataset-sample/preprocess_lab/data/raw
Output root: /kaggle/working/processed


## 1) MultiHateClip — Load annotations and map to local video files

In [4]:

# MultiHateClip folder and expected files
mh_root = DATASET_ROOT / "MultiHateClip"

# Read splits (TSV format)
mh_train = pd.read_csv(mh_root / "train.tsv", sep="\t")
mh_valid = pd.read_csv(mh_root / "valid.tsv", sep="\t")
mh_test  = pd.read_csv(mh_root / "test.tsv",  sep="\t")

# Concatenate and add a 'split' column
df_mh = pd.concat([mh_train, mh_valid, mh_test], keys=["train","valid","test"], names=["split"])
df_mh.reset_index(level=0, inplace=True)
df_mh.rename(columns={"level_0": "split"}, inplace=True)

# Map each row to its actual .mp4 path under /data/{split}/
def mh_path(row):
    split = row["split"]
    # Some versions use 'video_id' column; fallback to first column if missing
    vid = row["video_id"] if "video_id" in row else row.iloc[0]
    return str(mh_root / "data" / split / f"{vid}.mp4")

df_mh["video_path"] = df_mh.apply(mh_path, axis=1)
df_mh["dataset"] = "MultiHateClip"

print("MultiHateClip rows:", len(df_mh))
df_mh.head(3)


MultiHateClip rows: 1001


Unnamed: 0,split,Video_ID,Majority_Voting,Label,Target_Victim,Component,Duration,video_path,dataset
0,train,4V0KGql_fUI,Normal,"['Normal', 'Normal']",[],[],[],/kaggle/input/dataset-sample/preprocess_lab/da...,MultiHateClip
1,train,5snzFreG79c,Offensive,"['Offensive', 'Offensive']",['Couple'],"['Transcript', 'Audio', 'Metadata']","[(30, 39)]",/kaggle/input/dataset-sample/preprocess_lab/da...,MultiHateClip
2,train,EyE82W10wgk,Normal,"['Normal', 'Counter Narrative']",[],[],[],/kaggle/input/dataset-sample/preprocess_lab/da...,MultiHateClip


## 2) HateMM — Load annotations and map to hate/non-hate folders

In [5]:

hmm_root = DATASET_ROOT / "HateMM"
df_hmm = pd.read_csv(hmm_root / "HateMM_annotation.csv")

# Resolve each video to either hate_videos/ or non_hate_videos/ based on the label
def hmm_path(row):
    sub = "hate_videos" if str(row["label"]).strip().lower() == "hate" else "non_hate_videos"
    return str(hmm_root / "data" / sub / row["video_file_name"])

df_hmm["video_path"] = df_hmm.apply(hmm_path, axis=1)
df_hmm["dataset"] = "HateMM"

print("HateMM rows:", len(df_hmm))
df_hmm.head(3)


HateMM rows: 1083


Unnamed: 0,video_file_name,label,hate_snippet,target,video_path,dataset
0,hate_video_1.mp4,Hate,"[['00:00:34', '00:01:34']]",Blacks,/kaggle/input/dataset-sample/preprocess_lab/da...,HateMM
1,hate_video_2.mp4,Hate,"[['00:00:06', '00:02:06']]",Blacks,/kaggle/input/dataset-sample/preprocess_lab/da...,HateMM
2,non_hate_video_1.mp4,Non Hate,,Others,/kaggle/input/dataset-sample/preprocess_lab/da...,HateMM


## 3) HateClipSeg — Flatten segment-level annotations and map to segment video files

In [6]:

hcs_root = DATASET_ROOT / "HateClipSeg"
df_video = pd.read_csv(hcs_root / "video_level_annotation.csv")  # kept for reference
df_seg   = pd.read_csv(hcs_root / "segment_level_annotation.csv")

# Expand segment-level entries: one row per segment with start/end and a label vector
records = []
for _, row in df_seg.iterrows():
    vid = row["Video ID"]
    # segment-level label is stored as a stringified list-of-lists; same for timestamps
    labels = eval(row["Segment-Level Label"])
    times  = eval(row["Segment Timestamp"])
    for lbl_vec, (t0, t1) in zip(labels, times):
        records.append({
            "Video ID": vid,
            "Segment Labels": lbl_vec,    # e.g., [0,1,0,0,0,0] (index mapping in the dataset README)
            "Start": t0,
            "End": t1,
            # Point to segment-level videos if present
            "video_path": str(hcs_root / "data" / "segment_level" / f"{vid}.mp4")
        })

df_hcs = pd.DataFrame.from_records(records)
df_hcs["dataset"] = "HateClipSeg"

print("HateClipSeg rows:", len(df_hcs))
df_hcs.head(3)


KeyError: 'Video ID'

### (Optional) Normalize labels to a canonical set for training convenience

In [None]:

# This section provides helper functions to standardize label names across datasets,
# if you need a unified set like: ['normal','hateful','insulting','sexual','violence','self-harm','offensive'].
# It is optional and safe to skip for raw manifests.

def normalize_label_string(x: str) -> str:
    m = {
        "non hate": "normal",
        "nonhate": "normal",
        "non_hate": "normal",
        "non-hate": "normal",
        "normal": "normal",
        "hate": "hateful",
        "hateful": "hateful",
        "offensive": "offensive",
        "insult": "insulting",
        "insulting": "insulting",
        "sexual": "sexual",
        "violence": "violence",
        "violent": "violence",
        "harm": "self-harm",
        "self-harm": "self-harm",
        "self_harm": "self-harm"
    }
    k = str(x).strip().lower()
    return m.get(k, k)

# Example: apply to HateMM (binary) → 'normal' / 'hateful'
if "label" in df_hmm.columns:
    df_hmm["label_canonical"] = df_hmm["label"].apply(normalize_label_string)
df_hmm.head(2)


## 4) Save processed manifests to `/kaggle/working/processed`

In [None]:

(df_mh
 .to_csv(OUT_DIR / "multihateclip.csv", index=False))
(df_hmm
 .to_csv(OUT_DIR / "hatemm.csv", index=False))
(df_hcs
 .to_csv(OUT_DIR / "hateclipseg.csv", index=False))

print("Saved:")
print(" -", OUT_DIR / "multihateclip.csv")
print(" -", OUT_DIR / "hatemm.csv")
print(" -", OUT_DIR / "hateclipseg.csv")


## 5) Quick sanity checks

In [None]:

# Check existence of a few sample files to ensure paths are correct.
# (Kaggle inputs can be read; not all files are guaranteed to exist in the sample.)
from pathlib import Path

def exists(p): 
    try: 
        return Path(p).exists()
    except Exception:
        return False

print("Sample MultiHateClip path exists?",
      exists(df_mh.loc[df_mh.index[0], "video_path"]) if len(df_mh) else None)

print("Sample HateMM path exists?",
      exists(df_hmm.loc[df_hmm.index[0], "video_path"]) if len(df_hmm) else None)

print("Sample HateClipSeg path exists?",
      exists(df_hcs.loc[df_hcs.index[0], "video_path"]) if len(df_hcs) else None)



### Next steps
- Use these CSVs as input to your Chapter 4 training pipeline (frame/audio extraction, ASR, feature caching).
- If you need, I can extend this notebook with ffmpeg/OpenCV extractors and a CLIP/BERT/wav2vec2 feature cache.
