# Hate Speech Detection — Colab Notebook (Improved)
Notebook này tự chạy trên **Google Colab** cho dự án Hate Speech.
**Tính năng chính**:
- Tự động đọc dữ liệu từ `data/` (hoặc upload zip)
- Chuẩn hoá cột (`text`, `label`)
- Huấn luyện **vinai/phobert-base** (mặc định) với **EarlyStopping**
- Tính **accuracy / precision / recall / f1** và hiển thị **confusion matrix**
- Hỗ trợ **class weights** để xử lý mất cân bằng

> Nếu gặp lỗi tải PhoBERT, đổi sang `bert-base-multilingual-cased`.


## 0) Kiểm tra GPU

In [None]:

import torch, sys, platform
print("PyTorch:", torch.__version__, "Python:", sys.version.split()[0])
!nvidia-smi || echo "No GPU? Vào Runtime ▸ Change runtime type ▸ chọn GPU"


## 1) Cài thư viện cần thiết

In [None]:

# Install/upgrade the listed third-party packages (quiet output).
!pip -q install -U transformers datasets evaluate accelerate scikit-learn matplotlib
# Disable Weights & Biases to avoid its login prompt
import os
os.environ["WANDB_MODE"] = "disabled"


## 2) Lấy dữ liệu
### Cách A (khuyến nghị): Upload zip dự án (chứa thư mục `hate speech/`)

In [None]:

from google.colab import files
import zipfile, io, os, shutil

UPLOAD_DIR = "/content/project"
os.makedirs(UPLOAD_DIR, exist_ok=True)

print("Chọn file zip dự án (ví dụ: hate speech (3).zip) ...")
uploaded = files.upload()
for name, data in uploaded.items():
    if name.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(data), 'r') as zf:
            zf.extractall(UPLOAD_DIR)
        print("Đã giải nén vào:", UPLOAD_DIR)

# Đoán thư mục gốc dự án
# Tìm thư mục có tên 'hate speech'
import glob
candidates = glob.glob(UPLOAD_DIR + "/**/hate speech", recursive=True)
PROJECT_DIR = candidates[0] if candidates else UPLOAD_DIR  # fallback
print("PROJECT_DIR =", PROJECT_DIR)

DATA_DIR = PROJECT_DIR + "/data"
print("DATA_DIR =", DATA_DIR)
!ls -R "$DATA_DIR" || echo "Không tìm thấy thư mục data/. Hãy tự đặt đường dẫn DATA_DIR."


### Cách B: Mount Google Drive (tuỳ chọn)

In [None]:

# from google.colab import drive
# drive.mount('/content/drive')
# PROJECT_DIR = "/content/drive/MyDrive/your_project/hate speech"
# DATA_DIR = PROJECT_DIR + "/data"


## 3) Đọc & chuẩn hoá dữ liệu (`text`, `label`)

In [None]:

import os, pandas as pd, numpy as np

def _load_csv(path):
    df = pd.read_csv(path)
    # chuẩn hoá tên cột
    cols = {c.lower().strip(): c for c in df.columns}
    # map synonym -> canonical
    if "text" not in cols and "free_text" in cols:
        df.rename(columns={cols["free_text"]:"text"}, inplace=True)
    if "label" not in cols and "label_id" in cols:
        df.rename(columns={cols["label_id"]:"label"}, inplace=True)
    # Nếu vẫn chưa có 'text' hoặc 'label', thử suy đoán
    if "text" not in df.columns:
        # chọn cột string dài nhất
        str_cols = [c for c in df.columns if df[c].dtype == object]
        if str_cols:
            df.rename(columns={str_cols[0]:"text"}, inplace=True)
    if "label" not in df.columns:
        # nếu còn cột duy nhất kiểu int, dùng làm label
        int_cols = [c for c in df.columns if np.issubdtype(df[c].dtype, np.integer)]
        if int_cols:
            df.rename(columns={int_cols[0]:"label"}, inplace=True)
    keep = [c for c in ["text","label"] if c in df.columns]
    return df[keep]

def try_paths(root):
    """Locate the train/dev/test CSV triple below *root*.

    The files directly under ``root`` are preferred; ``root/vihsd`` is tried
    second. A location only counts when all three files exist.

    Returns:
        Tuple ``(train_path, dev_path, test_path)``.

    Raises:
        FileNotFoundError: neither location holds the complete triple.
    """
    split_files = ("/train.csv", "/dev.csv", "/test.csv")
    for base in (root, root + "/vihsd"):
        triple = tuple(base + fname for fname in split_files)
        if all(os.path.exists(p) for p in triple):
            return triple
    raise FileNotFoundError("Không tìm thấy bộ 3 file train/dev/test trong data/ hoặc data/vihsd/.")

# Resolve the three split files, then load each one through _load_csv.
train_path, dev_path, test_path = try_paths(DATA_DIR)
train_df = _load_csv(train_path)
dev_df = _load_csv(dev_path)
test_df = _load_csv(test_path)
print(train_df.shape, dev_df.shape, test_df.shape)
# Preview the first rows of the train and dev splits.
for preview_df in (train_df, dev_df):
    print(preview_df.head(3))

# Làm sạch cơ bản
def clean_text(s):
    """Collapse every whitespace run in *s* to one space and trim the ends.

    Anything that is not a ``str`` yields the empty string.
    """
    return " ".join(s.split()) if isinstance(s, str) else ""
for df in (train_df, dev_df, test_df):
    # Drop incomplete rows BEFORE the string cast: astype(str) turns a missing
    # value into the literal text "nan", which dropna can no longer detect and
    # which would otherwise be kept as a real sample.
    df.dropna(subset=["text", "label"], inplace=True)
    df["text"] = df["text"].astype(str).map(clean_text)

# Make sure every split's labels are integer ids in 0..K-1.
# The label vocabulary is built from the training split only; a dev/test row
# whose label never occurs in train is reported instead of silently turning
# into NaN.
_train_labels = train_df["label"]
# Integer-valued floats (e.g. a column pandas read as float) are still
# treated as integer labels rather than as the strings "0.0", "1.0", ...
labels_are_int = bool(
    pd.api.types.is_integer_dtype(_train_labels)
    or (pd.api.types.is_float_dtype(_train_labels) and (_train_labels % 1 == 0).all())
)
if labels_are_int:
    uniq = sorted(int(v) for v in _train_labels.unique())
else:
    uniq = sorted(_train_labels.astype(str).unique().tolist())

# original label value -> contiguous id, and id -> printable label name
label2id = {v: i for i, v in enumerate(uniq)}
id2label = {i: str(v) for v, i in label2id.items()}

for split_name, df in (("train", train_df), ("dev", dev_df), ("test", test_df)):
    keys = df["label"].astype(int) if labels_are_int else df["label"].astype(str)
    mapped = keys.map(label2id)
    if mapped.isna().any():
        unseen = sorted(set(keys[mapped.isna()].tolist()))
        raise ValueError(f"Tập {split_name} chứa nhãn không có trong train: {unseen}")
    df["label"] = mapped.astype(int)

num_labels = len(uniq)
print("num_labels =", num_labels)
print("label2id:", label2id)

# Keep a copy of the normalised splits on disk (for reference).
TMP_DIR = "/content/tmp_processed"
os.makedirs(TMP_DIR, exist_ok=True)
train_df.to_csv(TMP_DIR + "/train.csv", index=False)
dev_df.to_csv(TMP_DIR + "/dev.csv", index=False)
test_df.to_csv(TMP_DIR + "/test.csv", index=False)


## 4) Khám phá nhanh (EDA)

In [None]:

import pandas as pd
import matplotlib.pyplot as plt

def plot_dist(df, title):
    """Show a bar chart of the row count per value of df['label'], ordered by label."""
    counts = df['label'].value_counts().sort_index()
    ax = counts.plot(kind='bar', rot=0)
    ax.set_title(title)
    ax.set_xlabel("label")
    ax.set_ylabel("count")
    plt.show()

plot_dist(train_df, "Train label distribution")
plot_dist(dev_df,   "Dev label distribution")
plot_dist(test_df,  "Test label distribution")
# The line break must be the escape sequence "\n": a raw newline inside a
# single-quoted string literal is a SyntaxError. The sample size is capped so
# a training set with fewer than 3 rows does not raise.
print("Ví dụ mẫu:\n", train_df.sample(min(3, len(train_df)), random_state=42))


## 5) Tokenizer & HuggingFace Dataset

In [None]:

from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

MODEL_NAME = "vinai/phobert-base"  # change if needed
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_fn(batch):
    """Tokenize ``batch["text"]``, truncating to 256 tokens and adding no padding."""
    encode_options = dict(truncation=True, max_length=256, padding=False)
    return tokenizer(batch["text"], **encode_options)

# One HuggingFace Dataset per split, built from the pandas frames without
# carrying their index over as a column.
split_frames = {"train": train_df, "validation": dev_df, "test": test_df}
hf_ds = DatasetDict({
    split: Dataset.from_pandas(frame, preserve_index=False)
    for split, frame in split_frames.items()
})
hf_ds = hf_ds.map(tokenize_fn, batched=True)

# Keep only the token ids, the attention mask and the label.
kept_columns = ("input_ids", "attention_mask", "label")
dropped_columns = [c for c in hf_ds["train"].column_names if c not in kept_columns]
hf_ds = hf_ds.remove_columns(dropped_columns)
hf_ds


## 6) Huấn luyện — Trainer + Class Weights + EarlyStopping

In [None]:

import numpy as np, torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
import evaluate
from sklearn.utils.class_weight import compute_class_weight

# Classification head with num_labels outputs; label names are the ids as strings.
label_ids = range(num_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
    label2id={str(k): k for k in label_ids},
    id2label={k: str(k) for k in label_ids},
)

# Pads each batch at collation time (tokenize_fn adds no padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the four metric objects, in the same order as before.
acc_metric, prec_metric, rec_metric, f1_metric = (
    evaluate.load(metric_name)
    for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro precision/recall/F1.

    Predictions are the argmax over the last axis of the logits.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    scores = {
        "accuracy": acc_metric.compute(predictions=preds, references=labels)["accuracy"],
    }
    # The remaining three metrics share one call shape: macro-averaged.
    macro_metrics = (("precision", prec_metric), ("recall", rec_metric), ("f1", f1_metric))
    for key, metric in macro_metrics:
        scores[key] = metric.compute(predictions=preds, references=labels, average="macro")[key]
    return scores

# Class weights (optional): "balanced" weights computed from the training labels.
USE_CLASS_WEIGHTS = True
if not USE_CLASS_WEIGHTS:
    class_weights = None
else:
    y = train_df["label"].values
    classes = np.unique(y)
    weights = compute_class_weight(class_weight="balanced", classes=classes, y=y)
    class_weights = torch.tensor(weights, dtype=torch.float)

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``.

    When ``class_weights`` is ``None`` the loss is plain, unweighted
    cross-entropy.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the (optionally class-weighted) cross-entropy loss.

        Args:
            model: The model to run the forward pass on.
            inputs: Batch mapping; ``labels`` is removed before the forward
                pass so the loss is computed here rather than by the model.
            return_outputs: When true, return ``(loss, outputs)``.
            num_items_in_batch: Accepted because recent transformers releases
                pass it to ``compute_loss``; without the parameter the call
                raises TypeError. It is not used by this loss.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.get("logits")
        # The weight vector must live on the same device as the logits.
        weight = class_weights.to(logits.device) if class_weights is not None else None
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

import inspect

# The notebook installs transformers with `-U`, so it must run on whatever
# release is current. Two keyword names differ between releases
# (`evaluation_strategy` -> `eval_strategy`, `tokenizer` -> `processing_class`);
# pick the one the installed version actually declares.
_ta_params = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _ta_params else "evaluation_strategy"
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

args = TrainingArguments(
    output_dir="/content/results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    # Evaluate and checkpoint on the same 200-step schedule so the best
    # checkpoint (by macro F1) can be reloaded at the end.
    **{_eval_strategy_key: "steps"},
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=1,
    report_to="none",
    seed=42,
)

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=hf_ds["train"],
    eval_dataset=hf_ds["validation"],
    **{_tokenizer_key: tokenizer},
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop once the monitored metric fails to improve for 2 evaluations in a row.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()


## 7) Đánh giá trên Test + Ma trận nhầm lẫn

In [None]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

pred = trainer.predict(hf_ds["test"])
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=-1)

print(classification_report(y_true, y_pred, digits=4))

# Fix the label set explicitly: without `labels=` the matrix only covers
# classes that occur in y_true/y_pred, so its shape could be smaller than
# num_labels and no longer line up with the tick labels below.
all_labels = list(range(num_labels))
cm = confusion_matrix(y_true, y_pred, labels=all_labels)
print("Confusion matrix:\n", cm)

# plot
import itertools
plt.figure(figsize=(5,4))
plt.imshow(cm, interpolation='nearest')
plt.title("Confusion Matrix")
plt.colorbar()
tick_marks = np.arange(num_labels)
tick_names = [str(i) for i in all_labels]
plt.xticks(tick_marks, tick_names, rotation=45)
plt.yticks(tick_marks, tick_names)
# Cells above half of the largest count get white text, the rest black.
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j, i, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()


## 8) Lưu model lên Google Drive (tuỳ chọn)

In [None]:

# from google.colab import drive
# drive.mount('/content/drive')
# SAVE_DIR = "/content/drive/MyDrive/hate_speech_model_phobert"
# trainer.save_model(SAVE_DIR)
# tokenizer.save_pretrained(SAVE_DIR)
# print("Saved to", SAVE_DIR)


## 9) Suy luận nhanh (inference)

In [None]:

def predict_texts(texts):
    """Predict a label id for each input text with ``trainer.model``.

    Args:
        texts: One string, or a sequence of strings.

    Returns:
        A list of predicted label ids (ints), one per input text. An empty
        input yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []
    model = trainer.model
    model.eval()  # inference mode: turn dropout off
    # padding=True is required here: texts of different lengths cannot be
    # stacked into one "pt" tensor otherwise.
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits
    preds = torch.argmax(logits, dim=-1).cpu().numpy().tolist()
    return preds

samples = [
    "Mày thật là đồ vô học!",
    "Mọi người ơi, hãy giúp nhau và tử tế hơn nhé.",
]
print(predict_texts(samples))
