In [2]:
# Cell 1: Imports & basic config

import os
import json
from collections import Counter

import numpy as np
import pandas as pd

from sklearn.metrics import (
    classification_report,
    f1_score,
    accuracy_score
)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM,
    GlobalMaxPool1D, Dense, Dropout
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Reproducibility
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# ==== EDIT THESE PATHS TO MATCH YOUR DRIVE ====
PATH_DATASET_JSON = "/content/drive/MyDrive/HateXplain/Data/dataset.json"
PATH_SPLIT_JSON   = "/content/drive/MyDrive/HateXplain/Data/post_id_divisions.json"
PATH_GLOVE_TXT    = "/content/drive/MyDrive/glove.6B.100d.txt"  # adjust if needed

MAX_LEN = 80  # sequence length for BiLSTM
EMBED_DIM = 100  # should match GloVe file (100 for glove.6B.100d)


In [3]:
# Cell 2: Load HateXplain dataset and build df

# Load main dataset
with open(PATH_DATASET_JSON, "r") as f:
    data = json.load(f)

print(f"Loaded {len(data)} posts from dataset.json")

rows = []

for post_id, entry in data.items():
    tokens = entry.get("post_tokens", [])
    annotators = entry.get("annotators", [])

    # Reconstruct full text
    text = " ".join(tokens)

    # Collect annotator labels
    labels = [a.get("label") for a in annotators if "label" in a]
    labels = [l for l in labels if l is not None]
    if len(labels) == 0:
        continue

    # Majority vote over ["hatespeech","offensive","normal"]
    label_counts = Counter(labels)
    majority_label = label_counts.most_common(1)[0][0]

    # Collect targets from annotators
    targets = []
    for a in annotators:
        t = a.get("target", [])
        if isinstance(t, list):
            targets.extend(t)
    if len(targets) == 0:
        main_target = "None"
    else:
        main_target = Counter(targets).most_common(1)[0][0]

    rows.append({
        "post_id": post_id,
        "text": text,
        "label_3way": majority_label,   # Task 1
        "target_group_raw": main_target # Task 2
    })

df = pd.DataFrame(rows)
print("DataFrame shape:", df.shape)
df.head()


Loaded 20148 posts from dataset.json
DataFrame shape: (20148, 4)


Unnamed: 0,post_id,text,label_3way,target_group_raw
0,1179055004553900032_twitter,i dont think im getting my baby them white 9 h...,normal,
1,1179063826874032128_twitter,we cannot continue calling ourselves feminists...,normal,
2,1178793830532956161_twitter,nawt yall niggers ignoring me,normal,African
3,1179088797964763136_twitter,<user> i am bit confused coz chinese ppl can n...,hatespeech,Asian
4,1179085312976445440_twitter,this bitch in whataburger eating a burger with...,hatespeech,Caucasian


In [4]:
# Cell 3: Attach official HateXplain split info

with open(PATH_SPLIT_JSON, "r") as f:
    split_data = json.load(f)

train_ids = set(split_data.get("train", []))
val_ids   = set(split_data.get("val", []))
test_ids  = set(split_data.get("test", []))

def get_split(pid):
    if pid in train_ids:
        return "train"
    elif pid in val_ids:
        return "val"
    elif pid in test_ids:
        return "test"
    else:
        return "unknown"

df["split"] = df["post_id"].apply(get_split)

print(df["split"].value_counts())
df.head()


split
train      15383
test        1924
val         1922
unknown      919
Name: count, dtype: int64


Unnamed: 0,post_id,text,label_3way,target_group_raw,split
0,1179055004553900032_twitter,i dont think im getting my baby them white 9 h...,normal,,test
1,1179063826874032128_twitter,we cannot continue calling ourselves feminists...,normal,,train
2,1178793830532956161_twitter,nawt yall niggers ignoring me,normal,African,train
3,1179088797964763136_twitter,<user> i am bit confused coz chinese ppl can n...,hatespeech,Asian,train
4,1179085312976445440_twitter,this bitch in whataburger eating a burger with...,hatespeech,Caucasian,val


In [5]:
# Cell 4: Label structure for Task 1 (3-class) and Task 2 (12-class)

# Task 1: 3-way label mapping
label_map_3 = {
    "normal": 0,
    "offensive": 1,
    "hatespeech": 2,
}

df["label_3way_id"] = df["label_3way"].map(label_map_3)

print("3-way label distribution:")
print(df["label_3way"].value_counts())
print("\n3-way IDs distribution:")
print(df["label_3way_id"].value_counts().sort_index())


# Task 2: normalize target groups into 12 categories

target_vocab = [
    "African", "Arab", "Asian", "Caucasian", "Hispanic",
    "Homosexual", "Islam", "Jewish", "None",
    "Other", "Refugee", "Women"
]

def normalize_target(t):
    if t in target_vocab:
        return t
    # Anything else goes into "Other"
    return "Other"

df["target_group"] = df["target_group_raw"].apply(normalize_target)

# Map to IDs
target_to_id = {g: i for i, g in enumerate(target_vocab)}
id_to_target = {i: g for g, i in target_to_id.items()}

df["target_group_id"] = df["target_group"].map(target_to_id)

print("\nTarget group distribution (normalized):")
print(df["target_group"].value_counts())


3-way label distribution:
label_3way
normal        8153
hatespeech    6234
offensive     5761
Name: count, dtype: int64

3-way IDs distribution:
label_3way_id
0    8153
1    5761
2    6234
Name: count, dtype: int64

Target group distribution (normalized):
target_group
None          6901
African       3107
Islam         1932
Homosexual    1692
Jewish        1633
Women         1219
Other         1149
Refugee        848
Arab           586
Caucasian      467
Asian          357
Hispanic       257
Name: count, dtype: int64


In [6]:
# Cell 5: Train/Val/Test splits for both tasks

train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "val"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

# Task 1 labels
y_train_3 = train_df["label_3way_id"].values
y_val_3   = val_df["label_3way_id"].values
y_test_3  = test_df["label_3way_id"].values

# Task 2 labels
y_train_group = train_df["target_group_id"].values
y_val_group   = val_df["target_group_id"].values
y_test_group  = test_df["target_group_id"].values

# Text
X_train_text = train_df["text"].astype(str).values
X_val_text   = val_df["text"].astype(str).values
X_test_text  = test_df["text"].astype(str).values


Train: 15383, Val: 1922, Test: 1924


In [7]:
# Cell 6: Tokenize text and pad sequences

tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_val_seq   = tokenizer.texts_to_sequences(X_val_text)
X_test_seq  = tokenizer.texts_to_sequences(X_test_text)

X_train_seq = pad_sequences(X_train_seq, maxlen=MAX_LEN, padding="post", truncating="post")
X_val_seq   = pad_sequences(X_val_seq,   maxlen=MAX_LEN, padding="post", truncating="post")
X_test_seq  = pad_sequences(X_test_seq,  maxlen=MAX_LEN, padding="post", truncating="post")

print("Seq shapes:", X_train_seq.shape, X_val_seq.shape, X_test_seq.shape)
vocab_size = len(tokenizer.word_index) + 1
print("Vocab size:", vocab_size)


Seq shapes: (15383, 80) (1922, 80) (1924, 80)
Vocab size: 24912


In [10]:
# Cell 7: Load GloVe and build embedding matrix

embeddings_index = {}

with open(PATH_GLOVE_TXT, "r", encoding="utf8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        if coefs.shape[0] != EMBED_DIM:
            continue
        embeddings_index[word] = coefs

print(f"Loaded {len(embeddings_index)} word vectors from GloVe.")

embedding_matrix = np.random.normal(scale=0.6, size=(vocab_size, EMBED_DIM)).astype("float32")

for word, idx in tokenizer.word_index.items():
    if idx >= vocab_size:
        continue
    vec = embeddings_index.get(word)
    if vec is not None:
        embedding_matrix[idx] = vec

embedding_matrix.shape


Loaded 400000 word vectors from GloVe.


(24912, 100)

In [11]:
# Cell 8: Define BiLSTM for Task 1 (3-class: normal, offensive, hatespeech)

num_classes_task1 = 3

inputs = Input(shape=(MAX_LEN,), name="input_ids")
x = Embedding(
    input_dim=vocab_size,
    output_dim=EMBED_DIM,
    weights=[embedding_matrix],
    input_length=MAX_LEN,
    trainable=False,
    name="embedding"
)(inputs)

x = Bidirectional(LSTM(64, return_sequences=True), name="bilstm")(x)
x = GlobalMaxPool1D(name="global_max_pool")(x)
x = Dense(64, activation="relu", name="dense")(x)
x = Dropout(0.5, name="dropout")(x)
outputs = Dense(num_classes_task1, activation="softmax", name="output")(x)

bilstm_task1 = Model(inputs=inputs, outputs=outputs, name="bilstm_task1_3class")
bilstm_task1.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

bilstm_task1.summary()




In [13]:
# Cell 9: Train BiLSTM for Task 1

es = EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min",
    restore_best_weights=True,
    verbose=1,
)

history_task1 = bilstm_task1.fit(
    X_train_seq,
    y_train_3,
    validation_data=(X_val_seq, y_val_3),
    epochs=15,
    batch_size=64,
    callbacks=[es],
    verbose=1,
)


Epoch 1/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 154ms/step - accuracy: 0.4599 - loss: 1.0375 - val_accuracy: 0.6041 - val_loss: 0.8645
Epoch 2/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 162ms/step - accuracy: 0.6165 - loss: 0.8516 - val_accuracy: 0.6384 - val_loss: 0.8090
Epoch 3/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 156ms/step - accuracy: 0.6610 - loss: 0.7985 - val_accuracy: 0.6384 - val_loss: 0.7893
Epoch 4/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 159ms/step - accuracy: 0.6828 - loss: 0.7508 - val_accuracy: 0.6457 - val_loss: 0.7822
Epoch 5/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 162ms/step - accuracy: 0.7032 - loss: 0.7150 - val_accuracy: 0.6363 - val_loss: 0.7928
Epoch 6/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 173ms/step - accuracy: 0.7171 - loss: 0.6713 - val_accuracy: 0.6467 - val_loss: 0.7806
Epoch 7/15

In [14]:
# Cell 10: Evaluation for Task 1 (3-class)

label_names_3 = ["normal", "offensive", "hatespeech"]

# Validation
y_val_pred_prob_3 = bilstm_task1.predict(X_val_seq)
y_val_pred_3 = np.argmax(y_val_pred_prob_3, axis=1)

print("=== Task 1 (3-class) — Validation Metrics (BiLSTM) ===")
print(classification_report(y_val_3, y_val_pred_3, target_names=label_names_3, digits=4))

macro_f1_val_3 = f1_score(y_val_3, y_val_pred_3, average="macro")
acc_val_3 = accuracy_score(y_val_3, y_val_pred_3)
print(f"Macro F1 (val): {macro_f1_val_3:.4f}")
print(f"Accuracy (val): {acc_val_3:.4f}")

# Test
y_test_pred_prob_3 = bilstm_task1.predict(X_test_seq)
y_test_pred_3 = np.argmax(y_test_pred_prob_3, axis=1)

print("\n=== Task 1 (3-class) — Test Metrics (BiLSTM) ===")
print(classification_report(y_test_3, y_test_pred_3, target_names=label_names_3, digits=4))

macro_f1_test_3 = f1_score(y_test_3, y_test_pred_3, average="macro")
acc_test_3 = accuracy_score(y_test_3, y_test_pred_3)
print(f"Macro F1 (test): {macro_f1_test_3:.4f}")
print(f"Accuracy (test): {acc_test_3:.4f}")

print("\n=== SUMMARY FOR REPORT (TASK 1) ===")
print(f"BiLSTM (Task 1, 3-class) — Test Macro F1: {macro_f1_test_3:.4f}, Test Accuracy: {acc_test_3:.4f}")


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step
=== Task 1 (3-class) — Validation Metrics (BiLSTM) ===
              precision    recall  f1-score   support

      normal     0.7112    0.6338    0.6703       781
   offensive     0.4911    0.5529    0.5202       548
  hatespeech     0.7307    0.7504    0.7404       593

    accuracy                         0.6467      1922
   macro avg     0.6443    0.6457    0.6436      1922
weighted avg     0.6545    0.6467    0.6491      1922

Macro F1 (val): 0.6436
Accuracy (val): 0.6467
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step

=== Task 1 (3-class) — Test Metrics (BiLSTM) ===
              precision    recall  f1-score   support

      normal     0.7441    0.6841    0.7129       782
   offensive     0.5158    0.5347    0.5251       548
  hatespeech     0.7080    0.7593    0.7327       594

    accuracy                         0.6648      1924
   macro avg     0.6560    0.6594    0.6569     

In [15]:
# Cell 11: BiLSTM for Task 2 (target group classification, 12 classes)

num_groups = len(target_vocab)  # 12

inputs2 = Input(shape=(MAX_LEN,), name="input_ids_task2")
x2 = Embedding(
    input_dim=vocab_size,
    output_dim=EMBED_DIM,
    weights=[embedding_matrix],
    input_length=MAX_LEN,
    trainable=False,
    name="embedding_task2"
)(inputs2)

x2 = Bidirectional(LSTM(64, return_sequences=True), name="bilstm_task2")(x2)
x2 = GlobalMaxPool1D(name="global_max_pool_task2")(x2)
x2 = Dense(64, activation="relu", name="dense_task2")(x2)
x2 = Dropout(0.5, name="dropout_task2")(x2)
outputs2 = Dense(num_groups, activation="softmax", name="output_task2")(x2)

bilstm_task2 = Model(inputs=inputs2, outputs=outputs2, name="bilstm_task2_group")
bilstm_task2.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

bilstm_task2.summary()




In [16]:
# Cell 12: Train BiLSTM for Task 2

es2 = EarlyStopping(
    monitor="val_loss",
    patience=3,
    mode="min",
    restore_best_weights=True,
    verbose=1,
)

history_task2 = bilstm_task2.fit(
    X_train_seq,
    y_train_group,
    validation_data=(X_val_seq, y_val_group),
    epochs=15,
    batch_size=64,
    callbacks=[es2],
    verbose=1,
)


Epoch 1/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 243ms/step - accuracy: 0.3451 - loss: 2.0866 - val_accuracy: 0.5557 - val_loss: 1.4988
Epoch 2/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 183ms/step - accuracy: 0.5435 - loss: 1.5163 - val_accuracy: 0.6134 - val_loss: 1.2104
Epoch 3/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 154ms/step - accuracy: 0.5920 - loss: 1.2944 - val_accuracy: 0.6441 - val_loss: 1.0912
Epoch 4/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 169ms/step - accuracy: 0.6299 - loss: 1.1654 - val_accuracy: 0.6634 - val_loss: 1.0154
Epoch 5/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 169ms/step - accuracy: 0.6482 - loss: 1.0846 - val_accuracy: 0.6644 - val_loss: 0.9758
Epoch 6/15
[1m241/241[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 154ms/step - accuracy: 0.6661 - loss: 1.0201 - val_accuracy: 0.6759 - val_loss: 0.9562
Epoch 7/15

In [18]:
# Cell 13: Evaluation for Task 2 (target group)

group_names = target_vocab  # ['African', 'Arab', ..., 'Women']

# Validation
y_val_group_prob = bilstm_task2.predict(X_val_seq)
y_val_group_pred = np.argmax(y_val_group_prob, axis=1)

print("=== Task 2 — Validation Metrics (BiLSTM) ===")
print(classification_report(y_val_group, y_val_group_pred, target_names=group_names, digits=4))

macro_f1_val_group = f1_score(y_val_group, y_val_group_pred, average="macro")
micro_f1_val_group = f1_score(y_val_group, y_val_group_pred, average="micro")
acc_val_group = accuracy_score(y_val_group, y_val_group_pred)

print(f"Macro F1 (val): {macro_f1_val_group:.4f}")
print(f"Micro F1 (val): {micro_f1_val_group:.4f}")
print(f"Accuracy (val): {acc_val_group:.4f}")

# Test
y_test_group_prob = bilstm_task2.predict(X_test_seq)
y_test_group_pred = np.argmax(y_test_group_prob, axis=1)

print("\n=== Task 2 — Test Metrics (BiLSTM) ===")
print(classification_report(y_test_group, y_test_group_pred, target_names=group_names, digits=4))

macro_f1_test_group = f1_score(y_test_group, y_test_group_pred, average="macro")
micro_f1_test_group = f1_score(y_test_group, y_test_group_pred, average="micro")
acc_test_group = accuracy_score(y_test_group, y_test_group_pred)

print(f"Macro F1 (test): {macro_f1_test_group:.4f}")
print(f"Micro F1 (test): {micro_f1_test_group:.4f}")
print(f"Accuracy (test): {acc_test_group:.4f}")

print("\n=== SUMMARY FOR REPORT (TASK 2) ===")
print(f"BiLSTM (Task 2, target group) — Test Macro F1: {macro_f1_test_group:.4f}, "
      f"Test Micro F1: {micro_f1_test_group:.4f}, Test Accuracy: {acc_test_group:.4f}")


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step
=== Task 2 — Validation Metrics (BiLSTM) ===
              precision    recall  f1-score   support

     African     0.7804    0.7884    0.7844       293
        Arab     0.6557    0.7018    0.6780        57
       Asian     0.5641    0.8148    0.6667        27
   Caucasian     1.0000    0.0238    0.0465        42
    Hispanic     0.8824    0.7895    0.8333        19
  Homosexual     0.6505    0.8272    0.7283       162
       Islam     0.7778    0.8415    0.8084       183
      Jewish     0.8768    0.7857    0.8288       154
        None     0.6443    0.7324    0.6855       695
       Other     0.5000    0.0177    0.0342       113
     Refugee     0.6957    0.7059    0.7007        68
       Women     0.4757    0.4495    0.4623       109

    accuracy                         0.6899      1922
   macro avg     0.7086    0.6232    0.6047      1922
weighted avg     0.6900    0.6899    0.6640      1922

Macro F1 (val)

In [19]:
# Cell 14: Pack metrics into dict for copy-paste in report

metrics_summary = {
    "task1": {
        "macro_f1_test": float(macro_f1_test_3),
        "accuracy_test": float(acc_test_3),
    },
    "task2": {
        "macro_f1_test": float(macro_f1_test_group),
        "micro_f1_test": float(micro_f1_test_group),
        "accuracy_test": float(acc_test_group),
    }
}

metrics_summary


{'task1': {'macro_f1_test': 0.6568951040120572,
  'accuracy_test': 0.6647609147609148},
 'task2': {'macro_f1_test': 0.5830087605302409,
  'micro_f1_test': 0.6943866943866944,
  'accuracy_test': 0.6943866943866944}}