In [3]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
!gunzip cc.id.300.vec.gz

--2025-11-26 00:36:51--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.id.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.51, 3.163.189.14, 3.163.189.108, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.51|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1227018698 (1.1G) [binary/octet-stream]
Saving to: ‘cc.id.300.vec.gz’


2025-11-26 00:36:56 (276 MB/s) - ‘cc.id.300.vec.gz’ saved [1227018698/1227018698]



In [4]:
from gensim.models import KeyedVectors

ft = KeyedVectors.load_word2vec_format("cc.id.300.vec", binary=False)

In [68]:
import pandas as pd
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

df = pd.read_csv("/kaggle/input/datasethatespeechmultilabel/clean_data.csv")
df = df.dropna()
X = df["Tweet"]
y = df.drop(["Tweet", "num_labels"], axis=1)

sss = MultilabelStratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=42
)

for train_idx, test_idx in sss.split(X, y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

print("Proporsi label train:")
print(y_train.mean())

print("\nProporsi label test:")
print(y_test.mean())

Proporsi label train:
HS_Individual    0.271706
HS_Group         0.151181
HS_Religion      0.060358
HS_Race          0.043126
HS_Physical      0.024562
HS_Gender        0.023324
dtype: float64

Proporsi label test:
HS_Individual    0.271896
HS_Group         0.151181
HS_Religion      0.060548
HS_Race          0.043031
HS_Physical      0.024752
HS_Gender        0.023229
dtype: float64


In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 35000
tokenizer = Tokenizer(num_words=max_words, oov_token="<unk>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq  = tokenizer.texts_to_sequences(X_test)

max_len = 50
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad  = pad_sequences(X_test_seq, maxlen=max_len)

num_labels = y_train.shape[1]

In [70]:
import numpy as np

embedding_dim = 300
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, idx in word_index.items():
    if idx >= max_words:
        continue
    if word in ft:
        embedding_matrix[idx] = ft[word]
    else:
        embedding_matrix[idx] = np.zeros(embedding_dim)

In [71]:
import numpy as np
from tensorflow.keras.callbacks import Callback

def exact_match_ratio(actual, pred):
    actual = np.array(actual)
    pred   = np.array(pred)
    return np.mean(np.all(actual == pred, axis=1))

def hamming_loss_(actual, pred):
    actual = np.array(actual)
    pred   = np.array(pred)
    N, L   = actual.shape

    diff = actual != pred
    return diff.sum() / (N * L)

def micro_f1(actual, pred):
    actual = np.array(actual)
    pred   = np.array(pred)

    TP = np.sum((actual == 1) & (pred == 1))
    FP = np.sum((actual == 0) & (pred == 1))
    FN = np.sum((actual == 1) & (pred == 0))

    if TP == 0:
        return 0.0

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall    = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    if precision + recall == 0:
        return 0.0

    return 2 * precision * recall / (precision + recall)

def macro_f1(actual, pred):
    actual = np.array(actual)
    pred   = np.array(pred)

    L = actual.shape[1]
    f1_scores = []

    for j in range(L):
        y_true = actual[:, j]
        y_pred = pred[:, j]

        TP = np.sum((y_true == 1) & (y_pred == 1))
        FP = np.sum((y_true == 0) & (y_pred == 1))
        FN = np.sum((y_true == 1) & (y_pred == 0))

        precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
        recall    = TP / (TP + FN) if (TP + FN) > 0 else 0.0

        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        f1_scores.append(f1)

    return np.mean(f1_scores)

class MultiLabelMetrics(Callback):
    def __init__(self, X_val, y_val):
        super().__init__()
        self.X_val = X_val
        self.y_val = y_val

    def on_epoch_end(self, epoch, logs=None):
        y_prob = self.model.predict(self.X_val, verbose=0)
        y_pred = (y_prob > 0.5).astype(int)

        emr = exact_match_ratio(self.y_val, y_pred)
        ham = hamming_loss_(self.y_val, y_pred)
        mic = micro_f1(self.y_val, y_pred)
        mac = macro_f1(self.y_val, y_pred)

        print(f" — val_exact_match: {emr:.4f} — val_hamming: {ham:.4f} — val_micro_f1: {mic:.4f} — val_macro_f1: {mac:.4f}")

        logs["val_exact_match"] = emr
        logs["val_hamming_loss"] = ham
        logs["val_micro_f1"] = mic
        logs["val_macro_f1"] = mac

In [72]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model = Sequential()

model.add(
    Embedding(
        input_dim=max_words,
        output_dim=embedding_dim,
        weights=[embedding_matrix],
        input_length=max_len,
        trainable=False
    )
)

model.add(Dropout(0.25))

model.add(
    Bidirectional(
        LSTM(256, return_sequences=True, dropout=0.3, recurrent_dropout=0.2)
    )
)

model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))

model.add(Dense(num_labels, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam"
)



In [73]:
for train_idx, test_idx in sss.split(X_train_pad, y_train):
    X_train_final = X_train_pad[train_idx] 
    X_val_final   = X_train_pad[test_idx]   

    y_train_final = y_train.iloc[train_idx]
    y_val_final   = y_train.iloc[test_idx] 


metrics_callback = MultiLabelMetrics(X_val_final, y_val_final)

In [74]:
history = model.fit(
    X_train_final,
    y_train_final,
    validation_data=(X_val_final, y_val_final),
    epochs=10,
    batch_size=64,
    callbacks=[metrics_callback],
    verbose=1
)

Epoch 1/10


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 6), output.shape=(None, 50, 6)