In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report,hamming_loss,accuracy_score,make_scorer,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC,SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import shap

df = pd.read_csv("data//re_dataset.csv",encoding='latin-1')
df.head(3)

: 

In [None]:
label_cols = [
    "HS_Individual", "HS_Group", "HS_Religion",
    "HS_Race", "HS_Physical", "HS_Gender"
]

In [None]:
alay_df = pd.read_csv("data//new_kamusalay.csv")
alay_dict = dict(zip(alay_df["alay"], alay_df["normal"]))

def normalize_alay(text):
    words = text.split()
    return " ".join([alay_dict.get(w, w) for w in words])

def clean_text(text):
    text = str(text).lower()
    
    text = re.sub(r"\buser\b", " ", text)         # hapus USER anonim
    text = re.sub(r"\brt\b", " ", text)           # hapus retweet marker
    text = text.replace("\n", " ")                # hapus newline
    text = re.sub(r"http\S+|www\S+", " ", text)   # hapus URL
    text = re.sub(r"[^a-zA-Z\s]", " ", text)      # hapus angka/simbol
    text = re.sub(r"\s+", " ", text).strip()      # hapus spasi berlebih
    text = normalize_alay(text)                   # normalisasi pakai kamus CSV
    
    return text


df["Tweet"] = df["Tweet"].apply(clean_text)
df.to_csv("data/clean_dataset.csv",index=False)

In [None]:
df = df.drop(columns=['HS','Abusive','HS_Other','HS_Weak','HS_Moderate','HS_Strong'])

In [None]:
df["num_labels"] = df[label_cols].sum(axis=1)
df[["Tweet", "num_labels"] + label_cols].head()

In [None]:
df[df['num_labels']>1].sample(5)

In [None]:
df.iloc[3680]['Tweet']

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold, MultilabelStratifiedShuffleSplit

X = df['Tweet']
y = df.drop(['Tweet','num_labels'], axis=1)
sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_idx, test_idx in sss.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

In [None]:
label_counts = y_train.sum()
label_props = y_train.mean()

print("\nProporsi (persentase positif) per label:")
print(label_props)

label_counts = y_test.sum()
label_props = y_test.mean()

print("\nProporsi (persentase positif) per label:")
print(label_props)

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(1,1),max_features=20000)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model_scores = {}

In [None]:
def evaluate_model_cv(clf, X, y, cv, model_name="model"): 
    subset_acc_scores, hamming_scores, f1_micro_scores, f1_macro_scores = [], [], [], []

    for train_idx, val_idx in cv.split(X, y):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx] if hasattr(y, "iloc") else y[train_idx]
        
        clf.fit(X_tr, y_tr)
        y_pred = clf.predict(X_val)

        subset_acc_scores.append(accuracy_score(y_val, y_pred))
        hamming_scores.append(hamming_loss(y_val, y_pred))
        f1_micro_scores.append(f1_score(y_val, y_pred, average="micro"))
        f1_macro_scores.append(f1_score(y_val, y_pred, average="macro"))

    subset_accuracy = np.mean(subset_acc_scores)
    hamming_losses = np.mean(hamming_scores)
    f1_micro = np.mean(f1_micro_scores)
    f1_macro = np.mean(f1_macro_scores)

    results = {
        "subset_acc_scores": subset_acc_scores,
        "hamming_scores": hamming_scores,
        "f1_micro_scores": f1_micro_scores,
        "f1_macro_scores": f1_macro_scores,
        "subset_accuracy": subset_accuracy,
        "hamming_loss": hamming_losses,
        "f1_micro": f1_micro,
        "f1_macro": f1_macro
    }

    print(f"\n=== {model_name} ===")
    print("Mean Subset Accuracy:", subset_accuracy)
    print("Mean Hamming Loss:", hamming_losses)
    print("Mean Micro F1:", f1_micro)
    print("Mean Macro F1:", f1_macro)

    return {model_name: results}

In [None]:
mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=42)

log_reg = LogisticRegression(class_weight="balanced", max_iter=1000)
clf1 = OneVsRestClassifier(log_reg)

svm = LinearSVC(class_weight="balanced", max_iter=2000)
clf2 = OneVsRestClassifier(svm)

nb = MultinomialNB()
clf3 = OneVsRestClassifier(nb)
    
model_scores = {}
model_scores.update(evaluate_model_cv(clf1, X_train_tfidf, y_train, mskf, model_name="Log_Reg"))
model_scores.update(evaluate_model_cv(clf2, X_train_tfidf, y_train, mskf, model_name="SVM"))
model_scores.update(evaluate_model_cv(clf3, X_train_tfidf, y_train, mskf, model_name="NB"))

In [None]:
results_df = pd.DataFrame({
    model: {
        "Subset Accuracy": scores["subset_accuracy"],
        "Hamming Loss": scores["hamming_loss"],
        "F1 Micro": scores["f1_micro"],
        "F1 Macro": scores["f1_macro"]
    }
    for model, scores in model_scores.items()
}).T
results_df.round(4)

In [None]:
def training_ml(clf, X_train, X_test, y_train, y_test, use_vectorizer=False):
    if isinstance(X_train, pd.Series):
            X_train = X_train.to_frame()
            X_test = X_test.to_frame()
    if use_vectorizer:
        transformer = ColumnTransformer([
            ('vectorizer', vectorizer, 'Tweet')
        ])
        pipeline = Pipeline([
            ('transformer', transformer),
            ('model', clf)
        ])
    else:
        pipeline = Pipeline([
            ('model', clf)
        ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    print("Classification Report:\n", classification_report(y_test, y_pred,zero_division=0))
    print("Hamming Loss:", hamming_loss(y_test, y_pred))
    print("Subset Accuracy:", accuracy_score(y_test, y_pred),"\n")

    return pipeline,y_pred

In [None]:
def exact_match_ratio(actual,pred):
    actual = np.array(actual)
    pred = np.array(pred)
    return (actual == pred).all(axis=1).mean()

def hamming_loss_(actual,pred):
    actual = np.array(actual)
    pred = np.array(pred)
    return (actual != pred).mean()
    
def micro_f1(actual,pred):
    actual = np.array(actual)
    pred = np.array(pred)
    TP = np.sum((actual == 1) & (pred == 1))
    FN = np.sum((actual == 1) & (pred == 0))
    FP = np.sum((actual == 0) & (pred == 1))
    
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    return 2*precision*recall/(precision+recall)

def macro_f1(actual, pred):
    actual = np.array(actual).T # Transpose biar iterasi per label (bukan per sampel)
    pred = np.array(pred).T
    f1_scores = []

    for i in range(len(actual)):
        TP = np.sum((actual[i] == 1) & (pred[i] == 1))
        FN = np.sum((actual[i] == 1) & (pred[i] == 0))
        FP = np.sum((actual[i] == 0) & (pred[i] == 1))

        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        if (precision + recall) > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0
        f1_scores.append(f1)
        
    return np.mean(f1_scores)

In [179]:
pipeline_logreg,y_pred_logreg = training_ml(clf1, X_train, X_test, y_train, y_test, use_vectorizer=True)
pipeline_svm,y_pred_svm = training_ml(clf2, X_train, X_test, y_train, y_test, use_vectorizer=True)

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.82      0.81      1113
           1       0.86      0.88      0.87      1009
           2       0.65      0.78      0.71       716
           3       0.57      0.74      0.64       397
           4       0.52      0.74      0.61       159
           5       0.56      0.85      0.68       113
           6       0.46      0.82      0.59        65
           7       0.38      0.67      0.48        61
           8       0.69      0.81      0.74       745
           9       0.61      0.76      0.68       677
          10       0.49      0.73      0.58       341
          11       0.60      0.79      0.68        95

   micro avg       0.68      0.80      0.73      5491
   macro avg       0.60      0.78      0.67      5491
weighted avg       0.69      0.80      0.74      5491
 samples avg       0.43      0.45      0.42      5491

Hamming Loss: 0.10133510503669957
Subset Accuracy: 0.53

In [192]:
hamming_loss_(y_test,y_pred_svm),exact_match_ratio(y_test,y_pred_svm),macro_f1(y_test,y_pred_svm),micro_f1(y_test,y_pred_svm)

(np.float64(0.09567198177676538),
 np.float64(0.5592255125284739),
 np.float64(0.6815527657754917),
 np.float64(0.7350622043104957))

In [195]:
n_samples = 5
sample_idx = np.random.choice(len(X_test), n_samples, replace=False)

y_pred= y_pred_svm
for i, idx in enumerate(sample_idx):
    text = X_test.iloc[idx]
    actual = y_test.iloc[idx].values
    pred = y_pred[idx]

    # Buat mapping label
    actual_labels = [label_cols[j] for j, val in enumerate(actual) if val == 1]
    pred_labels = [label_cols[j] for j, val in enumerate(pred) if val == 1]

    print(f"\n=== Sample {i+1} ===")
    print("Tweet:", text)
    print("Actual Labels:", actual_labels)
    print("Predicted Labels:", pred_labels)

IndexError: list index out of range

In [91]:
def predict_new_tweet(texts, clf, label_cols):
    X_new = pd.DataFrame({'Tweet': texts})
    y_pred = clf.predict(X_new)

    for i, text in enumerate(texts):
        pred_labels = [label_cols[j] for j, val in enumerate(y_pred[i]) if val == 1]

        print("="*40)
        print(f"Tweet: {text}")
        print(f"Predicted Labels: {pred_labels if pred_labels else ['None']}")
        print()

new_tweets = [
    "Dasar bodoh banget, bikin malu bangsa",
    "Saya cinta Indonesia",
    "Orang dari ras X memang menyebalkan",
    "Jangan hina agama saya!",
    "Kamu itu kayak babi"
]

predict_new_tweet(new_tweets, pipeline_svm, label_cols)

Tweet: Dasar bodoh banget, bikin malu bangsa
Predicted Labels: ['HS_Individual', 'HS_Group']

Tweet: Saya cinta Indonesia
Predicted Labels: ['None']

Tweet: Orang dari ras X memang menyebalkan
Predicted Labels: ['HS_Individual']

Tweet: Jangan hina agama saya!
Predicted Labels: ['None']

Tweet: Kamu itu kayak babi
Predicted Labels: ['HS_Individual']



In [145]:
import importlib
import kernel_shap
importlib.reload(kernel_shap)

clf_model = pipeline_logreg.named_steps['model']
kalimat = "Orang cacat itu beban masyarakat."

print("Kalimat: ",kalimat)
print(kernel_shap.shap_kernel_instance(kalimat, vectorizer, clf_model,label_names=label_cols))

Kalimat:  Orang cacat itu beban masyarakat.

Kalimat:  Orang cacat itu beban masyarakat.
Fitur aktif (5): ['beban', 'cacat', 'itu', 'masyarakat', 'orang']
Total fitur aktif = 5, total subset = 32
Jumlah subset yang dipakai: 32

Model memiliki 6 label.

=== HS_Individual ===
beban: -0.0015
cacat: 0.0151
itu: 0.0591
masyarakat: -0.1258
orang: 0.0563
Baseline: 0.2819
Prediksi asli HS_Individual: 0.2792

=== HS_Group ===
beban: 0.0043
cacat: -0.0751
itu: -0.0071
masyarakat: 0.0310
orang: 0.0230
Baseline: 0.2166
Prediksi asli HS_Group: 0.1897

=== HS_Religion ===
beban: -0.0099
cacat: -0.0562
itu: 0.0330
masyarakat: 0.0296
orang: 0.0194
Baseline: 0.1182
Prediksi asli HS_Religion: 0.1289

=== HS_Race ===
beban: -0.0016
cacat: -0.0203
itu: 0.0209
masyarakat: 0.0091
orang: 0.0224
Baseline: 0.0592
Prediksi asli HS_Race: 0.0885

=== HS_Physical ===
beban: -0.0073
cacat: 0.5284
itu: 0.0165
masyarakat: -0.0528
orang: 0.1134
Baseline: 0.0714
Prediksi asli HS_Physical: 0.6907

=== HS_Gender ===
beba

In [93]:
# import shap
# import pandas as pd

# def kernel_shap_explainer(clf, vectorizer, sentence, nsamples=300):
#     import shap
#     import numpy as np
#     import matplotlib.pyplot as plt

#     X_instance = vectorizer.transform([sentence])
#     background = np.zeros(X_instance.shape)

#     def predict_fn(X):
#         return clf.predict_proba(X)[:, 1]

#     explainer = shap.KernelExplainer(predict_fn, background)
#     shap_values = explainer.shap_values(X_instance, nsamples=nsamples)

#     active_idx = X_instance.nonzero()[1]
#     feature_names = vectorizer.get_feature_names_out()
#     fitur_aktif = feature_names[active_idx]
#     shap_active = shap_values[0][active_idx]

#     shap.waterfall_plot(
#         shap.Explanation(
#             values=shap_active,
#             base_values=explainer.expected_value,
#             feature_names=fitur_aktif
#         )
#     )


# shap.initjs()
# kernel_shap_explainer(clf1, vectorizer, "Orang ini bodoh banget dan jahat")

In [94]:
import numpy as np

# 1. Data
X = np.arange(20)
y = np.array([0]*8 + [1]*6 + [2]*6)
K = 3

def StratKFold(X,y,K):
    classes = np.unique(y)
    all_class_idx = {}
    
    for i in classes:
        class_idx = np.where(y==i)[0]
        all_class_idx[i] = np.array_split(class_idx,K)

    train_folds = {}
    test_folds = {}
    for i in range(K):
        temp = []
        for c in classes:
            temp.append(all_class_idx[c][i])
        test_folds[i] = np.concatenate(temp)
        train_folds[i] = np.setdiff1d(np.arange(len(y)),test_folds[i])
    return train_folds,test_folds

StratKFold(X,y,K)

({0: array([ 3,  4,  5,  6,  7, 10, 11, 12, 13, 16, 17, 18, 19]),
  1: array([ 0,  1,  2,  6,  7,  8,  9, 12, 13, 14, 15, 18, 19]),
  2: array([ 0,  1,  2,  3,  4,  5,  8,  9, 10, 11, 14, 15, 16, 17])},
 {0: array([ 0,  1,  2,  8,  9, 14, 15]),
  1: array([ 3,  4,  5, 10, 11, 16, 17]),
  2: array([ 6,  7, 12, 13, 18, 19])})

In [95]:
X = np.arange(10).reshape(-1, 1)
y = np.array([
    [1, 0, 0],
    [1, 1, 0],#
    [0, 1, 1],#
    [1, 0, 1],#
    [0, 0, 1],#
    [1, 1, 1],
    [0, 1, 0],
    [1, 0, 0],#
    [0, 0, 1],#
    [1, 1, 0]#
])

def multilabel_train_test_split(X, y, test_size=0.3, seed=42):
    rng = np.random.default_rng(seed)
    n_samples = len(y)
    n_test = int(np.floor(n_samples * test_size))   
    
    # distribusi global label
    label_sum = y.sum(axis=0)
    target_test = np.round(label_sum * test_size).astype(int)
    current_test = np.zeros_like(label_sum)
    
    # inisialisasi list untuk index test
    test_indices = set()

    # buat urutan acak dari seluruh index
    all_indices = np.arange(n_samples)
    rng.shuffle(all_indices)

    for idx in all_indices:
        # stop kalau test set sudah penuh
        if len(test_indices) >= n_test:
            break

        # hitung seberapa “berguna” data ini untuk balancing
        label_vector = y[idx]
        score = 0
        for j, label_value in enumerate(label_vector):
            if label_value == 1 and current_test[j] < target_test[j]:
                score += 1
        
        # semakin banyak label yg belum tercapai, semakin “berguna”
        if score > 0:
            test_indices.add(idx)
            current_test += label_vector

    # sisanya jadi train
    train_indices = np.setdiff1d(np.arange(n_samples), list(test_indices))

    return train_indices, np.array(list(test_indices))

multilabel_train_test_split(X,y)

(array([1, 2, 3, 4, 7, 8, 9]), array([0, 5, 6]))

In [96]:
def MultilabelStratifiedKFold_manual(X, y, K=3, seed=42):
    rng = np.random.default_rng(seed)
    n_samples = len(y)
    n_labels = y.shape[1]
    
    # Hitung distribusi global per label
    label_sum = y.sum(axis=0)
    target_per_fold = label_sum / K
    
    # Inisialisasi penghitung label di setiap fold
    fold_label_counts = np.zeros((K, n_labels), dtype=int)
    
    # Simpan index per fold
    folds = [[] for _ in range(K)]
    
    # Acak urutan data agar tidak bias
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    # Loop semua data
    for idx in indices:
        sample_labels = y[idx]
        # Cari fold yang "paling butuh" data ini
        fold_scores = []
        for f in range(K):
            score = 0
            for j in range(n_labels):
                if sample_labels[j] == 1 and fold_label_counts[f, j] < target_per_fold[j]:
                    score += 1
            fold_scores.append(score)
        
        # Pilih fold dengan score tertinggi (paling kekurangan label data ini)
        best_fold = np.argmax(fold_scores)
        folds[best_fold].append(idx)
        fold_label_counts[best_fold] += sample_labels
    
    # Convert list of folds jadi array index
    fold_indices = [np.array(f) for f in folds]
    
    # Cetak hasil
    for i, test_idx in enumerate(fold_indices):
        train_idx = np.setdiff1d(np.arange(n_samples), test_idx)
        print(f"\n=== Fold {i+1} ===")
        print("Test idx :", test_idx)
        print("Train idx:", train_idx)
        print("Distribusi label test :", y[test_idx].sum(axis=0))
        print("Distribusi label train:", y[train_idx].sum(axis=0))
        print("-"*50)
    
    return fold_indices

MultilabelStratifiedKFold_manual(X, y, K=3)


=== Fold 1 ===
Test idx : [5 6 0 4]
Train idx: [1 2 3 7 8 9]
Distribusi label test : [2 2 2]
Distribusi label train: [4 3 3]
--------------------------------------------------

=== Fold 2 ===
Test idx : [7 3 2]
Train idx: [0 1 4 5 6 8 9]
Distribusi label test : [2 1 2]
Distribusi label train: [4 4 3]
--------------------------------------------------

=== Fold 3 ===
Test idx : [9 1 8]
Train idx: [0 2 3 4 5 6 7]
Distribusi label test : [2 2 1]
Distribusi label train: [4 3 4]
--------------------------------------------------


[array([5, 6, 0, 4]), array([7, 3, 2]), array([9, 1, 8])]