In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, hamming_loss,classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
import re
import time
pd.set_option('display.max_colwidth', 120)

In [8]:
# Label columns that stay in the frame after the drop below.
label_cols = [
    "HS_Individual",
    "HS_Group",
    "HS_Religion",
    "HS_Race",
    "HS_Physical",
    "HS_Gender",
]

# Read the raw dataset (decoded as latin-1) and immediately discard the
# columns that are not used further down in this notebook.
df = (
    pd.read_csv(
        "/kaggle/input/datasethatespeechmultilabel/re_dataset.csv",
        encoding='latin-1',
    )
    .drop(columns=['HS', 'Abusive', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong'])
)

def normalize_alay(text, mapping=None):
    """Replace each whitespace-separated token of ``text`` via a lookup table.

    Parameters
    ----------
    text : str
        Text to normalise; it is split on whitespace.
    mapping : dict, optional
        Token -> replacement table. When omitted, the module-level
        ``alay_dict`` is used, so existing one-argument calls behave exactly
        as before.

    Returns
    -------
    str
        The tokens joined by single spaces. Tokens that are not keys of the
        table are kept unchanged.
    """
    if mapping is None:
        # Resolved at call time, so alay_dict only has to exist before the
        # first call, not before this function is defined.
        mapping = alay_dict
    return " ".join(mapping.get(token, token) for token in text.split())

def clean_text(text):
    """Lower-case and scrub one tweet, then normalise its tokens.

    Steps, in order: lower-case, replace newlines, remove URLs, remove the
    ``user`` and ``rt`` marker tokens, replace every non-letter character
    with a space, collapse whitespace, and finally pass the result through
    ``normalize_alay``.

    URLs are removed *before* the marker tokens. Doing it the other way
    round lets ``\\buser\\b`` / ``\\brt\\b`` match inside a URL (for example
    ``www.user.id``), which splits the URL so that only part of it is
    removed and fragments such as ``id`` leak into the cleaned text.

    Parameters
    ----------
    text : object
        The raw tweet; it is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned, normalised text.
    """
    text = str(text).lower()

    text = text.replace("\n", " ")                # drop newlines
    text = re.sub(r"http\S+|www\S+", " ", text)   # drop URLs (must run before token removal)
    text = re.sub(r"\buser\b", " ", text)         # drop the anonymised USER marker
    text = re.sub(r"\brt\b", " ", text)           # drop the retweet marker
    text = re.sub(r"[^a-zA-Z\s]", " ", text)      # drop digits / symbols
    text = re.sub(r"\s+", " ", text).strip()      # collapse repeated whitespace
    text = normalize_alay(text)                   # normalise tokens with the CSV dictionary

    return text

# Lookup table built from the "alay" -> "normal" columns of the dictionary CSV;
# normalize_alay reads it at call time.
alay_df = pd.read_csv("/kaggle/input/datasethatespeechmultilabel/new_kamusalay.csv")
alay_dict = dict(zip(alay_df["alay"], alay_df["normal"]))

# Clean every tweet and print the first five results as a quick sanity check.
df["Tweet"] = df["Tweet"].apply(clean_text)
for text in df['Tweet'].head(5):
    print(text)
    print("")

# Per-row sum of the label columns (kept in the saved CSV, never used as a target).
df["num_labels"] = df[label_cols].sum(axis=1)

df.to_csv("clean_data.csv",index=False)

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

X = df['Tweet']
# Select the targets explicitly instead of "everything except Tweet/num_labels":
# with selection by exclusion, any extra column in the CSV would silently
# become a label.
y = df[label_cols]
sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_idx, test_idx = next(sss.split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Column means of the label frames, printed for train and test so the two
# splits can be compared.
label_props = y_train.mean()

print("\nProporsi (persentase positif) data train per label:")
print(label_props)

label_props = y_test.mean()

print("\nProporsi (persentase positif) data test per label:")
print(label_props)
import numpy as np

def exact_match_ratio(actual, pred):
    """Return the fraction of rows where every label is predicted correctly.

    Both arguments are converted to NumPy arrays and compared element-wise;
    a row counts only if all of its entries agree.
    """
    agreement = np.array(actual) == np.array(pred)
    row_is_exact = np.all(agreement, axis=1)
    return row_is_exact.mean()

def hamming_loss_(actual, pred):
    """Return the share of individual label cells that are predicted wrongly.

    The inputs are converted to NumPy arrays; ``actual`` must be 2-D because
    its shape is unpacked into (rows, labels) to form the denominator.
    """
    truth, guess = np.array(actual), np.array(pred)
    n_rows, n_labels = truth.shape

    wrong_cells = (truth != guess).sum()
    return wrong_cells / (n_rows * n_labels)

def micro_f1(actual, pred):
    """Return the F1 score computed from counts pooled over every label cell.

    True positives, false positives and false negatives are counted across
    the whole matrix at once. The result is 0.0 when there is no true
    positive.
    """
    truth = np.array(actual)
    guess = np.array(pred)

    is_pos_true = truth == 1
    is_pos_pred = guess == 1

    tp = np.sum(is_pos_true & is_pos_pred)
    fp = np.sum((truth == 0) & is_pos_pred)
    fn = np.sum(is_pos_true & (guess == 0))

    if tp == 0:
        return 0.0

    # From here on tp >= 1, so both denominators and precision + recall are
    # strictly positive and need no further zero guards.
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

def macro_f1(actual, pred):
    """Return the unweighted mean of the per-label (per-column) F1 scores.

    A label's F1 is 0.0 whenever its precision and recall are both zero,
    which includes labels with no positive truth and no positive prediction.
    """
    truth = np.array(actual)
    guess = np.array(pred)

    def _column_f1(col):
        # F1 of a single label column, using the same counting rules as the
        # pooled version: cells are positive when == 1, negative when == 0.
        t, p = truth[:, col], guess[:, col]

        tp = np.sum((t == 1) & (p == 1))
        fp = np.sum((t == 0) & (p == 1))
        fn = np.sum((t == 1) & (p == 0))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0

        if precision + recall > 0:
            return 2 * precision * recall / (precision + recall)
        return 0.0

    return np.mean([_column_f1(col) for col in range(truth.shape[1])])

def training_ml(clf, X_train, X_test, y_train, y_test, use_vectorizer=False, tfidf=None):
    """Fit ``clf`` on the training split, print a test-set report, return predictions.

    Parameters
    ----------
    clf : estimator
        Object with ``fit(X, y)`` and ``predict(X)``. If it exposes an
        ``estimator`` attribute (as a meta-estimator wrapper does), the
        wrapped estimator's class name is printed; otherwise the class name
        of ``clf`` itself is printed.
    X_train, X_test : pandas.Series, pandas.DataFrame or array-like
        Features. With ``use_vectorizer=True`` these hold raw text: a
        DataFrame must carry it in a ``'Tweet'`` column, anything else is
        passed to the vectorizer as is.
    y_train, y_test : array-like
        Targets for the two splits.
    use_vectorizer : bool, default False
        When True, features are produced with an already fitted vectorizer
        (``transform`` only; nothing is refitted here). When False, pandas
        inputs are converted to their ``.values`` (a Series becomes a
        one-column 2-D array) and other inputs are used unchanged.
    tfidf : fitted vectorizer, optional
        Vectorizer to use when ``use_vectorizer=True``. Defaults to the
        module-level ``vectorizer``, which keeps existing calls working.
        Ignored when ``use_vectorizer`` is False.

    Returns
    -------
    tuple
        ``(clf, y_pred)``: the fitted estimator and its test-set predictions.
    """

    def _texts(X):
        # Only a DataFrame needs its text column picked out.
        return X['Tweet'] if isinstance(X, pd.DataFrame) else X

    def _matrix(X):
        # Each split is converted on its own, so mixed Series/DataFrame
        # inputs no longer break.
        if isinstance(X, pd.Series):
            X = X.to_frame()
        return X.values if isinstance(X, pd.DataFrame) else X

    if use_vectorizer:
        # Use the TF-IDF that is ALREADY fitted; never refit it here.
        fitted_tfidf = tfidf if tfidf is not None else vectorizer
        X_train_vec = fitted_tfidf.transform(_texts(X_train))
        X_test_vec  = fitted_tfidf.transform(_texts(X_test))
    else:
        X_train_vec = _matrix(X_train)
        X_test_vec  = _matrix(X_test)

    # Fall back to clf itself so estimators without an `estimator` attribute
    # do not raise AttributeError.
    clf_name = getattr(clf, "estimator", clf).__class__.__name__
    print("Model:", clf_name)
    print("Training dimulai...")

    start = time.time()
    clf.fit(X_train_vec, y_train)
    end = time.time()

    print("Training selesai.")
    print("Durasi:", round(end-start, 3), "detik")

    y_pred = clf.predict(X_test_vec)

    print("Classification Report:\n",
          classification_report(y_test, y_pred, zero_division=0))
    print("Hamming Loss:", hamming_loss(y_test, y_pred))
    # Left disabled: accuracy_score is not among the notebook's visible imports.
    # print("Subset Accuracy:", accuracy_score(y_test, y_pred), "\n")

    return clf, y_pred

# Unigram TF-IDF capped at 20000 features, fitted on the training tweets only.
vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,1))
vectorizer.fit(X_train)

# Vectorised copies of both splits, produced with the fitted vectorizer.
X_train_vec, X_test_vec = (
    vectorizer.transform(split) for split in (X_train, X_test)
)

# Two baselines with balanced class weights, each wrapped one-vs-rest.
log_reg = LogisticRegression(
    class_weight="balanced",
    max_iter=1000,
    random_state=42,
)
svm = LinearSVC(
    class_weight="balanced",
    max_iter=2000,
    random_state=42,
)
clf1 = OneVsRestClassifier(log_reg)
clf2 = OneVsRestClassifier(svm)

# training_ml receives the raw text and applies the fitted vectorizer itself.
pipeline_logreg, y_pred_logreg = training_ml(
    clf1, X_train, X_test, y_train, y_test, use_vectorizer=True
)
pipeline_svm, y_pred_svm = training_ml(
    clf2, X_train, X_test, y_train, y_test, use_vectorizer=True
)

di saat semua cowok berusaha melacak perhatian gue kamu lantas remehkan perhatian yang gue kasih khusus ke kamu basic kamu cowok bego

siapa yang telat memberi tau kamu edan sarap gue bergaul dengan cigax jifla calis sama siapa itu licew juga

kadang aku berpikir kenapa aku tetap percaya pada tuhan padahal aku selalu jatuh berkali kali kadang aku merasa tuhan itu meninggalkan aku sendirian ketika orang tuaku berencana berpisah ketika kakakku lebih memilih jadi kristen ketika aku anak ter

aku itu aku dan ku tau matamu sipit tapi dilihat dari mana itu aku

kaum cebong kafir sudah kelihatan dongoknya dari awal tambah dungu lagi haha


Proporsi (persentase positif) data train per label:
HS_Individual    0.271476
HS_Group         0.150831
HS_Religion      0.060180
HS_Race          0.043000
HS_Physical      0.024490
HS_Gender        0.023256
dtype: float64

Proporsi (persentase positif) data test per label:
HS_Individual    0.271450
HS_Group         0.150721
HS_Religion      0.060364
HS_Rac

In [6]:
# import optuna

# def objective_logistic(trial):
#     # Hyperparameter space
#     C = trial.suggest_float("C", 1e-4, 10, log=True)
#     penalty = trial.suggest_categorical("penalty", ["l2"])
#     solver = trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    
#     clf = OneVsRestClassifier(
#         LogisticRegression(
#             C=C,
#             penalty=penalty,
#             solver=solver,
#             class_weight="balanced",
#             max_iter=2000
#         )
#     )

#     clf.fit(X_train_vec, y_train)
#     pred = clf.predict(X_test_vec)

#     score = micro_f1(y_test, pred)
#     return score

# study_lr = optuna.create_study(direction="maximize")
# study_lr.optimize(objective_logistic, n_trials=500)

# print("Best LR Params:", study_lr.best_params)
# print("Best LR Score:", study_lr.best_value)



In [7]:
# from sklearn.svm import LinearSVC

# def objective_svm(trial):
#     C = trial.suggest_float("C", 1e-4, 10, log=True)
#     tol = trial.suggest_float("tol", 1e-5, 1e-1, log=True)
#     loss = trial.suggest_categorical("loss", ["hinge", "squared_hinge"])

#     clf = OneVsRestClassifier(
#         LinearSVC(
#             C=C,
#             tol=tol,
#             loss=loss,
#             class_weight="balanced",
#             max_iter=3000
#         )
#     )

#     clf.fit(X_train_vec, y_train)
#     pred = clf.predict(X_test_vec)

#     score = micro_f1(y_test, pred)
#     return score

# study_svm = optuna.create_study(direction="maximize")
# study_svm.optimize(objective_svm, n_trials=300)

# print("Best SVM Params:", study_svm.best_params)
# print("Best SVM Score:", study_svm.best_value)

In [139]:
import datetime
import pickle

# study_lr / study_svm are created by the Optuna tuning cells, which are
# commented out in this notebook. On a fresh kernel they do not exist, so fail
# early with an actionable message instead of an opaque NameError further down.
# NOTE(review): the commented-out objectives score each trial on
# X_test_vec / y_test, so the "best" parameters are tuned on the test split and
# the test metrics printed below are optimistic - consider tuning on a
# validation split or with cross-validation.
_missing_studies = [name for name in ("study_lr", "study_svm") if name not in globals()]
if _missing_studies:
    raise NameError(
        ", ".join(_missing_studies)
        + " belum didefinisikan: jalankan sel tuning Optuna terlebih dahulu."
    )


def _dump_pickle(obj, filename):
    """Serialise ``obj`` to ``filename`` with pickle."""
    with open(filename, "wb") as f:
        pickle.dump(obj, f)


# --- Logistic Regression with the tuned hyper-parameters ---------------------
best_lr = OneVsRestClassifier(
    LogisticRegression(
        C=study_lr.best_params["C"],
        solver=study_lr.best_params["solver"],
        class_weight="balanced",
        max_iter=2000,
        random_state=42,  # same seed as the baseline models, for repeatable runs
    )
)

best_lr, pred_lr = training_ml(
    best_lr,
    X_train,
    X_test,
    y_train,
    y_test,
    use_vectorizer=True
)

# One timestamp is shared by every artefact written by this cell.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
lr_filename = f"best_logreg_{timestamp}.pkl"
_dump_pickle(best_lr, lr_filename)

print("Model Logistic Regression disimpan ke:", lr_filename)

# --- LinearSVC with the tuned hyper-parameters --------------------------------
best_svm = OneVsRestClassifier(
    LinearSVC(
        C=study_svm.best_params["C"],
        tol=study_svm.best_params["tol"],
        loss=study_svm.best_params["loss"],
        class_weight="balanced",
        max_iter=3000,
        random_state=42,  # same seed as the baseline models, for repeatable runs
    )
)

best_svm, pred_svm = training_ml(
    best_svm,
    X_train,
    X_test,
    y_train,
    y_test,
    use_vectorizer=True
)

svm_filename = f"best_svm_{timestamp}.pkl"
_dump_pickle(best_svm, svm_filename)

print("Model LinearSVC disimpan ke:", svm_filename)

# The classifiers were trained on TF-IDF features, so the pickled models cannot
# score raw text without the fitted vectorizer; save it next to them.
vectorizer_filename = f"tfidf_vectorizer_{timestamp}.pkl"
_dump_pickle(vectorizer, vectorizer_filename)

print("Vectorizer TF-IDF disimpan ke:", vectorizer_filename)

Model: LogisticRegression
Training dimulai...
Training selesai.
Durasi: 0.569 detik
Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.76      0.71       715
           1       0.57      0.68      0.62       397
           2       0.53      0.73      0.62       159
           3       0.58      0.86      0.70       113
           4       0.56      0.78      0.65        65
           5       0.49      0.66      0.56        61

   micro avg       0.61      0.74      0.67      1510
   macro avg       0.57      0.75      0.64      1510
weighted avg       0.61      0.74      0.67      1510
 samples avg       0.29      0.31      0.29      1510

Hamming Loss: 0.07086813464945584
Subset Accuracy: 0.6761579347000759 

Model Logistic Regression disimpan ke: best_logreg_20251126_001639.pkl
Model: LinearSVC
Training dimulai...
Training selesai.
Durasi: 0.241 detik
Classification Report:
               precision    recall  f1-score   support