In [76]:
import re
import string
import emoji
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import torch
from torch import nn
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder

In [77]:
data_train = pd.read_csv(r"E:\Lomba\ANFORCOM DDSC COMPETITION 2025 - PENYISIHAN\penyisihan-ddsc25\train.csv")
data_test = pd.read_csv(r"E:\Lomba\ANFORCOM DDSC COMPETITION 2025 - PENYISIHAN\penyisihan-ddsc25\test.csv")
sample_sub = pd.read_csv(r"E:\Lomba\ANFORCOM DDSC COMPETITION 2025 - PENYISIHAN\penyisihan-ddsc25\sample_submission.csv")

In [78]:
data_train

Unnamed: 0,text,label
0,@hyalfay @NatharElyas @BosPurwa @NatharElyas @...,ideologi
1,@sienchao @Sherly0ctaviany @jokowi Halo @Sienc...,ideologi
2,@giovannikurnwn @BudiBukanIntel Nama Budi Berl...,ideologi
3,@LeotardosMy @easyanying Kita akui keberhasila...,ideologi
4,@LuckyGaben @susipudjiastuti @prabowo Indonesi...,ideologi
...,...,...
4995,Memang kristen zionis itu rata-rata IslamoPhob...,harmoni
4996,Masjid Lafadz Allah Dan wanita yang lekat deng...,harmoni
4997,@_MbakSri_ Mulut yang selalu jual toleransi ma...,harmoni
4998,@itsmesoya_ kalo masalah fisik dan lisan masi ...,pekerjaan


In [79]:
slang_dict = {
    "gak": "tidak",
    "ga": "tidak",
    "nggak": "tidak",
    "aja": "saja",
    "tp": "tapi",
    "yg": "yang",
    "dgn": "dengan",
    "dlm": "dalam",
    "utk": "untuk",
    "jg": "juga",
    "tdk": "tidak",
    "sdh": "sudah",
    "blm": "belum"
}

def normalize_slang(text):
    tokens = text.split()
    return " ".join([slang_dict.get(t, t) for t in tokens])

def clean_text(text):
    # lowercase
    text = text.lower()
    # hapus url
    text = re.sub(r"http\S+|www.\S+", "", text)
    # hapus mention
    text = re.sub(r"@\w+", "", text)
    # ubah hashtag jadi kata
    text = re.sub(r"#(\w+)", r"\1", text)
    # ubah emoji jadi kata
    text = emoji.demojize(text, delimiters=(" ", " "))
    # hapus tanda baca
    text = text.translate(str.maketrans("", "", string.punctuation))
    # normalisasi slang
    text = normalize_slang(text)
    # hapus spasi ganda
    text = re.sub(r"\s+", " ", text).strip()
    return text

data_train['clean_text'] = data_train['text'].apply(clean_text)
data_test['clean_text'] = data_test['text'].apply(clean_text)

print(data_train[['text','clean_text']].head())

le = LabelEncoder()
data_train['label'] = le.fit_transform(data_train['label'])

                                                text  \
0  @hyalfay @NatharElyas @BosPurwa @NatharElyas @...   
1  @sienchao @Sherly0ctaviany @jokowi Halo @Sienc...   
2  @giovannikurnwn @BudiBukanIntel Nama Budi Berl...   
3  @LeotardosMy @easyanying Kita akui keberhasila...   
4  @LuckyGaben @susipudjiastuti @prabowo Indonesi...   

                                          clean_text  
0  prabowo menang di 2024 karena strategi cerdas ...  
1  halo mungkin terus mengangkat isu negatif joko...  
2  nama budi berlari mungkin mencerminkan sikap k...  
3  kita akui keberhasilannya kita akui juga kekur...  
4  indonesia masih bisa menuju kesuksesan tapi ru...  


In [80]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_train['clean_text'].values,
    data_train['label'].values,
    test_size=0.2,
    stratify=data_train['label'].values,
    random_state=42
)

In [81]:
vectorizer = TfidfVectorizer(max_features=20000,ngram_range=(1,2))
x_train = vectorizer.fit_transform(train_texts)
x_val = vectorizer.transform(val_texts)
x_test = vectorizer.transform(data_test['clean_text'])

lr = LogisticRegression(max_iter=300,class_weight='balanced')
lr.fit(x_train,train_labels)
val_preds_lr = lr.predict(x_val)

print(balanced_accuracy_score(val_labels,val_preds_lr))

0.5237369629402601


In [82]:
model_name = "indolem/indobertweet-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
max_len = 128

def tokenize(batch):
    return tokenizer(
        batch,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_tensors='pt'
    )

train_encodings = tokenize(list(train_texts))
val_encodings = tokenize(list(val_texts))
test_encodings = tokenize(list(data_test['clean_text']))



In [83]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.encodings["input_ids"])
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item
    
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)
test_dataset = Dataset(test_encodings)

In [84]:
data_train

Unnamed: 0,text,label,clean_text
0,@hyalfay @NatharElyas @BosPurwa @NatharElyas @...,2,prabowo menang di 2024 karena strategi cerdas ...
1,@sienchao @Sherly0ctaviany @jokowi Halo @Sienc...,2,halo mungkin terus mengangkat isu negatif joko...
2,@giovannikurnwn @BudiBukanIntel Nama Budi Berl...,2,nama budi berlari mungkin mencerminkan sikap k...
3,@LeotardosMy @easyanying Kita akui keberhasila...,2,kita akui keberhasilannya kita akui juga kekur...
4,@LuckyGaben @susipudjiastuti @prabowo Indonesi...,2,indonesia masih bisa menuju kesuksesan tapi ru...
...,...,...,...
4995,Memang kristen zionis itu rata-rata IslamoPhob...,0,memang kristen zionis itu ratarata islamophobi...
4996,Masjid Lafadz Allah Dan wanita yang lekat deng...,0,masjid lafadz allah dan wanita yang lekat deng...
4997,@_MbakSri_ Mulut yang selalu jual toleransi ma...,0,mulut yang selalu jual toleransi malah dia sen...
4998,@itsmesoya_ kalo masalah fisik dan lisan masi ...,3,kalo masalah fisik dan lisan masi bisa di tole...


In [85]:
from sklearn.preprocessing import LabelEncoder

class WeightedLossTrainer(Trainer):
    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

le = LabelEncoder()
train_labels = le.fit_transform(data_train["label"])  

# sekarang train_labels sudah berupa integer array
class_counts = np.bincount(train_labels)
class_weights = torch.tensor(len(train_labels) / (len(class_counts) * class_counts),
                             dtype=torch.float)

In [86]:
num_labels = len(np.unique(train_labels))

id2label = {int(i): str(l) for i, l in enumerate(le.classes_)}
label2id = {str(l): int(i) for i, l in enumerate(le.classes_)}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [87]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="balanced_accuracy",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=50,
)

In [88]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    bal_acc = balanced_accuracy_score(labels, preds)
    return {"balanced_accuracy": bal_acc}

In [89]:
trainer = WeightedLossTrainer(
    class_weights=class_weights,
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  super().__init__(*args, **kwargs)


In [90]:
trainer.train()




Epoch,Training Loss,Validation Loss,Balanced Accuracy
1,1.3886,1.409698,0.522399
2,1.1077,1.335938,0.561942
3,0.8331,1.369681,0.563973
4,0.7739,1.365982,0.566197




TrainOutput(global_step=1000, training_loss=1.1222579040527343, metrics={'train_runtime': 6784.1745, 'train_samples_per_second': 2.358, 'train_steps_per_second': 0.147, 'total_flos': 1052500918272000.0, 'train_loss': 1.1222579040527343, 'epoch': 4.0})

In [98]:
preds = trainer.predict(test_dataset).predictions
final_preds = np.argmax(preds, axis=-1)



In [99]:
preds_lr_test = lr.predict(x_test)
final_preds_ens = []
for i in range(len(final_preds)):
    if final_preds[i] == preds_lr_test[i]:
        final_preds_ens.append(final_preds[i])
    else:
        # fallback to BERT (lebih kuat)
        final_preds_ens.append(final_preds[i])


In [100]:
submission_ens = sample_sub.copy()
submission_ens["label"] = le.inverse_transform(final_preds_ens)
submission_ens.to_csv("submission_12.csv", index=False)

In [101]:
submission_ens = sample_sub.copy()
submission_ens["label"] = le.inverse_transform(final_preds_ens)
submission_ens.to_csv("submission_12.csv", index=False)

In [109]:
encode_baru = le.fit_transform(data_train['label'])
encode_baru
data_train['label']

0       2
1       2
2       2
3       2
4       2
       ..
4995    0
4996    0
4997    0
4998    3
4999    0
Name: label, Length: 5000, dtype: int64

In [112]:
data_train_2 = pd.read_csv(r"E:\Lomba\ANFORCOM DDSC COMPETITION 2025 - PENYISIHAN\penyisihan-ddsc25\train.csv")
data_train_2

Unnamed: 0,text,label
0,@hyalfay @NatharElyas @BosPurwa @NatharElyas @...,ideologi
1,@sienchao @Sherly0ctaviany @jokowi Halo @Sienc...,ideologi
2,@giovannikurnwn @BudiBukanIntel Nama Budi Berl...,ideologi
3,@LeotardosMy @easyanying Kita akui keberhasila...,ideologi
4,@LuckyGaben @susipudjiastuti @prabowo Indonesi...,ideologi
...,...,...
4995,Memang kristen zionis itu rata-rata IslamoPhob...,harmoni
4996,Masjid Lafadz Allah Dan wanita yang lekat deng...,harmoni
4997,@_MbakSri_ Mulut yang selalu jual toleransi ma...,harmoni
4998,@itsmesoya_ kalo masalah fisik dan lisan masi ...,pekerjaan


In [None]:
le = LabelEncoder()
data_train_2["label_encoded"] = le.fit_transform(data_train_2["label"])


label_mapping = dict(zip(le.transform(le.classes_), le.classes_))
print("Mapping:", label_mapping)

final_preds_labels = [label_mapping[p] for p in final_preds]
final_preds_ens_labels = [label_mapping[p] for p in final_preds_ens]

submission_ens = sample_sub.copy()
submission_ens["label"] = final_preds_ens_labels
submission_ens.to_csv("submission_12.csv", index=False)

Mapping: {np.int64(0): 'harmoni', np.int64(1): 'hilirisasi', np.int64(2): 'ideologi', np.int64(3): 'pekerjaan', np.int64(4): 'pemerataan', np.int64(5): 'pertahanan', np.int64(6): 'reformasi', np.int64(7): 'sdm'}
