In [1]:
from datasets import load_dataset
# Fetch the tweet dataset from the Hugging Face Hub; downloads are kept under
# the local "cache/" directory.
dataset = load_dataset(
    "tianharjuno/twitter-parse",
    cache_dir="cache/",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from datasets import ClassLabel
# The "source_stage_1" split of the loaded dataset.
stage_1_source = dataset["source_stage_1"]

# Build the label vocabulary from the distinct values of the "relevant" column.
# sorted() makes the name -> index mapping reproducible: iterating a bare set
# of strings can yield a different order on every kernel restart (hash
# randomisation), which would silently relabel the per-class metrics.
# NOTE(review): confirm this order matches the classifier's own id2label mapping.
class_labels = ClassLabel(names=sorted(set(stage_1_source["relevant"])))

In [3]:
import re, unicodedata, jaconv

# Pre-compiled patterns used by cleantext().
_URL = re.compile(r"https?://\S+")
_MENTION = re.compile(r"@\w+")
_WS = re.compile(r"\s+")
# Everything from the first "kutipan" (case-insensitive) to the end of the
# text, newlines included.
_KUTI_CUT = re.compile(r"(?i)kutipan.*$", re.DOTALL)
# Leading retweet marker. Whitespace in front of "rt" is tolerated because the
# text is only stripped at the very end of cleantext().
_RT_PREFIX = re.compile(r"^\s*rt\s+", re.IGNORECASE)
# A run of four digits glued directly to a following ASCII letter ("2025ini").
_DIGITS_THEN_LETTER = re.compile(r"(\b\d{4})(?=[a-zA-Z])")


def cleantext(row: dict) -> dict:
    """Normalise the ``"content"`` field of one dataset row.

    Steps, in order:

    1. A missing (``None``) content is treated as an empty string.
    2. Unicode NFKC normalisation, then ``jaconv.z2h`` for digits and ASCII
       (kana left untouched).
    3. The literal substrings "tanya grok" and "grokproductivitypasang" are
       replaced by a space (case-sensitive).
    4. Literal two-character backslash-n / backslash-r sequences become spaces.
    5. URLs become " <url> "; the substring "ini tidak tersedia" is removed.
    6. ``@mentions`` become ``@USER``.
    7. A leading "rt" marker (any case) is dropped.
    8. A space is inserted between four digits and a directly following letter.
    9. Everything from the first "kutipan" (any case) onwards is cut.
    10. Whitespace runs are collapsed to one space and the result is stripped.

    Args:
        row: Mapping with a ``"content"`` entry holding the raw text.

    Returns:
        The same ``row`` object, with ``row["content"]`` overwritten in place.
    """
    # `or ""` keeps rows with null content from crashing unicodedata.normalize.
    text = row["content"] or ""
    text = unicodedata.normalize("NFKC", text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")
    # These are escaped sequences present literally in the text; real newlines
    # are handled by the final whitespace collapse.
    text = text.replace("\\n", " ").replace("\\r", " ")
    text = _URL.sub(" <url> ", text)
    text = text.replace("ini tidak tersedia", " ")
    text = _MENTION.sub("@USER", text)
    text = _RT_PREFIX.sub("", text)
    text = _DIGITS_THEN_LETTER.sub(r"\1 ", text)
    text = _KUTI_CUT.sub("", text)
    text = _WS.sub(" ", text).strip()
    row["content"] = text
    return row


In [4]:
# Run cleantext() over every row using 10 worker processes. The cleaned
# dataset is bound back to the same name, replacing the raw one.
stage_1_source = stage_1_source.map(
    cleantext,
    num_proc=10,
)

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
# One constant for the checkpoint so the model and tokenizer cannot drift apart.
MODEL_ID = "tianharjuno/ruu-tni-relevancy-classification-p1"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, cache_dir="cache/")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, cache_dir="cache/")

# Prefer Apple-Silicon MPS (the original target), then CUDA, then CPU, so that
# model.to(device) does not fail on machines without an MPS backend.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)
def tokenize(batch):
    """Tokenize the ``"content"`` texts of a batch.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded rows have the same length.

    Args:
        batch: Mapping whose ``"content"`` entry holds the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    texts = batch["content"]
    encoded = tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )
    return encoded


In [6]:
# Tokenize the cleaned texts in batches of 128 rows across 10 worker processes.
tokenized_source = stage_1_source.map(
    tokenize,
    batched=True,
    batch_size=128,
    num_proc=10,
)

In [7]:
from transformers import Trainer, TrainingArguments
from transformers.data.data_collator import default_data_collator
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np
def compute_metrics(class_names):
    """Build a metrics callback suitable for ``Trainer(compute_metrics=...)``.

    Args:
        class_names: Sequence of class names; position ``i`` names class id ``i``.

    Returns:
        A function taking ``(logits, labels)`` and returning a dict with
        ``accuracy``, ``macro_f1``, ``macro_precision``, ``macro_recall`` and,
        for every class name, ``<name>_precision``, ``<name>_recall``,
        ``<name>_f1`` and ``<name>_support``.
    """
    class_ids = list(range(len(class_names)))

    def callback(eval_pred):
        """Compute accuracy, macro and per-class scores for one evaluation pass."""
        logits, labels = eval_pred
        # Models that return extra outputs yield a tuple of predictions with
        # the logits first; keep only the logits so argmax below works.
        if isinstance(logits, tuple):
            logits = logits[0]
        if isinstance(logits, torch.Tensor):
            logits = logits.detach().cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)

        # Macro scores: no explicit label list is passed here, unlike the
        # per-class call below.
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels, preds, average="macro", zero_division=0
        )
        acc = accuracy_score(labels, preds)

        # Per-class scores: passing every class id keeps the result arrays
        # aligned with class_names even if a class never occurs.
        p_cls, r_cls, f1_cls, support_cls = precision_recall_fscore_support(
            labels,
            preds,
            average=None,
            zero_division=0,
            labels=class_ids,
        )

        metrics = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "macro_precision": macro_p,
            "macro_recall": macro_r,
        }
        for idx, name in enumerate(class_names):
            metrics[f"{name}_precision"] = p_cls[idx]
            metrics[f"{name}_recall"] = r_cls[idx]
            metrics[f"{name}_f1"] = f1_cls[idx]
            metrics[f"{name}_support"] = int(support_cls[idx])
        return metrics

    return callback

# NOTE(review): the cells visible here only call trainer.predict(); the
# training-related settings below look unused for that - confirm before pruning.
training_args = TrainingArguments(
    # Evaluation, checkpointing and logging all happen once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    overwrite_output_dir=True,
    # Optimisation hyper-parameters.
    num_train_epochs=10,
    learning_rate=1e-5,
    warmup_ratio=0.01,
    weight_decay=0.05,
    # Evaluation / prediction batch size per device, and bfloat16 precision.
    per_device_eval_batch_size=64,
    bf16=True,
    # Best-checkpoint selection: highest eval_macro_f1 wins.
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
)

# Metrics callback bound to the class names of the label vocabulary.
compute_metrics_callback = compute_metrics(class_labels.names)

trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_source,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics_callback,
)

In [8]:
# Run the classifier over the full tokenized dataset.
predictions = trainer.predict(
    tokenized_source,
)




In [12]:
# Predicted class id per row: index of the largest logit along axis 1.
prediction_labels = np.argmax(predictions.predictions, axis=1)

In [13]:
# Number of rows whose predicted class id is 1.
is_label_1 = prediction_labels == 1
count_label_1 = is_label_1.sum()

In [15]:
import numpy as np

# Tally how many rows were assigned to each predicted class id.
unique, counts = np.unique(prediction_labels, return_counts=True)

# Cast the NumPy scalars to built-in ints so the dict prints as
# {0: 120, 1: 450} rather than {np.int64(0): np.int64(120), ...}
# (the scalar repr used by NumPy >= 2).
label_counts = {int(label): int(count) for label, count in zip(unique, counts)}

print(label_counts)
# Output example: {0: 120, 1: 450, 2: 30}


{np.int64(0): np.int64(96199), np.int64(1): np.int64(105384)}
