In [1]:
from datasets import load_dataset, ClassLabel
ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")
test_ds = ds["test"]

# Hold out 20% of the labeled sample as the evaluation set used during
# training; the dataset's own "test" split is kept aside for final reporting.
train_ds_split = ds["sampled_20000_labeled"].train_test_split(test_size=0.2, seed=42)

train_ds = train_ds_split["train"]
eval_ds = train_ds_split["test"]

print(train_ds, eval_ds, test_ds)

print(f"Train: {len(train_ds)}, Eval: {len(eval_ds)}, Test: {len(test_ds)}")

# Sort the distinct label values so the name -> class-index mapping is the
# same on every run. Iterating a bare set gives no ordering guarantee, and a
# different order would silently swap class 0 and class 1 for every later
# cell that indexes classes by position (class weights, prob_False/prob_True).
class_labels = ClassLabel(names=sorted(set(train_ds["related"])))

train_ds = train_ds.cast_column("related", class_labels)
eval_ds = eval_ds.cast_column("related", class_labels)
test_ds = test_ds.cast_column("related", class_labels)


def _to_text_label(split):
    """Keep only the tweet text and its label, renamed to "text" / "label"."""
    return (
        split.select_columns(["content", "related"])
        .rename_column("related", "label")
        .rename_column("content", "text")
    )


cleaned_train_ds = _to_text_label(train_ds)
cleaned_eval_ds = _to_text_label(eval_ds)
cleaned_test_ds = _to_text_label(test_ds)

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 11560
}) Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 2890
}) Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'related', 'confidence'],
    num_rows: 2000
})
Train: 11560, Eval: 2890, Test: 2000


In [2]:
# Sanity check: after the select/rename step each split should list only 'text' and 'label'.
print(cleaned_train_ds, cleaned_eval_ds, cleaned_test_ds)

Dataset({
    features: ['text', 'label'],
    num_rows: 11560
}) Dataset({
    features: ['text', 'label'],
    num_rows: 2890
}) Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})


In [3]:
from collections import Counter

# Class distribution of the training split (label 0 is reported as False, 1 as True).
labels = cleaned_train_ds["label"]
label_counts = Counter(labels)

# A Counter answers 0 for keys it has never seen, so an absent class counts as zero.
num_false, num_true = label_counts[0], label_counts[1]

print(f"False: {num_false}, True: {num_true}")


False: 3738, True: 7822


In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Prefer the GPU, but fall back to the CPU so this cell still runs on a
# machine without CUDA instead of failing at model.to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load model & tokenizer from the same checkpoint so they cannot drift apart.
MODEL_CHECKPOINT = "indolem/indobertweet-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    cache_dir="cache/",
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    cache_dir="cache/",
)

# Move model to device
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [12]:
def tokenize(key, max_length=128):
    """Build a ``Dataset.map`` callback that tokenizes one text column.

    Args:
        key: Name of the column holding the raw text.
        max_length: Token length every example is truncated and padded to.
            Defaults to 128, the value this notebook trains with.

    Returns:
        A function that takes a row (or a batch of rows) and returns the
        output of the notebook-level ``tokenizer`` applied to ``row[key]``.
    """
    def callback(row):
        return tokenizer(
            row[key],
            truncation=True,
            # Pad to the fixed max_length rather than to the longest item,
            # so every encoded example comes out with the same length.
            padding="max_length",
            max_length=max_length,
        )
    return callback

In [13]:
# Tokenize the "text" column of every split with one shared callback, 256 rows per batch.
tokenizer_callback = tokenize("text")
encoded_train_ds, encoded_eval_ds, encoded_test_ds = [
    split.map(tokenizer_callback, batched=True, batch_size=256)
    for split in (cleaned_train_ds, cleaned_eval_ds, cleaned_test_ds)
]

Map:   0%|          | 0/11560 [00:00<?, ? examples/s]

Map:   0%|          | 0/2890 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [14]:
# Return the label and the two model inputs as torch tensors for all three splits.
for encoded_split in (encoded_train_ds, encoded_eval_ds, encoded_test_ds):
    encoded_split.set_format("torch", columns=["label", "input_ids", "attention_mask"])


In [15]:
from transformers import Trainer
from torch import nn

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights.

    Args:
        *args: Forwarded unchanged to ``Trainer``.
        class_weights: Tensor with one weight per class, handed to
            ``nn.CrossEntropyLoss(weight=...)``. When ``None`` (the default)
            the loss is plain, unweighted cross-entropy.
        **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the (optionally class-weighted) cross-entropy loss for a batch.

        Args:
            model: Model to run the forward pass with.
            inputs: Batch mapping; its "labels" entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            **kwargs: Accepted and ignored, so extra keyword arguments from
                the caller do not break this override.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # The default class_weights=None used to crash on `.to(...)`; treat it
        # as "no weighting" instead.
        weights = None
        if self.class_weights is not None:
            # Weights are created on the CPU by the caller; move them next to the logits.
            weights = self.class_weights.to(logits.device)

        loss_fct = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
        )
        return (loss, outputs) if return_outputs else loss

In [16]:
from transformers.training_args import TrainingArguments
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.trainer import Trainer
from transformers.data.data_collator import default_data_collator
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import numpy as np

def compute_metrics(class_names):
    """Build the metrics callback for a classifier over ``class_names``.

    Args:
        class_names: Class names ordered by class index; used to name the
            per-class entries of the returned metrics.

    Returns:
        A function taking ``(logits, labels)`` and returning a dict with
        ``accuracy``, ``macro_f1``, ``macro_precision``, ``macro_recall`` and,
        for every class name, ``{name}_precision``, ``{name}_recall``,
        ``{name}_f1`` and ``{name}_support``.
    """
    # One shared label list so the macro and per-class scores are always
    # computed over the same set of classes, even if a class is missing
    # from both the labels and the predictions of an evaluation run.
    label_ids = list(range(len(class_names)))

    def callback(eval_pred):
        logits, labels = eval_pred
        if isinstance(logits, torch.Tensor):
            logits = logits.detach().cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.detach().cpu().numpy()
        preds = np.argmax(logits, axis=1)

        acc = accuracy_score(labels, preds)
        macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
            labels,
            preds,
            average="macro",
            zero_division=0,
            labels=label_ids,
        )
        p_cls, r_cls, f1_cls, support_cls = precision_recall_fscore_support(
            labels,
            preds,
            average=None,
            zero_division=0,
            labels=label_ids,
        )

        metrics = {
            "accuracy": acc,
            "macro_f1": macro_f1,
            "macro_precision": macro_p,
            "macro_recall": macro_r,
        }
        for idx, name in enumerate(class_names):
            metrics[f"{name}_precision"] = p_cls[idx] #type: ignore
            metrics[f"{name}_recall"]    = r_cls[idx]  #type: ignore
            metrics[f"{name}_f1"]        = f1_cls[idx]  #type: ignore
            metrics[f"{name}_support"]   = int(support_cls[idx])  #type: ignore
        return metrics
    return callback


# Training configuration. Evaluation, checkpointing and logging all happen once
# per epoch, and the checkpoint with the best eval_macro_f1 is restored at the end.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="epoch",     # evaluate at the end of each epoch
    save_strategy="epoch",           # save checkpoint at the end of each epoch
    learning_rate=3e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=256,
    num_train_epochs=40,
    weight_decay=0.01,
    logging_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_macro_f1",
    greater_is_better=True,
    warmup_steps=500,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the configuration can still be built on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=1,
)

def train_model(
    model,
    train_dataset,
    eval_dataset,
    test_dataset,
    input_class_names,
    class_weights,
    args=None,
):
    """Fine-tune ``model`` with a class-weighted loss, then score it on the test set.

    Args:
        model: Sequence-classification model to train.
        train_dataset: Encoded training split.
        eval_dataset: Encoded split evaluated during training; it drives
            early stopping.
        test_dataset: Encoded split evaluated once after training.
        input_class_names: Class names ordered by class index, used to name
            the per-class metrics.
        class_weights: Per-class weights forwarded to ``WeightedLossTrainer``.
        args: Training arguments to use. When ``None`` the notebook-level
            ``training_args`` is used, as before.

    Returns:
        The metrics dict returned by ``trainer.evaluate`` on ``test_dataset``.
    """
    if args is None:
        # Looked up at call time: whatever `training_args` currently holds in
        # the notebook is what gets used. Pass `args` explicitly to avoid
        # depending on that shared name.
        args = training_args

    compute_callback = compute_metrics(input_class_names)
    trainer = WeightedLossTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=default_data_collator,
        compute_metrics=compute_callback,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
        class_weights=class_weights,
    )
    print("Starting training...")
    trainer.train()
    print("Training complete. Evaluating...")
    return trainer.evaluate(eval_dataset=test_dataset)

In [17]:
import os
# The counted labels are integer class ids (0 = False, 1 = True), so look
# them up with integer keys, the same way the label-count cell does.
num_false = label_counts.get(0, 0)
num_true = label_counts.get(1, 0)
total = num_false + num_true
if total == 0:
    raise ValueError(
        "No training labels were counted; run the label-count cell before computing class weights."
    )

# Each class is weighted by the share of the *other* class, so the rarer
# class contributes more to the loss.
my_weights = torch.tensor([num_true / total,  # weight for class 0 (False)
                           num_false / total], # weight for class 1 (True)
                           dtype=torch.float)
results = train_model(model, encoded_train_ds, encoded_eval_ds, encoded_test_ds, class_labels.names, my_weights)

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1,Macro Precision,Macro Recall,False Precision,False Recall,False F1,False Support,True Precision,True Recall,True F1,True Support
1,0.5946,0.439376,0.818339,0.801357,0.793779,0.814713,0.690909,0.804233,0.743276,945,0.896648,0.825193,0.859438,1945
2,0.3929,0.350835,0.842215,0.827955,0.819321,0.843604,0.719677,0.847619,0.778426,945,0.918965,0.839589,0.877485,1945
3,0.3385,0.334567,0.85917,0.845661,0.837006,0.859737,0.746789,0.861376,0.8,945,0.927222,0.858098,0.891322,1945
4,0.2945,0.331023,0.86263,0.849249,0.840669,0.862851,0.752768,0.863492,0.804337,945,0.928571,0.862211,0.894162,1945
5,0.2433,0.356002,0.840138,0.829051,0.820429,0.854303,0.699752,0.895238,0.785515,945,0.941106,0.813368,0.872587,1945
6,0.1866,0.404863,0.8609,0.845515,0.839141,0.853949,0.762827,0.833862,0.796764,945,0.915455,0.874036,0.894266,1945
7,0.1297,0.461535,0.842215,0.829542,0.820462,0.849589,0.711322,0.870899,0.783064,945,0.929602,0.828278,0.87602,1945


Training complete. Evaluating...


In [20]:
import pandas as pd
import numpy as np
import torch
from transformers import Trainer, TrainingArguments
from IPython.display import display, HTML

# We don't need a full training setup, just a place to output predictions.
# The arguments get their own name so this cell does not overwrite the
# notebook-level `training_args` that train_model reads when it is called.
predict_args = TrainingArguments(
    output_dir="./temp_trainer_results",
    per_device_eval_batch_size=64, # Use a large batch size for fast eval
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=predict_args,
)

print("Running predictions on test_ds...")

# --- 2. Run Predictions ---
# The .predict() method runs inference on test_ds and returns a PredictionOutput object
prediction_output = trainer.predict(encoded_test_ds)

print("Predictions complete.")

# The predictions are logits (raw model scores). We need the class index (0 or 1).
predicted_labels = np.argmax(prediction_output.predictions, axis=-1)

# The true labels are also in the output, just to be safe
true_labels = prediction_output.label_ids

# --- 3. Create DataFrame for Review ---

# Get the original text from the dataset
# This is why your test_ds *must* have the 'text' column
try:
    original_texts = encoded_test_ds['text']
except KeyError:
    print("="*50)
    print("ERROR: Your 'test_ds' does not have a 'text' column.")
    print("Please reload your dataset without removing the 'text' column.")
    print("="*50)
    # Stop execution if the text column is missing
    raise

# Create the main DataFrame. It is deliberately not called `df`: the later
# labeling cell reads `df["predicted_label"]` and must never pick up this
# 0/1 test-set frame by accident.
review_df = pd.DataFrame({
    'text': list(original_texts),
    'true_label': true_labels,
    'predicted_label': predicted_labels
})

# --- 4. Isolate and Categorize Errors ---

# Filter to get only the rows where the model was wrong
errors_df = review_df[review_df['true_label'] != review_df['predicted_label']].copy()

# Add a new column to categorize the error type
def get_error_type(row):
    """Name the kind of mistake in one misclassified row.

    Returns 'False Positive (FP)' for true 0 / predicted 1,
    'False Negative (FN)' for true 1 / predicted 0, and None otherwise.
    """
    if row['true_label'] == 0 and row['predicted_label'] == 1:
        # Model predicted 1 (Relevant), but it was 0 (Not Relevant)
        return 'False Positive (FP)'
    elif row['true_label'] == 1 and row['predicted_label'] == 0:
        # Model predicted 0 (Not Relevant), but it was 1 (Relevant)
        return 'False Negative (FN)'
    return None

# Built as a list so that an empty errors_df (no mistakes) still gets a
# well-formed, empty 'error_type' column.
errors_df['error_type'] = [get_error_type(row) for _, row in errors_df.iterrows()]

print(f"\nFound {len(errors_df)} misclassified samples out of {len(review_df)} total.")

# --- 5. Display the Wrong Answers for Review ---

# Set pandas display options for better text viewing in the notebook
pd.set_option('display.max_colwidth', 300) # Show more text
pd.set_option('display.max_rows', 100)     # Show more rows

print("\n--- 🔴 FALSE POSITIVES (Model said 'Relevant', but was 'Not Relevant') ---")
display(errors_df[errors_df['error_type'] == 'False Positive (FP)'])

print("\n--- 🔵 FALSE NEGATIVES (Model said 'Not Relevant', but was 'Relevant') ---")
display(errors_df[errors_df['error_type'] == 'False Negative (FN)'])

Running predictions on test_ds...


Predictions complete.

Found 293 misclassified samples out of 2000 total.

--- 🔴 FALSE POSITIVES (Model said 'Relevant', but was 'Not Relevant') ---


Unnamed: 0,text,true_label,predicted_label,error_type
6,mantan mertuanya dulu juga begitu setiap hadapi aksi demo. dulu bahasanya ditunggangi komunis. sepertinya kita sedang dibawa mundur ke belakang.,0,1,False Positive (FP)
15,itu kenapa dihadang sih? kan lagi unjuk rasa!,0,1,False Positive (FP)
21,"sudah tau berita dari sukatani yang meminta maaf pada institusi polri dan menurunkan lagunya di platform-platform streaming online? menurut pengajar sth indonesia asfinawati, hal ini tidak dapat dibenarkan dan menandakan bahwa saat ini #indonesiagelap. #kamibersamasukatani",0,1,False Positive (FP)
56,"aku kangen jay banget tolong, dia kenapa tidak update update hueueueue #gagalkanuutni #cabutuutni #tolakuutni #tolakrevisiuutni #peringatandarurat #indonesiagelap #tolakdwifungsiabri #supremasisipil #tolakruupolri #tolakruukejaksaan",0,1,False Positive (FP)
93,"sering, waktu itu bahas bobroknya coretax, warna cat yang bagus buat lebaran, bagaimana cara duit tambahan, mengomongkan dwifungsi abri, harga rumah, kontrakan, proyeksi masa depan, dan masih banyak lagi.",0,1,False Positive (FP)
141,#indonesiagelap #cabutuutni #fufufafaaibnasional,0,1,False Positive (FP)
219,terima kasih kepada teman-teman yang ikut meramaikan sg tentang ruu tni ini karena memang perlu guys kita manfaatkan rasa “fomo” gen saja and gen alpha biar jadi penasaran sama “isu apa sekarang” melihat banyaknya repost an dan akhirnya ikutan repost!!,0,1,False Positive (FP)
255,#cabutruutni #tolakruupolri,0,1,False Positive (FP)
261,love pink sok muchh #tolakuutni #tolakruupolri #tolakrevisiuutni #tolakdwifungsiabri #tolakruupolri,0,1,False Positive (FP)
280,tidak ada itu namanya dwifungsi bro #sinergitasuntukbangsa,0,1,False Positive (FP)



--- 🔵 FALSE NEGATIVES (Model said 'Not Relevant', but was 'Relevant') ---


Unnamed: 0,text,true_label,predicted_label,error_type
12,"prabowo anti buruh, kebijakan yang bapak lontarkan mohon untuk difikirkan kembali. negara maju saja seperti inggris, jerman, italia, dan negara eropa lainnya, buruh melakukan demonstrasi besar-besaran menuntut kenaikan upah minimum dan berhasil naik di atas 20%.",1,0,False Negative (FN)
17,perjuangan belum selesai #adilijokowi #indonesiagelap #peringatandarurat,1,0,False Negative (FN)
18,apanya yang memperbaiki citra institusi #indonesiagelap #kamibersamasukatani <url> ini tidak tersedia.,1,0,False Negative (FN)
33,"unjuk rasa hari buruh sedunia di kantor dprd jateng, ratusan mahasiswa nyaris bentrok <url>",1,0,False Negative (FN)
64,"*jenderal polisi aktif dilantik jadi sekjen dpd ri* dwifungsi polri? <url> *follow dan baca berita terkini di channel konteks <url> irjen sama iqbal jadi sekjen dpd ri langgar uu, harus dievaluasi ulang - konteks.co.iddari konteks.co.id",1,0,False Negative (FN)
...,...,...,...,...
1952,sudahlah diam saja,1,0,False Negative (FN)
1958,ya ampun. we’re sok back tapi 90’s dictatorship guys. #indonesiagelap,1,0,False Negative (FN)
1972,"pilih wakil menteri juga barang sekon yang biasa tukang protes, tukang demo bahkan provokator juga bisa jadi menteri bahkan lulusan usni universitas tidak laku modal jilati mulyono bisa jadi wamenaker, ini ngeri buat investor !! semoga presiden makin cerdas membaca situasi",1,0,False Negative (FN)
1973,"prof dilatih oleh pm masa itu yang ada adab. adab untuk berusaha n berjuang ketika susah. generasi hari ini, diajar oleh untuk berdemo. bermula dizaman um, demonstrasi cari kesalahan adalah adabny. jadi kerajaan sekarang pun adabnya menyalahkan org.freemalaysiatoday.commuhasabah perlu dalam isu ...",1,0,False Negative (FN)


In [11]:
def calculate_weighted_metrics(eval_metrics, class_names=("False", "True"), prefix="eval_"):
    """
    Calculates the support-weighted average F1, precision, and recall
    from a Hugging Face eval metrics dictionary.

    Args:
        eval_metrics: Mapping containing, for every class name, the keys
            ``{prefix}{name}_f1``, ``{prefix}{name}_precision``,
            ``{prefix}{name}_recall`` and ``{prefix}{name}_support``.
        class_names: Class names to average over. Defaults to the two
            classes used in this notebook.
        prefix: Prefix in front of every metric key.

    Returns:
        A dict with ``weighted_f1``, ``weighted_precision``,
        ``weighted_recall`` and ``total_support``. An empty dict is returned
        (after printing an error) when a required key is missing or the
        total support is zero.
    """
    fields = ("f1", "precision", "recall", "support")
    per_class = [
        {field: eval_metrics.get(f"{prefix}{name}_{field}") for field in fields}
        for name in class_names
    ]

    # Check that we have the minimum required keys
    if any(entry["f1"] is None or entry["support"] is None for entry in per_class):
        print("Error: Missing required keys for F1/support.")
        return {}
    # Precision/recall were previously used unchecked and raised a TypeError
    # when absent; report them the same way as the other missing keys.
    if any(entry["precision"] is None or entry["recall"] is None for entry in per_class):
        print("Error: Missing required keys for precision/recall.")
        return {}

    # --- Calculate Total Support ---
    total_support = sum(entry["support"] for entry in per_class)
    if total_support == 0:
        print("Error: Total support is zero.")
        return {}

    # --- Calculate Weighted Averages ---
    def _weighted(field):
        """Average one per-class metric, weighting each class by its support."""
        return sum(entry[field] * entry["support"] for entry in per_class) / total_support

    return {
        "weighted_f1": _weighted("f1"),
        "weighted_precision": _weighted("precision"),
        "weighted_recall": _weighted("recall"),
        "total_support": total_support
    }

In [12]:
# Support-weighted averages derived from `results`, followed by the raw metrics dict itself.
print(calculate_weighted_metrics(results))
print(results)

{'weighted_f1': 0.8443277350249011, 'weighted_precision': 0.8439607440587283, 'weighted_recall': 0.845, 'total_support': 2000}
{'eval_loss': 0.5155515670776367, 'eval_accuracy': 0.845, 'eval_macro_f1': 0.8293310548111575, 'eval_macro_precision': 0.8323690431753141, 'eval_macro_recall': 0.8266513811551479, 'eval_False_precision': 0.7923976608187134, 'eval_False_recall': 0.7633802816901408, 'eval_False_f1': 0.7776183644189383, 'eval_False_support': 710, 'eval_True_precision': 0.8723404255319149, 'eval_True_recall': 0.889922480620155, 'eval_True_f1': 0.8810437452033768, 'eval_True_support': 1290, 'eval_runtime': 0.8425, 'eval_samples_per_second': 2373.901, 'eval_steps_per_second': 9.496, 'epoch': 11.0}


In [13]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from scipy.special import softmax # Or use torch.nn.functional.softmax
import numpy as np
import pandas as pd
from tqdm import tqdm
model.eval()     # Put model in evaluation mode (turns off dropout)

# 3. Prepare your new data
new_texts = ds["cleaned"]["content"]

batch_size = 32  # <-- Adjust this based on your GPU memory. Try 16, 32, or 64.

# Truncate at the same length the training tokenizer used (128 tokens), so
# texts are cut the same way here as in the evaluated test set.
max_length = 128

# --- 5. NEW: Batch Prediction Loop ---
all_probabilities = []
all_predicted_indices = []

print(f"Starting prediction on {len(new_texts)} texts in batches of {batch_size}...")

# Use torch.no_grad() for the whole loop
with torch.no_grad():
    # Loop over the new_texts in chunks
    for i in tqdm(range(0, len(new_texts), batch_size)):

        # Get the current batch of texts as a plain list of strings
        batch_texts = list(new_texts[i : i + batch_size])

        # Tokenize the batch
        inputs = tokenizer(
            batch_texts,
            padding=True,           # Pad to the longest text *in this batch*
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move batch to the model's device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run inference
        outputs = model(**inputs)

        # Get logits and convert to probabilities
        logits = outputs.logits
        probabilities = softmax(logits.cpu().numpy(), axis=1)

        # Get predicted indices
        predicted_indices = np.argmax(probabilities, axis=1)

        # Store results
        all_probabilities.append(probabilities)
        all_predicted_indices.append(predicted_indices)

print("Prediction complete.")

# --- 6. Consolidate Results ---
# Combine the results from all batches
all_probabilities = np.concatenate(all_probabilities, axis=0)
all_predicted_indices = np.concatenate(all_predicted_indices, axis=0)

# --- 7. Package into DataFrame ---
# Build every column in one go instead of appending one dict per row.
# Column 0 / 1 of the probabilities are reported as prob_False / prob_True.
df = pd.DataFrame({
    'text': list(new_texts),
    'predicted_label': [class_labels.names[idx] for idx in all_predicted_indices],
    'confidence': all_probabilities.max(axis=1),
    'prob_False': all_probabilities[:, 0],
    'prob_True': all_probabilities[:, 1],
})
print(df)

Starting prediction on 201583 texts in batches of 32...


 81%|████████  | 5098/6300 [01:02<00:14, 80.94it/s]


KeyboardInterrupt: 

In [None]:
cleaned_ds = ds["cleaned"]
predictions_list = df["predicted_label"].tolist()

# Predictions are matched to rows purely by position, so a stale or partial
# `df` (for example after an interrupted prediction run) would mislabel rows
# or fail halfway through the map. Refuse to continue unless the sizes agree.
if len(predictions_list) != len(cleaned_ds):
    raise ValueError(
        f"Have {len(predictions_list)} predictions for {len(cleaned_ds)} rows; "
        "re-run the batch prediction cell to completion before updating 'related'."
    )

# 3. Define the function that .map() will run on each row
def update_row(example, idx):
    """
    Takes a single row (example) and its index (idx).
    It replaces the value in 'related' with the new prediction.
    """
    example['related'] = predictions_list[idx]
    return example

# 4. Apply the function to the entire dataset
# .map() returns a new dataset, so we must re-assign it
print("Starting to update 'related' column...")
cleaned_ds = cleaned_ds.map(update_row, with_indices=True)

print("Column updated successfully!")
print(cleaned_ds)

# Check the first row to see the change
# You should now see your string label (e.g., "True" or "False")
# instead of the old boolean (e.g., True or False)
print(cleaned_ds[5])

In [None]:
view_ds = cleaned_ds.select_columns(["content", "related"])

# Only the first 1000 rows are printed, so select them before converting to
# Python objects instead of materialising the whole split and slicing it.
preview_rows = view_ds.select(range(min(1000, len(view_ds)))).to_list()
for row in preview_rows:
    print(row)

In [None]:
# Put the relabelled split back into `ds` under the "cleaned" key (in memory only; nothing is uploaded here).
ds["cleaned"] = cleaned_ds

In [None]:
# Pushes `ds` to the same Hub repo id it was loaded from at the top of the notebook.
# NOTE(review): presumably this replaces the published data in place — confirm before running.
ds.push_to_hub("tianharjuno/twitter-parse", commit_description="Labeled cleaned ds")

In [None]:
# Publish the fine-tuned model and its tokenizer to the same Hub repo, model first.
for artifact in (model, tokenizer):
    artifact.push_to_hub(
        "tianharjuno/ruu-tni-relevancy-classification-p1",
        commit_description="Changed training pipeline",
    )