# Start parsing label studio

In [1]:
LABEL_STUDIO_DUMP="out/labelstudio/p1/p1_training_labeled.json"
LABELED_CLEANED="out/labelstudio/p1/p1_training_cleaned.json"

In [2]:
import json
with open(LABEL_STUDIO_DUMP, "r", encoding="utf-8") as file:
  documents = json.load(file)

rawdata = [doc["data"] for doc in documents]
metadata = [doc["meta"] for doc in documents]
annotations = [doc["annotations"][0] for doc in documents]

relevant_count = 0;
irrelevant_count = 0;

relevancy_label_mapping = {
  "relevant": 1,
  "irrelevant": 0
}

for data, annotation, meta in zip(rawdata, annotations, metadata):
  label = annotation["result"][0]["value"]["choices"][0]
  if label == "relevant":
    relevant_count += 1
  if label == "irrelevant":
    irrelevant_count += 1
  data["relevancy_label"] = relevancy_label_mapping[label]
  data.update(meta)
  
print(f"Relevant count: {relevant_count} Irrelevant Count: {irrelevant_count}")

with open(LABELED_CLEANED, "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


Relevant count: 351 Irrelevant Count: 649


## Train initial model for pre-annotation on 700 manually labeled data

In [3]:
from datasets import load_dataset, ClassLabel, concatenate_datasets

# Load the full dataset
ds = load_dataset("json", data_files=LABELED_CLEANED)["train"]

unique_labels = ds.unique("relevancy_label")
class_label = ClassLabel(names=[str(label) for label in unique_labels])
ds = ds.cast_column("relevancy_label", class_label)

# First split: 80% train, 20% temp (val + test)
split_ds = ds.train_test_split(test_size=0.1, seed=42, stratify_by_column="relevancy_label")

train_dataset = split_ds["train"]
val_dataset = split_ds["test"]

print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 1000 examples [00:00, 47460.84 examples/s]
Casting the dataset: 100%|██████████| 1000/1000 [00:00<00:00, 190105.79 examples/s]

Train: 900 samples
Validation: 100 samples





In [4]:
import re

def remove_urls(example):
    # Remove http, https, and www URLs
    example["text"] = re.sub(r'https?://\S+|www\.\S+', '', example["text"])
    return example


In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

class CustomModel(BertForSequenceClassification):
  def __init__(self, config, class_weights=None):
    super().__init__(config)
    self.class_weights = class_weights
    self.loss_fct = nn.CrossEntropyLoss(weight=class_weights) if class_weights is not None else nn.CrossEntropyLoss()

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    outputs = super().forward(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    loss = None
    if labels is not None:
        loss = self.loss_fct(logits, labels)

    return (loss, logits) if loss is not None else logits



tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
model = CustomModel.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/", num_labels=2)
model.to(device)

Some weights of CustomModel were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [6]:
def tokenizer_function(examples):
  return tokenizer(
    examples["text"],
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt" 
  )

In [7]:
train_dataset = train_dataset.map(remove_urls)
test_dataset = val_dataset.map(remove_urls)

Map: 100%|██████████| 900/900 [00:00<00:00, 23556.59 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 16493.53 examples/s]


In [8]:
train_dataset = train_dataset.map(tokenizer_function, batched=True)
test_dataset = val_dataset.map(tokenizer_function, batched=True)

Map: 100%|██████████| 900/900 [00:00<00:00, 12833.81 examples/s]
Map: 100%|██████████| 100/100 [00:00<00:00, 8239.31 examples/s]


In [9]:
train_dataset = train_dataset.rename_column("relevancy_label", "labels")
test_dataset = test_dataset.rename_column("relevancy_label", "labels")

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

In [10]:
print(train_dataset[0])

{'labels': tensor(0), 'input_ids': tensor([    2, 30459, 11283, 12508, 13133,   144, 10252,  4842, 30459,   300,
        18881,    36, 30459,  5754,   734,  4471,  2723,  7595,   773, 30459,
         2659,  2651, 16655, 16655,  3403,  3403, 30459,  5759, 10785, 10785,
          457,  4565,    86, 14949,  7137,  4476, 30371,  4425,  8385,  6856,
         2490,  7264, 30463,  3921, 30463,  1137,  5237,   259,   804,  5165,
        30468,  2124,   368,   746, 10643, 30477,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=7,
  per_device_train_batch_size=16,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=5,
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  eval_strategy="epoch",
)

In [12]:
from transformers import Trainer, default_data_collator
from torch.optim import AdamW
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)
  
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
  optimizers=(AdamW(model.parameters(), lr=5e-5), None)
)

In [13]:
from transformers import Trainer, default_data_collator
import evaluate

accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
)

In [14]:
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['labels', 'input_ids', 'attention_mask'])




In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2819,0.129032,0.94
2,0.5113,0.543101,0.79
3,0.1057,0.098385,0.97
4,0.0725,0.200213,0.95
5,0.0681,0.32434,0.92
6,0.0005,0.348205,0.93
7,0.0004,0.432108,0.92




TrainOutput(global_step=399, training_loss=0.14358261361529112, metrics={'train_runtime': 511.2404, 'train_samples_per_second': 12.323, 'train_steps_per_second': 0.78, 'total_flos': 828799824384000.0, 'train_loss': 0.14358261361529112, 'epoch': 7.0})

In [27]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)

CLASS_NAMES = ["irrelevant", "relevant"]  # adjust if you have more

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # overall (macro) metrics
    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # per-class metrics
    per_class = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)
    p_cls, r_cls, f1_cls, support_cls = per_class

    # Flatten per-class metrics into scalars in the returned dict
    metrics = {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
    }
    for idx, name in enumerate(CLASS_NAMES):
        metrics[f"{name}_precision"] = p_cls[idx]
        metrics[f"{name}_recall"]    = r_cls[idx]
        metrics[f"{name}_f1"]        = f1_cls[idx]
        metrics[f"{name}_support"]   = support_cls[idx]

    return metrics

from transformers import Trainer

evaluation_trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
metrics = evaluation_trainer.evaluate()
pred_out = evaluation_trainer.predict(test_dataset)
preds  = pred_out.predictions.argmax(axis=1)
labels = pred_out.label_ids

print("\n=== Confusion Matrix ===")
print(confusion_matrix(labels, preds))

print("\n=== Classification Report ===")
print(classification_report(labels, preds, target_names=CLASS_NAMES, digits=4))

print("=== Scalar metrics returned by Trainer ===")
for k, v in metrics.items():
    print(f"{k:20s}: {v:.4f}")






=== Confusion Matrix ===
[[64  1]
 [ 2 33]]

=== Classification Report ===
              precision    recall  f1-score   support

  irrelevant     0.9697    0.9846    0.9771        65
    relevant     0.9706    0.9429    0.9565        35

    accuracy                         0.9700       100
   macro avg     0.9701    0.9637    0.9668       100
weighted avg     0.9700    0.9700    0.9699       100

=== Scalar metrics returned by Trainer ===
eval_loss           : 0.0984
eval_model_preparation_time: 0.0015
eval_accuracy       : 0.9700
eval_macro_f1       : 0.9668
eval_macro_precision: 0.9701
eval_macro_recall   : 0.9637
eval_irrelevant_precision: 0.9697
eval_irrelevant_recall: 0.9846
eval_irrelevant_f1  : 0.9771
eval_irrelevant_support: 65.0000
eval_relevant_precision: 0.9706
eval_relevant_recall: 0.9429
eval_relevant_f1    : 0.9565
eval_relevant_support: 35.0000
eval_runtime        : 3.1031
eval_samples_per_second: 32.2260
eval_steps_per_second: 4.1890


In [19]:
trainer.save_model("models/ruu-tni-relevancy-classification-p1")
tokenizer.save_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")

('tokenizers/ruu-tni-relevancy-classification-p1/tokenizer_config.json',
 'tokenizers/ruu-tni-relevancy-classification-p1/special_tokens_map.json',
 'tokenizers/ruu-tni-relevancy-classification-p1/vocab.txt',
 'tokenizers/ruu-tni-relevancy-classification-p1/added_tokens.json',
 'tokenizers/ruu-tni-relevancy-classification-p1/tokenizer.json')

In [24]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")
model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")
tokenizers.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")

model.safetensors: 100%|██████████| 498M/498M [00:33<00:00, 15.0MB/s]   
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification-p1/commit/7d051601964831ee4d7ab5840923863ffb25e03b', commit_message='Upload tokenizer', commit_description='', oid='7d051601964831ee4d7ab5840923863ffb25e03b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification-p1', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/ruu-tni-relevancy-classification-p1'), pr_revision=None, pr_num=None)

In [22]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1", cache_dir="cache/")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1", cache_dir="cache/")
model.eval()

def predict(text):
  inputs = tokenizers(text, return_tensors="pt", truncation=True, padding=True)
  with torch.no_grad():
    outputs = model(**inputs)
  logits = outputs.logits
  
  probabilities = torch.softmax(logits, dim=1)
  predicted_class_idx = torch.argmax(probabilities, dim=1).item()
  
  label_mappings = {
    1: "relevant",
    0: "irrelevant"
  }
  
  print(f"Predicted class index: {label_mappings[predicted_class_idx]}")
  print(f"Probabilities: {probabilities.squeeze().tolist()}")


In [23]:
predict("Kehidupan WNI semakin terancam dengan ruu tni #cabutruutni")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicted class index: relevant
Probabilities: [0.015615860000252724, 0.9843841791152954]


## Pre-annotation on new sampled data using relevancy stage 1 model.

In [30]:
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np

# Load input JSON
with open("out/indobertweet/indobertweet-kmeans-embed.json", "r", encoding="utf-8") as file:
    documents = json.load(file)

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1", device_map=None
)
tokenizer = AutoTokenizer.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1"
)
model.eval()
device = torch.device("mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else "cpu")
model.to(device)

# Prediction function with batching
def predict_in_batches(texts, batch_size=32):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Batch predicting"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        indices = torch.argmax(probs, dim=1)
        label_map = ["irrelevant", "relevant"]
        batch_results = [
            {"label": label_map[i.item()], "score": probs[j][i].item()}
            for j, i in enumerate(indices)
        ]
        results.extend(batch_results)
    return results

# Run prediction
texts = [doc["content"] for doc in documents]
results = predict_in_batches(texts, batch_size=32)

#calculate entropy
def binary_entropy(p):
    p = np.clip(p, 1e-12, 1 - 1e-12)  # Avoid log(0)
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)
    
parsed_result = []
for doc, result in zip(documents, results):
    doc["prediction_result"] = result["label"]
    doc["prediction_score"] = result["score"]
    doc["entropy_score"] = binary_entropy(result["score"])
    parsed_result.append(doc)

# Save output
with open("out/labelstudio/p2/p2_training_preannotated.json", "w", encoding="utf-8") as file:
    json.dump(parsed_result, file, ensure_ascii=False, indent=2)

Batch predicting: 100%|██████████| 511/511 [02:30<00:00,  3.40it/s]


In [31]:

import json
from collections import defaultdict
import math
import pandas as pd
import random
with open("out/labelstudio/p2/p2_training_preannotated.json", "r", encoding="utf-8") as file:
  documents = json.load(file)
with open("out/labelstudio/p1/p1_training_cleaned.json", "r", encoding="utf-8") as file:
  p1documents = json.load(file)
  
p1documents_ids = [doc["tweet_id"] for doc in p1documents]

purified = []

for doc in documents:
  found = 0
  for ids in p1documents_ids:
    if(doc["tweet_id"] == ids):
      found = 1
  if found == 0:
    purified.append(doc)

print(len(documents), len(p1documents), len(purified))
documents = purified
SAMPLE_COUNT = 2000

buckets = defaultdict(list)
buckets_sampled = defaultdict(list)
bucket_content_count = defaultdict(int)
bucket_ratio_count = defaultdict(float)
data_count = 0

for doc in documents:
  label = doc["bucket_label"]
  buckets[label].append(doc)
  bucket_content_count[label] += 1
  data_count += 1
  
for bucket_name, count in bucket_content_count.items():
  bucket_ratio_count[bucket_name] = count / data_count
  
for bucket_name, data in buckets.items():
  data.sort(key=lambda x: x["entropy_score"], reverse=True)

for bucket_name, data in buckets.items():
  bucket_sample_total = math.ceil(bucket_ratio_count[bucket_name] * SAMPLE_COUNT)
  data_df = pd.DataFrame(data)
  
  high_entropy_threshold = data_df["entropy_score"].quantile(0.30)   # 30th percentile (start of high entropy)
  medium_entropy_lower = data_df["entropy_score"].quantile(0.50)     # 50th percentile
  medium_entropy_upper = data_df["entropy_score"].quantile(0.80)     # 80th percentile
  low_entropy_lower = data_df["entropy_score"].quantile(0.10)        # 10th percentile
  low_entropy_upper = data_df["entropy_score"].quantile(0.20)        # 20th percentile
  
  
  # High entropy: top 70% entropy, i.e. > 30th percentile
  # Print thresholds firs

  low_entropy_tweets = data_df[
    (data_df["entropy_score"] >= low_entropy_lower) & 
    (data_df["entropy_score"] <= low_entropy_upper)
  ]

  medium_entropy_tweets = data_df[
    (data_df["entropy_score"] >= medium_entropy_lower) & 
    (data_df["entropy_score"] <= medium_entropy_upper)
  ]

  high_entropy_tweets = data_df[
    data_df["entropy_score"] > high_entropy_threshold
  ]

  high_entropy_count = math.ceil(bucket_sample_total * 0.7)
  medium_entropy_count = math.ceil(bucket_sample_total * 0.2)
  low_entropy_count = bucket_sample_total - high_entropy_count - medium_entropy_count

  high_entropy_tweets_adjusted = high_entropy_tweets[:high_entropy_count]
  medium_entropy_tweets_adjusted = medium_entropy_tweets.sample(n=medium_entropy_count, random_state=42)
  low_entropy_tweets_adjusted = low_entropy_tweets.sample(n = low_entropy_count, random_state=42)
  
  print(high_entropy_count, medium_entropy_count, low_entropy_count)
  
  concat = pd.concat([high_entropy_tweets_adjusted, medium_entropy_tweets_adjusted, low_entropy_tweets_adjusted], ignore_index=True)
  buckets_sampled[bucket_name] = concat.to_dict(orient="records")
  
merged = []
for dd in buckets_sampled.values():
  merged.extend(dd)
  
formatted = []
for doc in merged:
  formatted.append({
    "data": {
      "text": doc["content"],
      "bucket_label": doc["bucket_label"],
      "prediction_score": doc["prediction_score"],
      "prediction_result": doc["prediction_result"],
      "entropy_score": doc["entropy_score"]
      },
    "meta": {
      "tweet_id": doc["tweet_id"],
      "time": doc["time"],
      "author": doc["author"],
      "comment_count": doc["comment_count"],
      "repost_count": doc["repost_count"],
      "like_count": doc["like_count"],
      "view_count": doc["view_count"],
      "created_at": doc["created_at"],
    },
    "predictions": [{
      "model_version": "1.0.0",
      "score": doc["prediction_score"],
      "result": [{
        "from_name": "sentiment",
        "to_name": "text",
        "type": "choices",
        "value": {"choices": [doc["prediction_result"]]},
      }]
    }]
  })  

with open("out/labelstudio/p2/p2_training_prepped.json", "w", encoding="utf-8") as file:
  json.dump(formatted, file, ensure_ascii=False, indent=2)


16324 1000 15324
144 41 20
50 15 6
224 64 31
70 20 10
157 45 22
119 34 16
200 57 28
64 19 8
111 32 15
269 77 37
