# Start parsing label studio

In [57]:
LABEL_STUDIO_DUMP="out/labelstudio/p1/p1_training_labeled.json"
LABELED_CLEANED="out/labelstudio/p1/p1_training_cleaned.json"

In [58]:
import json
with open(LABEL_STUDIO_DUMP, "r", encoding="utf-8") as file:
  documents = json.load(file)

rawdata = [doc["data"] for doc in documents]
metadata = [doc["meta"] for doc in documents]
annotations = [doc["annotations"][0] for doc in documents]

relevant_count = 0;
irrelevant_count = 0;

relevancy_label_mapping = {
  "relevant": 1,
  "irrelevant": 0
}

for data, annotation, meta in zip(rawdata, annotations, metadata):
  label = annotation["result"][0]["value"]["choices"][0]
  if label == "relevant":
    relevant_count += 1
  if label == "irrelevant":
    irrelevant_count += 1
  data["relevancy_label"] = relevancy_label_mapping[label]
  data.update(meta)
  
print(f"Relevant count: {relevant_count} Irrelevant Count: {irrelevant_count}")

with open(LABELED_CLEANED, "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


Relevant count: 416 Irrelevant Count: 602


## Train initial model for pre-annotation on 700 manually labeled data

In [59]:
from datasets import load_dataset, ClassLabel, concatenate_datasets

# Load the full dataset
ds = load_dataset("json", data_files=LABELED_CLEANED)["train"]

from datasets import ClassLabel

# Define number of classes
num_classes = len(set(ds["bucket_label"]))

# Replace column with ClassLabel version
class_label = ClassLabel(num_classes=num_classes, names=[str(i) for i in range(num_classes)])
ds = ds.cast_column("bucket_label", class_label)

unique_labels = ds.unique("relevancy_label")
class_label = ClassLabel(names=[str(label) for label in unique_labels])
ds = ds.cast_column("relevancy_label", class_label)

# First split: 80% train, 20% temp (val + test)
split_ds = ds.train_test_split(test_size=0.2, seed=42, stratify_by_column="relevancy_label")

train_dataset = split_ds["train"]
val_dataset = split_ds["test"]

print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")

Generating train split: 1018 examples [00:00, 35199.47 examples/s]
Casting the dataset: 100%|██████████| 1018/1018 [00:00<00:00, 438063.14 examples/s]
Casting the dataset: 100%|██████████| 1018/1018 [00:00<00:00, 441095.19 examples/s]

Train: 814 samples
Validation: 204 samples





In [60]:
import re

def remove_urls(example):
    # Remove http, https, and www URLs
    example["text"] = re.sub(r'<url>', '', example["text"])
    return example


In [61]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
import json
from sklearn.utils import class_weight
import numpy as np
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
class CustomModel(BertForSequenceClassification):
  def __init__(self, config, class_weights=None):
    super().__init__(config)
    labels = np.array(train_dataset["relevancy_label"])
    weights = class_weight.compute_class_weight(
      class_weight="balanced",
      classes = np.array([0, 1]),
      y = labels
    )
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

    weights[0] = weights[0] * 1.5  # Adjust weight for class 0
    self.class_weights = torch.tensor(weights, dtype=torch.float).to(device)
    self.loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
    # self.loss_fct = nn.CrossEntropyLoss(weight=class_weights) if class_weights is not None else nn.CrossEntropyLoss(weight=weight)

  def forward(self, input_ids=None, attention_mask=None, labels=None):
    outputs = super().forward(input_ids, attention_mask=attention_mask)
    logits = outputs.logits

    loss = None
    if labels is not None:
        loss = self.loss_fct(logits, labels)

    return (loss, logits) if loss is not None else logits

with open("out/hashtag_list.json", "r") as file:
  hashtags = json.load(file)

tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/")
model = CustomModel.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/", num_labels=2)
model.to(device)
tokenizer.add_tokens(hashtags)
model.resize_token_embeddings(len(tokenizer))

Some weights of CustomModel were not initialized from the model checkpoint at indolem/indobertweet-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'loss_fct.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(32025, 768, padding_idx=0)

In [62]:
def tokenizer_function(examples):
  return tokenizer(
    examples["text"],
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt" 
  )

In [63]:
train_dataset = train_dataset.map(remove_urls)
test_dataset = val_dataset.map(remove_urls)
train_dataset = train_dataset.map(tokenizer_function, batched=True)
test_dataset = val_dataset.map(tokenizer_function, batched=True)

Map: 100%|██████████| 814/814 [00:00<00:00, 19436.98 examples/s]
Map: 100%|██████████| 204/204 [00:00<00:00, 18225.62 examples/s]
Map: 100%|██████████| 814/814 [00:00<00:00, 10121.23 examples/s]
Map: 100%|██████████| 204/204 [00:00<00:00, 8155.46 examples/s]


In [64]:
train_dataset = train_dataset.rename_column("relevancy_label", "labels")
test_dataset = test_dataset.rename_column("relevancy_label", "labels")

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

In [65]:
print(train_dataset[0])

{'labels': tensor(0), 'input_ids': tensor([    3,  2106, 15593,  2480,  2548,  5311,  1777, 18499,  1713,  2426,
        15430,  6358,    17,  1953,  3892,  5219,    35,  3417,  1777,  1656,
            5,  5311,  2341,  4923,  6773,  1959, 14316,  3798,  4856,  2549,
          965,  2480,  2548,    18, 31925,     4,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [66]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=5,
  per_device_train_batch_size=16,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=50,
  load_best_model_at_end=True,
  metric_for_best_model="eval_precision",
  greater_is_better=True,
  eval_strategy="epoch",
  warmup_ratio=0.1,
  weight_decay=0.05,
  learning_rate=3e-5,
  lr_scheduler_type="linear"
)

In [67]:
from transformers import Trainer, default_data_collator
from torch.optim import AdamW
import evaluate
import numpy as np
accuracy_metric = evaluate.load("accuracy")
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
)


def compute_metrics(eval_pred):
    """
    eval_pred is a transformers EvalPrediction: (logits, labels)
    Returns a dict whose keys become 'eval_<key>' in Trainer logs.
    """
    logits, labels = eval_pred
    # Convert to numpy, take argmax for predicted class (0/1).
    preds = np.argmax(logits, axis=-1)

    # F1, precision, recall for the positive class (1 = relevant)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )

    acc = accuracy_score(labels, preds)

    return {
        "eval_accuracy": acc,
        "eval_precision": precision,
        "eval_recall": recall,
        "eval_f1": f1,
    }
  
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
  optimizers=(AdamW(model.parameters(), lr=5e-5), None)
)

In [68]:
from transformers import Trainer, default_data_collator
import evaluate

accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator,
)

In [69]:
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['labels', 'input_ids', 'attention_mask'])




In [70]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5724,0.396877,0.838235,0.8125,0.783133,0.797546
2,0.3225,0.300545,0.852941,0.804598,0.843373,0.823529
3,0.2302,0.312566,0.848039,0.861111,0.746988,0.8
4,0.1869,0.27981,0.872549,0.793814,0.927711,0.855556
5,0.1205,0.297771,0.857843,0.77551,0.915663,0.839779




TrainOutput(global_step=255, training_loss=0.2823714928299773, metrics={'train_runtime': 371.89, 'train_samples_per_second': 10.944, 'train_steps_per_second': 0.686, 'total_flos': 535430997657600.0, 'train_loss': 0.2823714928299773, 'epoch': 5.0})

In [79]:
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
)
import pandas as pd
import json
from transformers import Trainer
from datasets import Dataset

CLASS_NAMES = ["irrelevant", "relevant"]  # adjust if you have more

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=1)

    # overall (macro) metrics
    macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    # per-class metrics
    per_class = precision_recall_fscore_support(labels, preds, average=None, zero_division=0)
    p_cls, r_cls, f1_cls, support_cls = per_class

    # Flatten per-class metrics into scalars in the returned dict
    metrics = {
        "accuracy": acc,
        "macro_f1": macro_f1,
        "macro_precision": macro_p,
        "macro_recall": macro_r,
    }
    for idx, name in enumerate(CLASS_NAMES):
        metrics[f"{name}_precision"] = p_cls[idx]
        metrics[f"{name}_recall"]    = r_cls[idx]
        metrics[f"{name}_f1"]        = f1_cls[idx]
        metrics[f"{name}_support"]   = support_cls[idx]

    return metrics

with open("out/golden-standard.json", "r", encoding="utf-8") as file:
    bucket_effect = json.load(file)
evaluation_pd = pd.DataFrame.from_dict(bucket_effect)
evaluation_dataset = Dataset.from_pandas(evaluation_pd)
evaluation_dataset = evaluation_dataset.rename_column("relevancy_label", "labels")
evaluation_dataset = evaluation_dataset.map(remove_urls)
evaluation_dataset = evaluation_dataset.map(tokenizer_function, batched=True)
evaluation_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

evaluation_trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=evaluation_dataset,
    data_collator=default_data_collator,
    optimizers=(AdamW(model.parameters(), lr=5e-5), None),
    compute_metrics=compute_metrics,
)
metrics = evaluation_trainer.evaluate()
pred_out = evaluation_trainer.predict(evaluation_dataset)

logits = torch.tensor(pred_out.predictions, dtype=torch.float32)
probs = torch.nn.functional.softmax(logits, dim=1)

threshold = 0.99
relevant_probs = probs[:, 1]
preds = (relevant_probs > threshold).int()
# preds  = pred_out.predictions.argmax(axis=1)
labels = pred_out.label_ids

print("\n=== Confusion Matrix ===")
print(confusion_matrix(labels, preds))

print("\n=== Classification Report ===")
print(classification_report(labels, preds, target_names=CLASS_NAMES, digits=4))

print("=== Scalar metrics returned by Trainer ===")
for k, v in metrics.items():
    print(f"{k:20s}: {v:.4f}")

Map: 100%|██████████| 1100/1100 [00:00<00:00, 20693.01 examples/s]
Map: 100%|██████████| 1100/1100 [00:00<00:00, 13046.60 examples/s]





=== Confusion Matrix ===
[[678   0]
 [317 105]]

=== Classification Report ===
              precision    recall  f1-score   support

  irrelevant     0.6814    1.0000    0.8105       678
    relevant     1.0000    0.2488    0.3985       422

    accuracy                         0.7118      1100
   macro avg     0.8407    0.6244    0.6045      1100
weighted avg     0.8036    0.7118    0.6524      1100

=== Scalar metrics returned by Trainer ===
eval_loss           : 0.2856
eval_model_preparation_time: 0.0009
eval_accuracy       : 0.8727
eval_macro_f1       : 0.8586
eval_macro_precision: 0.8883
eval_macro_recall   : 0.8449
eval_irrelevant_precision: 0.8494
eval_irrelevant_recall: 0.9646
eval_irrelevant_f1  : 0.9033
eval_irrelevant_support: 678.0000
eval_relevant_precision: 0.9273
eval_relevant_recall: 0.7251
eval_relevant_f1    : 0.8138
eval_relevant_support: 422.0000
eval_runtime        : 25.5869
eval_samples_per_second: 42.9910
eval_steps_per_second: 5.3930


In [None]:
trainer.save_model("models/ruu-tni-relevancy-classification-p1")
tokenizer.save_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")

model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")
tokenizers.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1", cache_dir="cache/")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1", cache_dir="cache/")
model.eval()

def predict(text):
  inputs = tokenizers(text, return_tensors="pt", truncation=True, padding=True)
  with torch.no_grad():
    outputs = model(**inputs)
  logits = outputs.logits
  
  probabilities = torch.softmax(logits, dim=1)
  predicted_class_idx = torch.argmax(probabilities, dim=1).item()
  
  label_mappings = {
    1: "relevant",
    0: "irrelevant"
  }
  
  print(f"Predicted class index: {label_mappings[predicted_class_idx]}")
  print(f"Probabilities: {probabilities.squeeze().tolist()}")


In [None]:
predict("ruu tni kontol #cabutruutni")

## Pre-annotation on new sampled data using relevancy stage 1 model.

In [None]:
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np

# Load input JSON
with open("out/indobertweet/indobertweet-kmeans-embed.json", "r", encoding="utf-8") as file:
    documents = json.load(file)

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1", device_map=None
)
tokenizer = AutoTokenizer.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1"
)
model.eval()
device = torch.device("mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else "cpu")
model.to(device)

# Prediction function with batching
def predict_in_batches(texts, batch_size=32):
    results = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Batch predicting"):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=1)
        indices = torch.argmax(probs, dim=1)
        label_map = ["irrelevant", "relevant"]
        batch_results = [
            {"label": label_map[i.item()], "score": probs[j][i].item()}
            for j, i in enumerate(indices)
        ]
        results.extend(batch_results)
    return results

# Run prediction
texts = [doc["content"] for doc in documents]
results = predict_in_batches(texts, batch_size=32)

#calculate entropy
def binary_entropy(p):
    p = np.clip(p, 1e-12, 1 - 1e-12)  # Avoid log(0)
    return -p * np.log2(p) - (1 - p) * np.log2(1 - p)
    
parsed_result = []
for doc, result in zip(documents, results):
    doc["prediction_result"] = result["label"]
    doc["prediction_score"] = result["score"]
    doc["entropy_score"] = binary_entropy(result["score"])
    parsed_result.append(doc)

# Save output
with open("out/labelstudio/p2/p2_training_preannotated.json", "w", encoding="utf-8") as file:
    json.dump(parsed_result, file, ensure_ascii=False, indent=2)

#### Compare previous bucketing technique to current bucketing technique

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

with open("out/bucketing-effect-analyze.json", "r", encoding="utf-8") as file:
    documents = json.load(file)

df = pd.DataFrame.from_dict(documents)

entropy = df['entropy_score'].sort_values()

# Define bucket edges (uniform)
uniform_bins = np.linspace(entropy.min(), entropy.max(), 20)

# Define bucket edges (quantiles)
quantile_bins = entropy.quantile(np.linspace(0, 1, 20)).values

def cumulative_curve(entropy_values, bins):
    counts, bin_edges = np.histogram(entropy_values, bins=bins)
    cum_counts = np.cumsum(counts)
    cum_fraction = cum_counts / cum_counts[-1]
    # Use bin upper edges for x axis (skip first edge)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    return bin_centers, cum_fraction

# Uniform bucketing cumulative curve
x_uniform, y_uniform = cumulative_curve(entropy, uniform_bins)

# Plot cumulative curves
plt.figure(figsize=(10,6))
plt.plot(x_uniform, y_uniform, label='Uniform Bucketing', marker='o')

plt.xlabel('Entropy Score')
plt.ylabel('Cumulative Fraction of Samples')
plt.title('Cumulative Distribution of Samples by Entropy Score')
plt.legend()
plt.grid(True)
plt.show()




In [None]:

import json
from collections import defaultdict
import math
import pandas as pd
with open("out/labelstudio/p2/p2_training_preannotated.json", "r", encoding="utf-8") as file:
  documents = json.load(file)
with open("out/labelstudio/p1/p1_training_cleaned.json", "r", encoding="utf-8") as file:
  p1documents = json.load(file)
  
p1documents_ids = [doc["tweet_id"] for doc in p1documents]

purified = []

for doc in documents:
  found = 0
  for ids in p1documents_ids:
    if(doc["tweet_id"] == ids):
      found = 1
  if found == 0:
    purified.append(doc)

print(len(documents), len(p1documents), len(purified))
documents = purified
SAMPLE_COUNT = 2000

buckets = defaultdict(list)
buckets_sampled = defaultdict(list)
bucket_content_count = defaultdict(int)
bucket_ratio_count = defaultdict(float)
data_count = 0

for doc in documents:
  label = doc["bucket_label"]
  buckets[label].append(doc)
  bucket_content_count[label] += 1
  data_count += 1
  
for bucket_name, count in bucket_content_count.items():
  bucket_ratio_count[bucket_name] = count / data_count
  
for bucket_name, data in buckets.items():
  data.sort(key=lambda x: x["entropy_score"], reverse=True)

for bucket_name, data in buckets.items():
  bucket_sample_total = math.ceil(bucket_ratio_count[bucket_name] * SAMPLE_COUNT)
  data_df = pd.DataFrame(data)
  
  high_entropy_threshold = data_df["entropy_score"].quantile(0.30)   # 30th percentile (start of high entropy)
  medium_entropy_lower = data_df["entropy_score"].quantile(0.50)     # 50th percentile
  medium_entropy_upper = data_df["entropy_score"].quantile(0.80)     # 80th percentile
  low_entropy_lower = data_df["entropy_score"].quantile(0.10)        # 10th percentile
  low_entropy_upper = data_df["entropy_score"].quantile(0.20)        # 20th percentile
  
  
  # High entropy: top 70% entropy, i.e. > 30th percentile
  # Print thresholds firs

  low_entropy_tweets = data_df[
    (data_df["entropy_score"] >= low_entropy_lower) & 
    (data_df["entropy_score"] <= low_entropy_upper)
  ]

  medium_entropy_tweets = data_df[
    (data_df["entropy_score"] >= medium_entropy_lower) & 
    (data_df["entropy_score"] <= medium_entropy_upper)
  ]

  high_entropy_tweets = data_df[
    data_df["entropy_score"] > high_entropy_threshold
  ]

  high_entropy_count = math.ceil(bucket_sample_total * 0.7)
  medium_entropy_count = math.ceil(bucket_sample_total * 0.2)
  low_entropy_count = bucket_sample_total - high_entropy_count - medium_entropy_count
  low_entropy_count = max(0, low_entropy_count)

  high_entropy_tweets_adjusted = high_entropy_tweets[:high_entropy_count]
  medium_entropy_tweets_adjusted = medium_entropy_tweets.sample(n=medium_entropy_count, random_state=42)
  
  low_entropy_tweets_adjusted = low_entropy_tweets.sample(n = low_entropy_count, random_state=42)
  
  print(high_entropy_count, medium_entropy_count, low_entropy_count)
  
  concat = pd.concat([high_entropy_tweets_adjusted, medium_entropy_tweets_adjusted, low_entropy_tweets_adjusted], ignore_index=True)
  buckets_sampled[bucket_name] = concat.to_dict(orient="records")
  
merged = []
for dd in buckets_sampled.values():
  merged.extend(dd)
  
formatted = []
for doc in merged:
  formatted.append({
    "data": {
      "text": doc["content"],
      "bucket_label": doc["bucket_label"],
      "prediction_score": doc["prediction_score"],
      "prediction_result": doc["prediction_result"],
      "entropy_score": doc["entropy_score"]
      },
    "meta": {
      "tweet_id": doc["tweet_id"],
      "time": doc["time"],
      "author": doc["author"],
      "comment_count": doc["comment_count"],
      "repost_count": doc["repost_count"],
      "like_count": doc["like_count"],
      "view_count": doc["view_count"],
      "created_at": doc["created_at"],
    },
    "predictions": [{
      "model_version": "1.0.0",
      "score": doc["prediction_score"],
      "result": [{
        "from_name": "sentiment",
        "to_name": "text",
        "type": "choices",
        "value": {"choices": [doc["prediction_result"]]},
      }]
    }]
  })  

with open("out/labelstudio/p2/p2_training_prepped.json", "w", encoding="utf-8") as file:
  json.dump(formatted, file, ensure_ascii=False, indent=2)
