In [None]:
# Names of the tweet buckets.
# NOTE(review): presumably these are the values stored under each document's
# "bucket_label" key used by the sampling cell further down — confirm. The list
# itself is not read by any other cell visible in this notebook.
labels = [
  "probable_spam",
  "medium_no_hashtags",
  "long_with_hashtags",
  "medium_with_hashtags",
  "short_no_hashtag",
  "short_with_hashtag",
  "shotgun"
]

# Parse the Label Studio export

In [53]:
import json
# Read the Label Studio export: a list of tasks, each carrying "data" and "annotations".
with open("out/labelstudio/p1-training/labelstudio-dump.json", "r", encoding="utf-8") as file:
  documents = json.load(file)

# Annotator choice string -> integer class id written into the cleaned file.
relevancy_label_mapping = {
  "relevant": 1,
  "irrelevant": 0
}

rawdata = [doc["data"] for doc in documents]
# Only the first annotation of every task is considered.
annotations = [doc["annotations"][0] for doc in documents]

relevant_count = 0
irrelevant_count = 0

for data, annotation in zip(rawdata, annotations):
  # First choice of the first result entry is the relevancy decision.
  label = annotation["result"][0]["value"]["choices"][0]
  relevant_count += label == "relevant"
  irrelevant_count += label == "irrelevant"
  # Unknown labels raise KeyError here, so nothing is silently mislabeled.
  data["relevancy_label"] = relevancy_label_mapping[label]

print(f"Relevant count: {relevant_count} Irrelevant Count: {irrelevant_count}")

# Persist only the task payloads, now carrying "relevancy_label".
with open("out/labelstudio/p1-training/labelstudio-dump-cleaned.json", "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


Relevant count: 332 Irrelevant Count: 368


## Train the initial pre-annotation model on 700 manually labeled samples

In [54]:
from datasets import load_dataset

# Load the full dataset (the cleaned Label Studio dump written by the parsing cell)
ds = load_dataset("json", data_files="out/labelstudio/p1-training/labelstudio-dump-cleaned.json")["train"]

# First split: 80% train, 20% held out (validation + test)
split_ds = ds.train_test_split(test_size=0.2, seed=42)

# Second split: the held-out 20% is divided 70/30, i.e. ~14% validation and ~6% test
# of the full dataset (98 and 42 rows out of 700) — not an even 10%/10% split.
temp_split = split_ds["test"].train_test_split(test_size=0.3, seed=42)

train_dataset = split_ds["train"]
val_dataset = temp_split["train"]   # larger part of the held-out data
test_dataset = temp_split["test"]   # smaller part of the held-out data

print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")
print(f"Test: {len(test_dataset)} samples")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 700 examples [00:00, 15613.85 examples/s]

Train: 560 samples
Validation: 98 samples
Test: 42 samples





In [55]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the pretrained tokenizer and encoder for "indobenchmark/indobert-base-p2",
# caching downloads under cache/. num_labels=2 attaches a two-class classification
# head (relevant / irrelevant); that head is newly initialized and must be trained.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
def tokenizer_function(examples):
  """Tokenize the "content" field of a (batched) example dict.

  Every sequence is padded to, and truncated at, 256 tokens so all rows have
  the same length. Uses the notebook-level ``tokenizer``.
  """
  encoding_options = dict(padding="max_length", truncation=True, max_length=256)
  return tokenizer(examples["content"], **encoding_options)

In [57]:
# Tokenize the train and test splits in batches; this adds the tokenizer's output
# columns to each dataset.
# NOTE(review): val_dataset is not tokenized here or in any later visible cell.
train_dataset = train_dataset.map(tokenizer_function, batched=True)
test_dataset = test_dataset.map(tokenizer_function, batched=True)

Map: 100%|██████████| 560/560 [00:00<00:00, 9945.00 examples/s]
Map: 100%|██████████| 42/42 [00:00<00:00, 5274.27 examples/s]


In [58]:
# Rename the target column to "labels".
# NOTE(review): presumably because the Trainer/model expects that name — confirm.
# NOTE(review): this cell reassigns train_dataset/test_dataset from themselves, so
# re-running it without re-running the cells above will likely fail because the
# source columns no longer exist — confirm.
train_dataset = train_dataset.rename_column("relevancy_label", "labels")
test_dataset = test_dataset.rename_column("relevancy_label", "labels")


# Rename the raw tweet text column; it is not part of the tensor columns selected below.
train_dataset = train_dataset.rename_column("content", "text")
test_dataset = test_dataset.rename_column("content", "text")

# Return only the model inputs and the labels, as torch tensors.
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

In [59]:
# Sanity check: show the first formatted training example (label plus token tensors).
print(train_dataset[0])

{'labels': tensor(1), 'input_ids': tensor([    2, 30459,  3588,    32,  7506,  5820, 30356, 30459, 18122,  7506,
         5820, 30356, 30459, 10348,  7506,  5820, 30356, 30459, 10348,   757,
         2324, 11230,    63,   784, 30459,  5546, 12475,  3147, 30459,   300,
        18881,    36, 30459, 10348, 30364,  2723,  8615,   104,   301, 30459,
          888, 20409,  2626,    44, 30459, 10348,  8299,   110, 20955, 30459,
        10348,  8299, 11096,  3667,   887,     5,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [None]:
from transformers import TrainingArguments

# Training configuration: 10 epochs with batch size 16, loss logged every 5 steps.
# Evaluation and checkpointing both happen once per epoch, and the checkpoint with
# the best "accuracy" is reloaded when training finishes.
# NOTE(review): `eval_strategy` appears to be the newer spelling of
# `evaluation_strategy`; confirm the installed transformers version accepts it.
training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=10,
  per_device_train_batch_size=16,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=5,
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  eval_strategy="epoch"
)

In [61]:
from transformers import Trainer, default_data_collator
import evaluate
# Accuracy metric from the `evaluate` library, used by compute_metrics below.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy for one evaluation pass.

    ``eval_pred`` unpacks into (logits, labels); the predicted class is the
    index of the largest logit along the last axis.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return accuracy_metric.compute(references=labels, predictions=predicted_classes)
  
# Trainer that fine-tunes `model` on the training split and evaluates once per epoch.
# NOTE(review): eval_dataset is the *test* split. Combined with
# load_best_model_at_end=True, the best checkpoint is selected on the test data,
# so later test metrics are optimistic. val_dataset was created for this purpose
# but is never tokenized or used — consider evaluating on it instead.
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator
)

In [62]:
from transformers import Trainer, default_data_collator
import evaluate

# NOTE(review): this cell repeats the previous cell (same metric, same Trainer
# arguments) except that it does not redefine compute_metrics. Re-creating
# `trainer` here is redundant; the cell can probably be deleted — confirm.
accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator
)

In [None]:
# Debug check: pull one batch from the Trainer's dataloader and list its keys to
# confirm which columns actually reach the model.
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

In [None]:
# Run fine-tuning with the configuration in training_args (checkpoints go to ./results).
trainer.train()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Return accuracy plus macro-averaged precision, recall and F1.

    ``eval_pred`` unpacks into (logits, labels); the predicted class is the
    index of the largest logit along axis 1. This definition replaces the
    accuracy-only ``compute_metrics`` from the earlier cell.
    """
    logits, labels = eval_pred
    predicted_classes = logits.argmax(axis=1)

    # The fourth returned element (support) is not needed.
    macro_scores = precision_recall_fscore_support(labels, predicted_classes, average='macro')
    precision, recall, f1 = macro_scores[:3]

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
from transformers import Trainer

# Separate Trainer used only to evaluate the current `model` with the richer
# compute_metrics defined in this cell (accuracy, macro F1/precision/recall).
# NOTE(review): evaluation runs on test_dataset, the same split used for
# per-epoch evaluation and best-checkpoint selection during training.
evaluationTrainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)
metrics = evaluationTrainer.evaluate()
print(metrics)

In [None]:
# Persist the fine-tuned model and its tokenizer to local directories
# (models/ and tokenizers/ respectively).
trainer.save_model("models/ruu-tni-relevancy-classification-p1")
tokenizer.save_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Reload the locally saved model and tokenizer and push both to the same Hub repository.
# Note: the tokenizer is bound to the name `tokenizers` (plural) here, which differs
# from the `tokenizer` variable used in the training cells.
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1")
model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")
tokenizers.push_to_hub("tianharjuno/ruu-tni-relevancy-classification-p1")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the locally saved model and tokenizer for ad-hoc inference.
# `predict` below reads these two notebook-level names (`model`, `tokenizers`).
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification-p1", cache_dir="cache/")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification-p1", cache_dir="cache/")

In [None]:
import torch
def predict(text):
  """Classify a single text with the relevancy model and print the result.

  Uses the notebook-level ``tokenizers`` and ``model`` objects. Prints the
  predicted class name and the per-class probabilities; returns nothing.

  Args:
    text: one input string. The predicted index is read with ``.item()``,
      so only a single example is supported per call.
  """
  label_mappings = {
    1: "relevant",
    0: "irrelevant"
  }

  # max_length=256 mirrors the limit used when tokenizing the training data and
  # in predict_in_batches, so long inputs are truncated the same way everywhere.
  inputs = tokenizers(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
  with torch.no_grad():  # inference only, no gradients needed
    outputs = model(**inputs)
  logits = outputs.logits

  probabilities = torch.softmax(logits, dim=1)
  predicted_class_idx = torch.argmax(probabilities, dim=1).item()

  # The value printed is the class name, not the numeric index.
  print(f"Predicted class: {label_mappings[predicted_class_idx]}")
  print(f"Probabilities: {probabilities.squeeze().tolist()}")


In [None]:
# Smoke test on a hand-written Indonesian tweet (roughly: "the TNI bill ruins the
# nation's future!!!!! #revokethetnibill").
predict("ruu tni merusak masa depan bangsa!!!!! #cabutruutni")

## Pre-annotate newly sampled data with the stage 1 relevancy model

In [None]:
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm
import numpy as np

# Load input JSON: the bucketed tweets; each document's "content" is classified below.
with open("out/bucketing/tweet-with-bucket-labels.json", "r", encoding="utf-8") as file:
    documents = json.load(file)

# Load model and tokenizer from the Hub repository pushed earlier in this notebook.
model = AutoModelForSequenceClassification.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1", cache_dir="cache/", device_map=None
)
tokenizer = AutoTokenizer.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification-p1", cache_dir="cache/"
)
model.eval()  # inference mode
# Use the Apple-Silicon MPS backend when available, otherwise CPU.
# NOTE(review): CUDA is never selected here, even if a GPU is present.
device = torch.device("mps" if torch.backends.mps.is_available() and torch.backends.mps.is_built() else "cpu")
model.to(device)

# Prediction function with batching
def predict_in_batches(texts, batch_size=32):
    """Classify ``texts`` in mini-batches and return one result per text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: list of input strings.
        batch_size: number of texts tokenized and scored per forward pass.

    Returns:
        A list, in input order, of ``{"label": str, "score": float}`` dicts
        where ``label`` is "irrelevant" or "relevant" and ``score`` is the
        softmax probability of that predicted class.
    """
    # List position equals the class id; built once instead of on every batch.
    label_map = ["irrelevant", "relevant"]
    results = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Batch predicting"):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(
            batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1)
        # Winning probability and its class index for every row in one call.
        scores, indices = probs.max(dim=1)
        # One bulk transfer to Python lists instead of a device sync per element.
        results.extend(
            {"label": label_map[class_idx], "score": score}
            for class_idx, score in zip(indices.tolist(), scores.tolist())
        )
    return results

# Run prediction on the "content" of every document; `results` is aligned
# index-for-index with `documents`.
texts = [doc["content"] for doc in documents]
results = predict_in_batches(texts, batch_size=32)

#calculate entropy
def binary_entropy(p):
    """Entropy in bits of a two-outcome distribution with probabilities p and 1 - p.

    ``p`` is clipped to [1e-12, 1 - 1e-12] first so neither logarithm sees zero.
    """
    eps = 1e-12
    q = np.clip(p, eps, 1 - eps)
    complement = 1 - q
    return -q * np.log2(q) - complement * np.log2(complement)
    
# Attach the prediction to each document. The dicts in `documents` are mutated in
# place, so `parsed_result` holds the same objects as `documents`.
parsed_result = []
for doc, result in zip(documents, results):
    doc["prediction_result"] = result["label"]
    # Probability of the predicted class (never the losing class).
    doc["prediction_score"] = result["score"]
    # Uncertainty of the prediction; binary entropy is symmetric in p and 1 - p,
    # so using the winning probability is sufficient.
    doc["entropy_score"] = binary_entropy(result["score"])
    parsed_result.append(doc)

# Save output
with open("out/labelstudio/p2-training/all-tweets-annotated.json", "w", encoding="utf-8") as file:
    json.dump(parsed_result, file, ensure_ascii=False, indent=2)

In [32]:
import json
import numpy as np

def binary_entropy(p):
  """Return the binary (Bernoulli) entropy of ``p`` in bits.

  The input is clipped away from 0 and 1 so that log2 never receives zero.
  """
  clipped = np.clip(p, 1e-12, 1 - 1e-12)
  terms = (-clipped * np.log2(clipped), (1 - clipped) * np.log2(1 - clipped))
  return terms[0] - terms[1]

# Recompute "entropy_score" from the stored "prediction_score" and rewrite the
# same file in place.
# NOTE(review): the prediction cell above already writes "entropy_score" with an
# identical formula, so this cell looks redundant unless the file was produced
# by an older version of that cell — confirm.
with open("out/labelstudio/p2-training/all-tweets-annotated.json", "r", encoding="utf-8") as file:
  documents = json.load(file)
  
for doc in documents:
  doc["entropy_score"] = binary_entropy(doc["prediction_score"])
    
with open("out/labelstudio/p2-training/all-tweets-annotated.json", "w", encoding="utf-8") as file:
  json.dump(documents, file, ensure_ascii=False, indent=2)

In [51]:

import json
from collections import defaultdict
import math
import pandas as pd
import random
# Load the pre-annotated tweets (each has bucket_label, prediction_* and entropy_score).
with open("out/labelstudio/p2-training/all-tweets-annotated.json", "r", encoding="utf-8") as file:
  documents = json.load(file)

# Target size of the sample exported for annotation. Per-bucket quotas are rounded
# up, so the exported total can exceed this by a few rows.
SAMPLE_COUNT = 8000

buckets = defaultdict(list)
buckets_sampled = defaultdict(list)
bucket_content_count = defaultdict(int)
bucket_ratio_count = defaultdict(float)
data_count = 0

# Group the documents by bucket and count them.
for doc in documents:
  label = doc["bucket_label"]
  buckets[label].append(doc)
  bucket_content_count[label] += 1
  data_count += 1

# Share of the whole corpus that falls into each bucket.
for bucket_name, count in bucket_content_count.items():
  bucket_ratio_count[bucket_name] = count / data_count

# Most uncertain tweets first, so a head-slice below yields the top-N by entropy.
for bucket_name, data in buckets.items():
  data.sort(key=lambda x: x["entropy_score"], reverse=True)

for bucket_name, data in buckets.items():
  # The bucket's quota is proportional to its share of the corpus.
  bucket_sample_total = math.ceil(bucket_ratio_count[bucket_name] * SAMPLE_COUNT)
  data_df = pd.DataFrame(data)

  # Entropy percentiles that delimit the three sampling pools.
  high_entropy_threshold = data_df["entropy_score"].quantile(0.30)   # high pool: above the 30th percentile
  medium_entropy_lower = data_df["entropy_score"].quantile(0.50)     # medium pool: 50th ...
  medium_entropy_upper = data_df["entropy_score"].quantile(0.80)     # ... to 80th percentile
  low_entropy_lower = data_df["entropy_score"].quantile(0.10)        # low pool: 10th ...
  low_entropy_upper = data_df["entropy_score"].quantile(0.20)        # ... to 20th percentile

  low_entropy_tweets = data_df[
    (data_df["entropy_score"] >= low_entropy_lower) &
    (data_df["entropy_score"] <= low_entropy_upper)
  ]

  medium_entropy_tweets = data_df[
    (data_df["entropy_score"] >= medium_entropy_lower) &
    (data_df["entropy_score"] <= medium_entropy_upper)
  ]

  high_entropy_tweets = data_df[
    data_df["entropy_score"] > high_entropy_threshold
  ]

  # Split the quota 70% / 20% / remainder between high, medium and low entropy.
  high_entropy_count = math.ceil(bucket_sample_total * 0.7)
  medium_entropy_count = math.ceil(bucket_sample_total * 0.2)
  # Rounding both quotas up can overshoot the total for tiny buckets; never go negative.
  low_entropy_count = max(0, bucket_sample_total - high_entropy_count - medium_entropy_count)

  # Rows are already in descending entropy order, so this is the top-N of the high pool.
  high_entropy_tweets_adjusted = high_entropy_tweets.iloc[:high_entropy_count]

  # The medium pool (50th-80th percentile) lies inside the high pool's range, so rows
  # already taken above are removed first; otherwise a tweet could be exported twice.
  medium_pool = medium_entropy_tweets[
    ~medium_entropy_tweets.index.isin(high_entropy_tweets_adjusted.index)
  ]
  # Cap n at the pool size: sampling without replacement raises when n is larger.
  medium_entropy_tweets_adjusted = medium_pool.sample(
    n=min(medium_entropy_count, len(medium_pool)), random_state=42
  )

  # Same de-duplication for the low pool (percentile bounds can coincide when many
  # tweets share the same entropy value).
  already_taken = high_entropy_tweets_adjusted.index.union(medium_entropy_tweets_adjusted.index)
  low_pool = low_entropy_tweets[~low_entropy_tweets.index.isin(already_taken)]
  low_entropy_tweets_adjusted = low_pool.sample(
    n=min(low_entropy_count, len(low_pool)), random_state=42
  )

  # Per-bucket quotas (targets, not necessarily the number of rows actually drawn).
  print(high_entropy_count, medium_entropy_count, low_entropy_count)

  concat = pd.concat([high_entropy_tweets_adjusted, medium_entropy_tweets_adjusted, low_entropy_tweets_adjusted], ignore_index=True)
  buckets_sampled[bucket_name] = concat.to_dict(orient="records")

# Flatten the per-bucket samples into one list.
merged = []
for dd in buckets_sampled.values():
  merged.extend(dd)

# Wrap every tweet as a task with one model prediction attached.
# NOTE(review): "from_name"/"to_name" presumably have to match the control and object
# names in the Label Studio labeling config — confirm "sentiment" is correct for
# the relevancy project.
formatted = []
for doc in merged:
  formatted.append({
    "data": doc,
    "predictions": [{
      "model_version": "1.0.0",
      "score": doc["prediction_score"],
      "result": [{
        "from_name": "sentiment",
        "to_name": "text",
        "type": "choices",
        "value": {"choices": [doc["prediction_result"]]},
      }]
    }]
  })

with open("out/labelstudio/p2-training/all-tweets-annotated-sampled.json", "w", encoding="utf-8") as file:
  json.dump(formatted, file, ensure_ascii=False, indent=2)


2730 780 389
827 237 117
252 72 36
395 113 55
765 219 108
325 93 46
312 89 44


In [48]:
import json
# Sanity check: reload the exported sample and print how many tasks it contains.
# The count can be slightly above SAMPLE_COUNT (8000) because the sampling cell
# rounds each bucket's quota up.
with open("out/labelstudio/p2-training/all-tweets-annotated-sampled.json", "r", encoding="utf-8") as file:
  documents = json.load(file)
print(len(documents))

8004
