In [None]:
# Label names, in their original order. Not referenced by the other cells visible here.
labels = [
    "probable_spam",
    "medium_no_hashtags",
    "long_with_hashtags",
    "medium_with_hashtags",
    "short_no_hashtag",
    "short_with_hashtag",
    "shotgun",
]

# Parse the Label Studio export

In [None]:
import json
# Load the raw Label Studio export: a list of task dicts, each with "data" and "annotations".
with open("out/labelstudio/labelstudio-dump.json", "r", encoding="utf-8") as file:
  documents = json.load(file)

# Choice name -> integer class id stored in the cleaned dump.
relevancy_label_mapping = {
  "relevant": 1,
  "irrelevant": 0
}

relevant_count = 0
irrelevant_count = 0
skipped_count = 0

# Only tasks that actually carry a choice go into the cleaned dump. A task with no
# annotations, or whose first annotation has an empty result / empty choices list,
# previously aborted the whole cell with IndexError; it is now skipped and counted.
rawdata = []
annotations = []
for doc in documents:
  doc_annotations = doc["annotations"]
  annotation_result = doc_annotations[0]["result"] if doc_annotations else []
  choices = annotation_result[0]["value"]["choices"] if annotation_result else []
  if not choices:
    skipped_count += 1
    continue

  label = choices[0]
  data = doc["data"]
  # An unknown choice name still raises KeyError here, as before.
  data["relevancy_label"] = relevancy_label_mapping[label]
  if label == "relevant":
    relevant_count += 1
  elif label == "irrelevant":
    irrelevant_count += 1
  rawdata.append(data)
  annotations.append(doc_annotations[0])

print(f"Relevant count: {relevant_count} Irrelevant Count: {irrelevant_count}")
if skipped_count:
  print(f"Skipped tasks without a usable annotation: {skipped_count}")

# Persist the labelled records (task data plus "relevancy_label") for the training cells.
with open("out/labelstudio/labelstudio-dump-cleaned.json", "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


In [None]:
from datasets import load_dataset
import pandas as pd

# Load the full dataset
ds = load_dataset("json", data_files="out/labelstudio/labelstudio-dump-cleaned.json")["train"]

# First split: 80% train, 20% temp (val + test)
split_ds = ds.train_test_split(test_size=0.2, seed=42)

# Second split: halve the 20% hold-out into 10% val and 10% test.
# (The previous test_size=0.3 produced roughly 14% val / 6% test instead.)
temp_split = split_ds["test"].train_test_split(test_size=0.5, seed=42)

train_dataset = split_ds["train"]
val_dataset = temp_split["train"]
test_dataset = temp_split["test"]

# Report the resulting split sizes.
print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")
print(f"Test: {len(test_dataset)} samples")

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenizer and a 2-label sequence-classification model, both loaded from the
# "indobenchmark/indobert-base-p2" checkpoint and cached under cache/.
tokenizer = AutoTokenizer.from_pretrained(
  "indobenchmark/indobert-base-p2",
  cache_dir="cache/",
)
model = AutoModelForSequenceClassification.from_pretrained(
  "indobenchmark/indobert-base-p2",
  num_labels=2,
  cache_dir="cache/",
)

In [None]:
def tokenizer_function(examples):
  """Tokenize the "content" field of a batch with the module-level `tokenizer`.

  The tokenizer is called with padding="max_length", truncation enabled and
  max_length=256. Returns whatever the tokenizer call returns.
  """
  encoding_options = {
    "padding": "max_length",
    "truncation": True,
    "max_length": 256,
  }
  return tokenizer(examples["content"], **encoding_options)

In [None]:
# Tokenize the train and test splits in batched mode (train first, then test).
train_dataset, test_dataset = (
  split.map(tokenizer_function, batched=True)
  for split in (train_dataset, test_dataset)
)

In [None]:
# Rename the label column to "labels" and the raw text column to "text" on both splits.
train_dataset, test_dataset = (
  split.rename_column("relevancy_label", "labels").rename_column("content", "text")
  for split in (train_dataset, test_dataset)
)

# Expose only the tensor columns, formatted as torch tensors.
train_dataset.set_format(
  "torch",
  columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
)
test_dataset.set_format(
  "torch",
  columns=["input_ids", "attention_mask", "token_type_ids", "labels"],
)

In [None]:
# Sanity check: show the first training example after tokenization and formatting.
print(train_dataset[0])

In [None]:
from transformers import TrainingArguments

# Training configuration: 4 epochs, batch size 16 per device, checkpoints under ./results.
training_args = TrainingArguments(
  output_dir="./results",
  # Optimisation schedule.
  num_train_epochs=4,
  per_device_train_batch_size=16,
  # Evaluate and save once per epoch; the best checkpoint is chosen by "accuracy".
  eval_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  # Log every 10 steps.
  logging_strategy="steps",
  logging_steps=10,
)

In [None]:
from transformers import Trainer, default_data_collator
import evaluate
# Accuracy metric loaded through the `evaluate` library; used by compute_metrics.
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Return the accuracy metric's result for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); the predicted class is the argmax
    over the last axis of the logits.
    """
    logits, references = eval_pred
    predicted_classes = logits.argmax(axis=-1)
    return accuracy_metric.compute(
        predictions=predicted_classes,
        references=references,
    )
  
# Trainer used for fine-tuning.
# NOTE(review): eval_dataset is the test split, and val_dataset from the split cell is not
# used in the cells visible here, so best-checkpoint selection is driven by test data.
# Consider evaluating on val_dataset (it would need the same tokenization/renaming first).
trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=default_data_collator,
  compute_metrics=compute_metrics,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
)

In [None]:
from transformers import Trainer, default_data_collator
import evaluate

# NOTE(review): this cell repeats the metric/Trainer setup from the previous cell;
# running it simply rebuilds `accuracy_metric` and `trainer` with the same arguments.
accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args=training_args,
  data_collator=default_data_collator,
  compute_metrics=compute_metrics,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
)

In [None]:
# Sanity check: pull one batch from the Trainer's training dataloader and show
# which keys it contains.
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

In [None]:
# Run fine-tuning with the Trainer built in the cells above.
trainer.train()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    """Compute accuracy plus macro-averaged precision, recall and F1.

    `eval_pred` unpacks into (logits, labels); predictions are the argmax over
    axis 1 of the logits. Returns a dict with the keys 'accuracy', 'f1',
    'precision' and 'recall'.
    """
    logits, references = eval_pred
    predicted = logits.argmax(axis=1)

    accuracy = accuracy_score(references, predicted)
    macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(
        references, predicted, average='macro'
    )

    return dict(
        accuracy=accuracy,
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )
from transformers import Trainer

# Separate Trainer that reuses the current model and arguments but reports the
# richer metrics from the compute_metrics defined in this cell.
evaluationTrainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Evaluate on the test split and show the resulting metrics dict.
metrics = evaluationTrainer.evaluate()
print(metrics)

In [None]:
# Write the trained model and the tokenizer to separate local directories.
trainer.save_model("models/ruu-tni-relevancy-classification")
tokenizer.save_pretrained("tokenizers/ruu-tni-relevancy-classification")

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Reload the locally saved model and tokenizer, then upload both to the same Hub repo.
# Note: despite the plural name, `tokenizers` holds a single tokenizer instance.
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification")
model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification")
tokenizers.push_to_hub("tianharjuno/ruu-tni-relevancy-classification")

  from .autonotebook import tqdm as notebook_tqdm
model.safetensors: 100%|██████████| 498M/498M [00:34<00:00, 14.4MB/s] 


CommitInfo(commit_url='https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification/commit/bdc20a08c6b7f9ffafc5d0a9164a97ac74a5b6ad', commit_message='Upload tokenizer', commit_description='', oid='bdc20a08c6b7f9ffafc5d0a9164a97ac74a5b6ad', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/ruu-tni-relevancy-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/ruu-tni-relevancy-classification'), pr_revision=None, pr_num=None)

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the model and tokenizer from the local save directories; `predict` below
# reads these module-level names (`model`, `tokenizers`).
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import torch
def predict(text, max_length=256):
  """Classify one text and print the predicted label and the class probabilities.

  Uses the module-level `tokenizers` and `model`.

  Args:
    text: Input passed to the tokenizer.
    max_length: Truncation limit handed to the tokenizer. Defaults to 256, the
      max_length used by tokenizer_function and predict_in_batches in this
      notebook, so single predictions see the same truncation as training.

  Returns:
    None. The label name and the probability list are printed.
  """
  inputs = tokenizers(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=max_length,
  )
  # Keep the inputs on the model's device so this also works once the model is on a GPU.
  inputs = inputs.to(model.device)
  with torch.no_grad():
    outputs = model(**inputs)
  logits = outputs.logits

  probabilities = torch.softmax(logits, dim=1)
  predicted_class_idx = torch.argmax(probabilities, dim=1).item()

  # Class id -> label name, the inverse of the mapping used when building the dataset.
  label_mappings = {
    1: "relevant",
    0: "irrelevant"
  }

  print(f"Predicted class index: {label_mappings[predicted_class_idx]}")
  print(f"Probabilities: {probabilities.squeeze().tolist()}")


In [8]:
# Smoke-test the classifier on a single sample post.
predict("ruu tni merusak masa depan bangsa!!!!! #cabutruutni")

Predicted class index: relevant
Probabilities: [0.00480042165145278, 0.9951995611190796]


## Pre-annotate the entire training dataset before feeding it into Label Studio

In [1]:
import json
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from tqdm import tqdm

# Read the sampled documents that will be pre-annotated.
with open("out/labelstudio/labelstudio-sampled.json", "r", encoding="utf-8") as file:
    documents = json.load(file)

# Load the tokenizer and the fine-tuned classifier from the Hub repo (cached under cache/).
tokenizer = AutoTokenizer.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification",
    cache_dir="cache/",
)
model = AutoModelForSequenceClassification.from_pretrained(
    "tianharjuno/ruu-tni-relevancy-classification",
    cache_dir="cache/",
    device_map=None,
)

# Run on CUDA when available, otherwise on CPU, with the model in eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Prediction function with batching
def predict_in_batches(texts, batch_size=64):
    """Classify `texts` in chunks of `batch_size` using the module-level model.

    Each chunk is tokenized (truncation on, dynamic padding, max_length=256),
    moved to `device` and run through `model` without gradients.

    Returns:
        A list with one dict per input text, in input order:
        {"label": "irrelevant" | "relevant", "score": softmax probability of that label}.
    """
    label_map = ["irrelevant", "relevant"]
    results = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Batch predicting"):
        encoded = tokenizer(
            texts[start:start + batch_size],
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=256
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded)
        probs = torch.softmax(outputs.logits, dim=1)
        winners = torch.argmax(probs, dim=1)
        # One result per row: the winning label and its probability.
        for row, winner in enumerate(winners):
            results.append({
                "label": label_map[winner.item()],
                "score": probs[row][winner].item(),
            })
    return results

# Classify the "content" field of every document.
texts = [doc["content"] for doc in documents]
results = predict_in_batches(texts, batch_size=64)

# Wrap each document and its prediction in the task/predictions structure
# that is written out for Label Studio.
parsed_result = [
    {
        "data": doc,
        "predictions": [{
            "model_version": "0.0.1",
            "result": [{
                "from_name": "sentiment",
                "to_name": "text",
                "type": "choices",
                "value": {"choices": [result["label"]]},
                "score": result["score"],
            }],
        }],
    }
    for result, doc in zip(results, documents)
]

# Write the pre-annotated tasks to disk.
with open("out/labelstudio/labelstudio-sampled-preannotated.json", "w", encoding="utf-8") as file:
    json.dump(parsed_result, file, ensure_ascii=False, indent=2)


  from .autonotebook import tqdm as notebook_tqdm
Batch predicting: 100%|██████████| 126/126 [02:34<00:00,  1.22s/it]
