In [1]:
labels = [
  "probable_spam",
  "medium_no_hashtags",
  "long_with_hashtags",
  "medium_with_hashtags",
  "short_no_hashtag",
  "short_with_hashtag",
  "shotgun"
]

# Start parsing label studio

In [2]:
import json
with open("out/labelstudio/labelstudio-dump.json", "r", encoding="utf-8") as file:
  documents = json.load(file)

rawdata = [doc["data"] for doc in documents]
annotations = [doc["annotations"][0] for doc in documents]

relevant_count = 0;
irrelevant_count = 0;

relevancy_label_mapping = {
  "relevant": 1,
  "irrelevant": 0
}

for data, annotation in zip(rawdata, annotations):
  label = annotation["result"][0]["value"]["choices"][0]
  if label == "relevant":
    relevant_count += 1
  if label == "irrelevant":
    irrelevant_count += 1
  data["relevancy_label"] = relevancy_label_mapping[label]
print(f"Relevant count: {relevant_count} Irrelevant Count: {irrelevant_count}")

with open("out/labelstudio/labelstudio-dump-cleaned.json", "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


Relevant count: 332 Irrelevant Count: 368


In [3]:
from datasets import load_dataset
import pandas as pd

# Load the full dataset
ds = load_dataset("json", data_files="out/labelstudio/labelstudio-dump-cleaned.json")["train"]

# First split: 80% train, 20% temp (val + test)
split_ds = ds.train_test_split(test_size=0.2, seed=42)

# Second split: split the 20% into 10% val, 10% test
temp_split = split_ds["test"].train_test_split(test_size=0.3, seed=42)

train_dataset = split_ds["train"]
val_dataset = temp_split["train"]
test_dataset = temp_split["test"]

print(f"Train: {len(train_dataset)} samples")
print(f"Validation: {len(val_dataset)} samples")
print(f"Test: {len(test_dataset)} samples")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 700 examples [00:00, 50395.87 examples/s]

Train: 560 samples
Validation: 98 samples
Test: 42 samples





In [4]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenizer_function(examples):
  return tokenizer(
    examples["content"],
    padding="max_length",
    truncation=True,
    max_length=256,
  )

In [6]:
train_dataset = train_dataset.map(tokenizer_function, batched=True)
test_dataset = test_dataset.map(tokenizer_function, batched=True)

Map: 100%|██████████| 560/560 [00:00<00:00, 14608.48 examples/s]
Map: 100%|██████████| 42/42 [00:00<00:00, 7974.32 examples/s]


In [7]:
train_dataset = train_dataset.rename_column("relevancy_label", "labels")
test_dataset = test_dataset.rename_column("relevancy_label", "labels")


train_dataset = train_dataset.rename_column("content", "text")
test_dataset = test_dataset.rename_column("content", "text")

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

In [8]:
print(train_dataset[0])

{'labels': tensor(1), 'input_ids': tensor([    2, 30459,  3588,    32,  7506,  5820, 30356, 30459, 18122,  7506,
         5820, 30356, 30459, 10348,  7506,  5820, 30356, 30459, 10348,   757,
         2324, 11230,    63,   784, 30459,  5546, 12475,  3147, 30459,   300,
        18881,    36, 30459, 10348, 30364,  2723,  8615,   104,   301, 30459,
          888, 20409,  2626,    44, 30459, 10348,  8299,   110, 20955, 30459,
        10348,  8299, 11096,  3667,   887,     5,     3,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=4,
  per_device_train_batch_size=16,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=10,
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  eval_strategy="epoch"
)

In [10]:
from transformers import Trainer, default_data_collator
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)
  
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator
)

In [11]:
from transformers import Trainer, default_data_collator
import evaluate

accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator
)

In [12]:
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])




In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2479,0.470826,0.809524
2,0.1877,0.13074,0.97619
3,0.1328,0.152003,0.904762
4,0.0215,0.172454,0.928571




TrainOutput(global_step=140, training_loss=0.1832751909536975, metrics={'train_runtime': 186.3254, 'train_samples_per_second': 12.022, 'train_steps_per_second': 0.751, 'total_flos': 294684382003200.0, 'train_loss': 0.1832751909536975, 'epoch': 4.0})

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=1)
    
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
from transformers import Trainer

evaluationTrainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)
metrics = evaluationTrainer.evaluate()
print(metrics)



{'eval_loss': 0.1307397484779358, 'eval_model_preparation_time': 0.0009, 'eval_accuracy': 0.9761904761904762, 'eval_f1': 0.9761769710720363, 'eval_precision': 0.9772727272727273, 'eval_recall': 0.9761904761904762, 'eval_runtime': 0.9842, 'eval_samples_per_second': 42.676, 'eval_steps_per_second': 6.097}


In [15]:
trainer.save_model("models/ruu-tni-relevancy-classification")
tokenizer.save_pretrained("tokenizers/ruu-tni-relevancy-classification")

('tokenizers/ruu-tni-relevancy-classification/tokenizer_config.json',
 'tokenizers/ruu-tni-relevancy-classification/special_tokens_map.json',
 'tokenizers/ruu-tni-relevancy-classification/vocab.txt',
 'tokenizers/ruu-tni-relevancy-classification/added_tokens.json',
 'tokenizers/ruu-tni-relevancy-classification/tokenizer.json')

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained("models/ruu-tni-relevancy-classification")
tokenizers = AutoTokenizer.from_pretrained("tokenizers/ruu-tni-relevancy-classification")
model.push_to_hub("tianharjuno/ruu-tni-relevancy-classification")
tokenizers.push_to_hub("tianharjuno/ruu-tni-relevancy-classification")

model.safetensors:  35%|███▌      | 176M/498M [02:21<20:58, 256kB/s]  

RuntimeError: Error while uploading 'model.safetensors' to the Hub.