In [1]:
labels = [
  "probable_spam",
  "medium_no_hashtags",
  "long_with_hashtags",
  "medium_with_hashtags",
  "short_no_hashtag",
  "short_with_hashtag",
  "shotgun"
]

In [2]:
import json
import random
from collections import defaultdict

# Parameters
INPUT_FILE = 'out/feature-only-cleaned-with-labels.json'
OUTPUT_FILE = 'out/feature-only-labelstudio-prepped.json'
SAMPLES_PER_BUCKET = 100  # Change this as needed

# Load data
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    tweets = json.load(f)

# Group tweets by bucket
buckets = defaultdict(list)
for tweet in tweets:
  bucket_label = labels[tweet["metadata"]["bucket_label"]]
  tweet["bucket_label"] = bucket_label
  buckets[bucket_label].append(tweet)

# Sample tweets
sampled_tweets = []
for bucket_label, tweets_in_bucket in buckets.items():
  if len(tweets_in_bucket) < SAMPLES_PER_BUCKET:
    print(f"Warning: Bucket '{bucket_label}' has only {len(tweets_in_bucket)} tweets. Sampling all.")
    sampled = tweets_in_bucket
  else:
    sampled = random.sample(tweets_in_bucket, SAMPLES_PER_BUCKET)
  sampled_tweets.extend(sampled)

# Save to output JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
  json.dump(sampled_tweets, f, ensure_ascii=False, indent=2)

print(f"Sampled tweets saved to {OUTPUT_FILE}")


Sampled tweets saved to out/feature-only-labelstudio-prepped.json


# Start parsing label studio

In [3]:
import json
with open("out/feature-only-relevancy-labeled.json", "r", encoding="utf-8") as file:
  documents = json.load(file)

rawdata = [doc["data"] for doc in documents]
annotations = [doc["annotations"][0] for doc in documents]

for data, annotation in zip(rawdata, annotations):
  data["relevancy_label"] = annotation["result"][0]["value"]["choices"][0]
with open("out/feature-only-relevancy-labeled-cleaned.json", "w", encoding="utf-8") as file:
  json.dump(rawdata, file, ensure_ascii=False, indent=2)


In [4]:
from datasets import load_dataset
import pandas as pd
ds = load_dataset("json", data_files="out/feature-only-relevancy-labeled-cleaned.json")["train"]
split_ds = ds.train_test_split(test_size=0.2)
train_dataset = split_ds["train"]
test_dataset = split_ds["test"]

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 700 examples [00:00, 43416.73 examples/s]


In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
model = AutoModelForSequenceClassification.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def tokenizer_function(examples):
  return tokenizer(
    examples["content"],
    padding="max_length",
    truncation=True,
    max_length=256
  )

In [7]:
train_dataset = train_dataset.map(tokenizer_function, batched=True)
test_dataset = test_dataset.map(tokenizer_function, batched=True)

label2id = {"irrelevant": 0, "relevant": 1}  # adjust to your label names

def convert_labels(example):
  example["relevancy_label"] = label2id[example["relevancy_label"]]
  return example

train_dataset = train_dataset.map(convert_labels)
test_dataset = test_dataset.map(convert_labels)

Map: 100%|██████████| 560/560 [00:00<00:00, 13191.93 examples/s]
Map: 100%|██████████| 140/140 [00:00<00:00, 15411.74 examples/s]
Map: 100%|██████████| 560/560 [00:00<00:00, 11164.93 examples/s]
Map: 100%|██████████| 140/140 [00:00<00:00, 10963.45 examples/s]


In [8]:
train_dataset = train_dataset.rename_column("relevancy_label", "labels")
test_dataset = test_dataset.rename_column("relevancy_label", "labels")


train_dataset = train_dataset.rename_column("content", "text")
test_dataset = test_dataset.rename_column("content", "text")

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])




In [9]:
print(train_dataset[0]["labels"])

tensor(1)


In [10]:
from transformers import create_optimizer

batch_size = 32
epochs=3
train_data_size = len(train_dataset)
steps_per_epoch = train_data_size // batch_size
num_train_steps = steps_per_epoch * epochs 
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps= num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.01
)


In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
  output_dir="./results",
  num_train_epochs=3,
  per_device_train_batch_size=16,
  save_strategy="epoch",
  logging_strategy="steps",
  logging_steps=100,
  load_best_model_at_end=True,
  metric_for_best_model="accuracy",
  eval_strategy="epoch"
)

In [27]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

In [28]:
from transformers import Trainer, default_data_collator
import evaluate

accuracy_metric = evaluate.load("accuracy")
trainer = Trainer(
  model=model,
  args = training_args,
  train_dataset=train_dataset,
  eval_dataset=test_dataset,
  compute_metrics=compute_metrics,
  data_collator=default_data_collator
)

In [18]:
train_dataloader = trainer.get_train_dataloader()
batch = next(iter(train_dataloader))
print(batch.keys())

dict_keys(['labels', 'input_ids', 'token_type_ids', 'attention_mask'])




In [29]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.338198,0.9
2,No log,0.43586,0.885714
3,0.081700,0.383238,0.921429




TrainOutput(global_step=105, training_loss=0.07939920453798203, metrics={'train_runtime': 141.433, 'train_samples_per_second': 11.878, 'train_steps_per_second': 0.742, 'total_flos': 221013286502400.0, 'train_loss': 0.07939920453798203, 'epoch': 3.0})