In [11]:
# Human-readable bucket names. Order matters: the sampling cell looks a name
# up by position via labels[tweet["metadata"]["bucket_label"]].
labels = [
    "generic",                    # index 0
    "probable_spam",              # index 1
    "long_tweets_with_hashtag",   # index 2
    "short_tweets_with_hashtag",  # index 3
]

In [12]:
import json
import random
from collections import defaultdict

# Parameters
INPUT_FILE = 'out/cleaned-with-labels.json'
OUTPUT_FILE = 'out/labelstudio-prepped.json'
SAMPLES_PER_BUCKET = 3000  # Change this as needed
RANDOM_SEED = 42  # Fixed so a fresh-kernel re-run draws the same sample

# Dedicated generator: makes the draw reproducible without touching the
# global `random` state that other cells may rely on.
rng = random.Random(RANDOM_SEED)

# Load data: the code below iterates the parsed JSON as a sequence of dicts,
# each read via tweet["metadata"]["bucket_label"].
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    tweets = json.load(f)

# Group tweets by bucket.
# tweet["metadata"]["bucket_label"] is used as a position into `labels`; the
# resolved name is also written onto the tweet itself (top-level
# "bucket_label" key) so it ends up in the exported JSON.
buckets = defaultdict(list)
for tweet in tweets:
    bucket_label = labels[tweet["metadata"]["bucket_label"]]
    tweet["bucket_label"] = bucket_label
    buckets[bucket_label].append(tweet)

# Sample tweets: at most SAMPLES_PER_BUCKET per bucket, without replacement.
# Buckets are visited in first-seen order, so with a fixed seed the output is
# fully determined by the input file.
sampled_tweets = []
for bucket_label, tweets_in_bucket in buckets.items():
    if len(tweets_in_bucket) < SAMPLES_PER_BUCKET:
        # Too few tweets to sample from: keep the whole bucket, in input order.
        print(f"Warning: Bucket '{bucket_label}' has only {len(tweets_in_bucket)} tweets. Sampling all.")
        sampled = tweets_in_bucket
    else:
        sampled = rng.sample(tweets_in_bucket, SAMPLES_PER_BUCKET)
    sampled_tweets.extend(sampled)

# Save to output JSON (ensure_ascii=False keeps non-ASCII tweet text readable)
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    json.dump(sampled_tweets, f, ensure_ascii=False, indent=2)

print(f"Sampled tweets saved to {OUTPUT_FILE}")


Sampled tweets saved to out/labelstudio-prepped.json
