In [113]:
import json
import pandas as pd
OUT_DIR ="OUT/"
ASSET_DIR="assets/"
with open(ASSET_DIR + "dump-formatted.json", "r", encoding="utf-8") as file:
  RAW_DOCUMENTS = json.load(file)
DOCUMENT_DF = pd.DataFrame.from_records(RAW_DOCUMENTS)
DATA_LEN = len(DOCUMENT_DF)
  
INDOBERT_OUT_FILE = OUT_DIR + "indobert_embeds.jsonl"
INDOBERTWEET_OUT_FILE = OUT_DIR + "indobertweet_embeds.jsonl"
INDOBERT_REDUCED_OUT_FILE = OUT_DIR + "indobert_reduced_embeds.jsonl"
TWEET_REDUCED_OUT_FILE = OUT_DIR + "indobertweet_reduced_embeds.jsonl"
INDOBERT_NORMALIZED = OUT_DIR + "indobert_normalized.jsonl"
TWEET_NORMALIZED = OUT_DIR + "indobertweet_normalized.jsonl"

INDOBERT_KMEANS_EMBED = OUT_DIR + "indobert/indobert-kmeans-embed.json"
INDOBERT_KMEANS_PROPS = OUT_DIR + "model-agnostic/mixed-kmeans-props.json"
INDOBERT_KMEANS_CONCAT = OUT_DIR + "indobert/indobert-kmeans-concat.json"
INDOBERTWEET_KMEANS_EMBED = OUT_DIR + "indobertweet/indobertweet-kmeans-embed.json"
INDOBERTWEET_KMEANS_PROPS = OUT_DIR + "model-agnostic/mixed-kmeans-props.json"
INDOBERTWEET_KMEANS_CONCAT = OUT_DIR + "indobertweet/indobertweet-kmeans-concat.json"

INDOBERT_HDBSCAN_EMBED = OUT_DIR + "indobert/indobert-hdbscan-embed.json"
INDOBERT_HDBSCAN_PROPS = OUT_DIR + "model-agnostic/mixed-hdbscan-props.json"
INDOBERT_HDBSCAN_CONCAT = OUT_DIR + "indobert/indobert-hdbscan-concat.json"
INDOBERTWEET_HDBSCAN_EMBED = OUT_DIR + "indobertweet/indobertweet-hdbscan-embed.json"
INDOBERTWEET_HDBSCAN_PROPS = OUT_DIR + "model-agnostic/mixed-hdbscan-props.json"
INDOBERTWEET_HDBSCAN_CONCAT = OUT_DIR + "indobertweet/indobertweet-hdbscan-concat.json"

### Text Cleaning Function

In [117]:
import re, unicodedata, jaconv, emoji

# ─── pre-compiled patterns ────────────────────────────────────────────────
_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_KUTI_BEF = re.compile(r'(?i)(?<!\s)(kutipan)')
_KUTI_AFT = re.compile(r'(?i)(kutipan)(?!\s)')
_REPEAT   = re.compile(r'(.)\1{2,}')       # ≥3 of same char
_WS       = re.compile(r'\s+')

# remove from the first token that *begins* with “kutipan” (any case) to the string-end
_KUTI_CUT = re.compile(r'(?i)\bkutipan\w*.*$', re.DOTALL)   # pre-compile once

def cleantext(text: str) -> str:
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)

    text = text.replace('\\n', ' ').replace('\\r', ' ')
    text = _URL.sub(' <url> ', text)
    text = _MENTION.sub(' ', text)
    text = re.sub(r'^rt\s+', '', text, flags=re.I)
    text = re.sub(r'(\b\d{4})(?=[a-zA-Z])', r'\1 ', text)

    # ⇣ one liner does all the “kutipan” work; the old _KUTI_BEF/_KUTI_AFT are no longer needed
    text = _KUTI_CUT.sub('', text)

    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = _REPEAT.sub(r'\1\1', text)
    text = _WS.sub(' ', text).strip().lower()
    return text



### Apply cleaning function

In [118]:
DOCUMENT_DF["content"] = DOCUMENT_DF["content"].apply(cleantext)

### Generate splits and golden standard

In [119]:
from sklearn.model_selection import train_test_split
import json


DOCUMENT_DF = DOCUMENT_DF.drop_duplicates(subset=["content"]).reset_index(drop=True)
TRAIN_DF, TEST_DF = train_test_split(
  DOCUMENT_DF,
  test_size=0.90,
  random_state=42,
)
GOLDEN_STANDARD, UNUSED = train_test_split(
  TEST_DF,
  test_size=0.99,
  random_state=42
)
print(len(TRAIN_DF))
with open("out/golden_standard.json", "w", encoding="utf-8") as file:
  json.dump(GOLDEN_STANDARD.to_dict(orient="records"), file, ensure_ascii=False, indent=2)
with open("out/training_split_general.json", "w") as file:
  json.dump(TRAIN_DF.to_dict(orient="records"),file, ensure_ascii=False, indent=2)

15364


### Extract hashtags

In [189]:
import json
from collections import Counter
from typing import List, Hashable, Optional
with open("out/training_split_general.json", "r") as file:
  documents = json.load(file)
texts = [doc["content"] for doc in documents]

hashtags = []

for text in texts:
  text_split = text.split(" ")
  for token in text_split:
    if token.startswith("#"):
      hashtags.append(token)

def most_common_hashtags(
    tags: List[Hashable],
    *,
    top_n: Optional[int] = None,
    min_count: Optional[int] = None,
) -> List[Hashable]:
    if top_n is None and min_count is None:
        raise ValueError("Specify either top_n or min_count")

    freq = Counter(tags)
    # Sort once by (-count, tag) so result is deterministic for ties
    ranked = sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))

    if top_n is not None:
        selected = ranked[:top_n]
    else:
        selected = [kv for kv in ranked if kv[1] >= min_count]

    return [tag for tag, _ in selected]


cleaned_hashtags = most_common_hashtags(hashtags, min_count=20)
with open("out/hashtag_list.json", "w", encoding="utf-8") as file:
  json.dump(cleaned_hashtags, file, ensure_ascii=False, indent=2)

### Initialize all models and tokenizers from IndoBERT and IndoBERTweet

In [121]:
from transformers import AutoModel, AutoTokenizer
import torch
indobert_model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
indobert_tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")

tweet_model = AutoModel.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/")
tweet_tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/")

indobert_tokenizer.add_tokens(cleaned_hashtags)
tweet_tokenizer.add_tokens(cleaned_hashtags)


device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

indobert_model = indobert_model.to(device)
tweet_model = tweet_model.to(device)

#Turn on evaluation mode as default
indobert_model.eval()
tweet_model.eval()


Using device: mps


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(31923, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### Create functions to get encodings for both indobert and indobertweet

In [122]:
import torch
def getEncodings(textArray):
  indobert_inputs = indobert_tokenizer(
    textArray,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
  )
  tweet_inputs = tweet_tokenizer(
    textArray,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="pt"
  )
  indobert_inputs = {k: v.to("mps") for k, v in indobert_inputs.items()}
  tweet_inputs = {k: v.to("mps") for k, v in tweet_inputs.items()}
  with torch.no_grad():
    indobert_outputs = indobert_model(**indobert_inputs)
    tweet_outputs = tweet_model(**tweet_inputs)
    
    
  indobert_embeddings = indobert_outputs.last_hidden_state[:, 0, :]
  tweet_embeddings = tweet_outputs.last_hidden_state[:, 0, :]
  return (indobert_embeddings.cpu().numpy(), tweet_embeddings.cpu().numpy())
  

In [123]:
import os

def removeFile(file_path):
  if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} deleted.")
  else:
    print(f"{file_path} does not exist.")

In [124]:
from tqdm import tqdm
import json
def get_batch_embeddings(documents, batch_size=32):
  #Document is in the shape of a Pandas Dataframe. Convert to a list first before processing
  removeFile(INDOBERT_OUT_FILE)
  removeFile(INDOBERTWEET_OUT_FILE)
  documents_list = documents.to_dict(orient="records")
  for i in tqdm(range(0, len(documents_list), batch_size), desc="Generating embeddings"):
    batched = documents_list[i:i+batch_size]
    texts = [doc["content"] for doc in batched]
    indobert_embedding, tweet_embedding = getEncodings(texts)
    
    with open(INDOBERT_OUT_FILE, "a", encoding="utf-8") as file:
      for doc, embed in zip(batched, indobert_embedding):
        doc_copy = doc.copy()
        doc_copy["embedding"] = embed.tolist()
        file.write(json.dumps(doc_copy, ensure_ascii=False)+ "\n")
        
    with open(INDOBERTWEET_OUT_FILE, "a", encoding="utf-8") as file:
      for doc, embed in zip(batched, tweet_embedding):
        doc_copy = doc.copy()
        doc_copy["embedding"] = embed.tolist()
        file.write(json.dumps(doc_copy, ensure_ascii=False)+ "\n")

In [125]:
get_batch_embeddings(TRAIN_DF)

OUT/indobert_embeds.jsonl deleted.
OUT/indobertweet_embeds.jsonl deleted.


Generating embeddings: 100%|██████████| 481/481 [03:37<00:00,  2.21it/s]


### Create function to reduce embed size

In [126]:
import umap
import numpy as np
def reduce_embed_size(embeds):
  umap_model = umap.UMAP(n_components=45)
  reduced_embedding = umap_model.fit_transform(np.array(embeds))
  return reduced_embedding

### Process Indobert Embeddings

In [130]:
import json
import pandas as pd

with open(INDOBERT_OUT_FILE, "r", encoding="utf-8") as file:
  embedded_documents = []
  for line in file:
    doc = json.loads(line)
    embedded_documents.append(doc)

embeddings = [doc["embedding"] for doc in embedded_documents]
reduced_embeddings = reduce_embed_size(embeddings)

for doc, reduced in zip(embedded_documents, reduced_embeddings):
  doc["embedding"] = reduced.tolist()
removeFile(INDOBERT_REDUCED_OUT_FILE)
with open(INDOBERT_REDUCED_OUT_FILE, "a", encoding="utf-8") as file:
  for doc in embedded_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")



OUT/indobert_reduced_embeds.jsonl deleted.


### Process IndoBERTweet embeddings

In [131]:
import json
import pandas as pd

with open(INDOBERTWEET_OUT_FILE, "r", encoding="utf-8") as file:
  embedded_documents = []
  for line in file:
    doc = json.loads(line)
    embedded_documents.append(doc)

embeddings = [doc["embedding"] for doc in embedded_documents]
reduced_embeddings = reduce_embed_size(embeddings)

for doc, reduced in zip(embedded_documents, reduced_embeddings):
  doc["embedding"] = reduced.tolist()
removeFile(TWEET_REDUCED_OUT_FILE)
with open(TWEET_REDUCED_OUT_FILE, "a", encoding="utf-8") as file:
  for doc in embedded_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")



OUT/indobertweet_reduced_embeds.jsonl deleted.


In [132]:
import nltk
from nltk.corpus import stopwords
import emoji
try:
  stopwords.words('english')
except LookupError:
  nltk.download('stopwords')
stopwords_combined = set(stopwords.words("indonesian")) | set(stopwords.words("english"))

def extract_structural_features(tweet):
  words = tweet.split()
  word_lengths = [len(w) for w in words]
  
  length = len(tweet)
  num_hashtags = tweet.count("#")
  num_mentions = tweet.count("@")
  num_urls = len(re.findall(r"http\S+", tweet))
  num_emojis = len([c for c in tweet if c in emoji.EMOJI_DATA])
  num_upper = sum(1 for c in tweet if c.isupper())
  num_punct = len(re.findall(r"[^\w\s]", tweet))
  avg_word_len = np.mean(word_lengths) if words else 0

  # Content/structure-oriented features
  is_question = int(tweet.strip().endswith('?'))
  is_exclamatory = int(tweet.strip().endswith('!'))
  contains_ellipsis = int("..." in tweet)
  contains_repeated_chars = int(bool(re.search(r"(.)\1{2,}", tweet)))  # e.g., sooo, yessss
  contains_short_link = int(bool(re.search(r"\b(?:https?:\/\/)?(?:www\.)?(bit\.ly|t\.co|tinyurl\.com|goo\.gl|ow\.ly|is\.gd|buff\.ly|adf\.ly|bitly\.com|cutt\.ly|rb\.gy|rebrand\.ly)\/[A-Za-z0-9]+", tweet)))
  contains_digit = int(bool(re.search(r"\d", tweet)))
  is_all_caps = int(tweet.isupper() and len(tweet) > 3)
  is_emoji_only = int(all(c in emoji.EMOJI_DATA or c.isspace() for c in tweet.strip()) and tweet.strip() != "")
  contains_quote_or_rt = int(bool(re.search(r"(RT\s@|\".+\")", tweet)))
  word_count = len(words)
  stopword_ratio = np.mean([w.lower() in stopwords_combined for w in words]) if words else 0

  return [
    length, num_hashtags, num_mentions, num_urls,
    num_emojis, num_upper, num_punct, avg_word_len,
    is_question, is_exclamatory, contains_ellipsis,
    contains_repeated_chars, contains_short_link,
    contains_digit, is_all_caps, is_emoji_only,
    contains_quote_or_rt, word_count, stopword_ratio
  ]

### Generate structural features of each cleaned content

In [133]:
import json
import numpy as np
from sklearn.preprocessing import StandardScaler
concat_scaler = StandardScaler()
props_scaler = StandardScaler()
with open(INDOBERT_REDUCED_OUT_FILE, "r", encoding="utf-8") as file:
  indobert_reduced_documents = [json.loads(line) for line in file]

structural_properties = [extract_structural_features(doc["content"]) for doc in indobert_reduced_documents]

for doc, props in zip(indobert_reduced_documents, structural_properties):
  doc["structural_property"] = props
  doc["concatenated_features"] = np.concatenate([np.array(doc["embedding"]), np.array(props) * 2])

props_scaled = props_scaler.fit_transform(np.array(structural_properties))
concat_scaled = concat_scaler.fit_transform(
    np.array([doc["concatenated_features"] for doc in indobert_reduced_documents])
)
for doc, scaled_prop, scaled_concat in zip(indobert_reduced_documents, props_scaled, concat_scaled):
  doc["structural_property"] = scaled_prop.tolist()
  doc["concatenated_features"] = scaled_concat.tolist()
  
removeFile(INDOBERT_NORMALIZED)
with open(INDOBERT_NORMALIZED, "a", encoding="utf-8") as file:
  for doc in indobert_reduced_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")




OUT/indobert_normalized.jsonl deleted.


In [134]:
import json
import numpy as np
from sklearn.preprocessing import StandardScaler
concat_scaler = StandardScaler()
props_scaler = StandardScaler()
with open(TWEET_REDUCED_OUT_FILE, "r", encoding="utf-8") as file:
  indobert_reduced_documents = [json.loads(line) for line in file]

structural_properties = [extract_structural_features(doc["content"]) for doc in indobert_reduced_documents]

for doc, props in zip(indobert_reduced_documents, structural_properties):
  doc["structural_property"] = props
  doc["concatenated_features"] = np.concatenate([np.array(doc["embedding"]), np.array(props) * 2])

props_scaled = props_scaler.fit_transform(np.array(structural_properties))
concat_scaled = concat_scaler.fit_transform(
    np.array([doc["concatenated_features"] for doc in indobert_reduced_documents])
)
for doc, scaled_prop, scaled_concat in zip(indobert_reduced_documents, props_scaled, concat_scaled):
  doc["structural_property"] = scaled_prop.tolist()
  doc["concatenated_features"] = scaled_concat.tolist()
  
removeFile(TWEET_NORMALIZED)
with open(TWEET_NORMALIZED, "a", encoding="utf-8") as file:
  for doc in indobert_reduced_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")

OUT/indobertweet_normalized.jsonl deleted.


### Utilize KMeans and generate buckets on indobert
1. KMeans + IndoBERT Embeddings
2. KMeans + IndoBERT Embeddings + Structure Properties
3. KMeans + Structure Properties

In [135]:
def save_clustered_docs(docs, cluster_labels, output_path):
    for doc, label in zip(docs, cluster_labels):
        doc["bucket_label"] = int(label)
    for doc in docs:
        del doc["__v"]
        del doc["_id"]
        if "embedding" in doc:
            del doc["embedding"]
        if "structural_property" in doc:
            del doc["structural_property"]
        if "concatenated_features" in doc:
            del doc["concatenated_features"]
            
    # Sort by label for better organization (optional)
    docs_sorted = sorted(docs, key=lambda x: x["bucket_label"])
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs_sorted, f, ensure_ascii=False, indent=2)


In [136]:
from sre_parse import Verbose
from sklearn.cluster import KMeans
import json
import matplotlib.pyplot as plt
import numpy as np
import copy

with open(INDOBERT_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
kmeans_embed = KMeans(n_clusters=10, random_state=42).fit(embeds)

# 2. Embeddings + Props
kmeans_concat = KMeans(n_clusters=10, random_state=42).fit(concats)

# 3. Props only
kmeans_props = KMeans(n_clusters=10, random_state=42).fit(properties)


# Cluster label predictions
labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_
# Save output files
print(len(labels_embed), len(labels_concat), len(labels_props))
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERT_KMEANS_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERT_KMEANS_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERT_KMEANS_PROPS)

15364 15364 15364


### Utilize KMeans and generate buckets on indobertweet
1. KMeans + IndoBERTweet Embeddings
2. KMeans + IndoBERTweet Embeddings + Structure Properties


In [137]:
from sklearn.cluster import KMeans
import json
import matplotlib.pyplot as plt
import numpy as np
import copy


with open(TWEET_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
kmeans_embed = KMeans(n_clusters=10, random_state=42).fit(embeds)

# 2. Embeddings + Props
kmeans_concat = KMeans(n_clusters=10, random_state=42).fit(concats)

# 3. Props only
kmeans_props = KMeans(n_clusters=10, random_state=42).fit(properties)

# Cluster label predictions
labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_
# Save output files
print(len(labels_embed), len(labels_concat), len(labels_props))
assert len(labels_embed) == len(normalized_indobert_documents)

save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERTWEET_KMEANS_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERTWEET_KMEANS_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERTWEET_KMEANS_PROPS)

15364 15364 15364


### Utilize HDBSCAN and generate buckets on IndoBERT
1. HDBSCAN + IndoBERT Embeddings
2. HDBSCAN + IndoBERT Embeddings + Structure Properties


In [176]:
import json
import copy
import hdbscan

with open(INDOBERT_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
kmeans_embed = hdbscan.HDBSCAN(min_cluster_size=105, metric="euclidean").fit(embeds)

# 2. Embeddings + Props
kmeans_concat = hdbscan.HDBSCAN(min_cluster_size=18, metric="euclidean").fit(concats)

# 3. Props only
kmeans_props = hdbscan.HDBSCAN(min_cluster_size=30, metric="euclidean").fit(properties)

# Cluster label predictions
labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_
# Save output files
print(len(labels_embed), len(labels_concat), len(labels_props))
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERT_HDBSCAN_EMBED)
# save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERT_HDBSCAN_CONCAT)
# save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERT_HDBSCAN_PROPS)



15364 15364 15364


In [164]:
import json
import copy
import hdbscan

with open(TWEET_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only #before waas  18
kmeans_embed = hdbscan.HDBSCAN(min_cluster_size=34, metric="euclidean").fit(embeds)

# 2. Embeddings + Props #before was 17
kmeans_concat = hdbscan.HDBSCAN(min_cluster_size=25, metric="euclidean").fit(concats)

# 3. Props only
kmeans_props = hdbscan.HDBSCAN(min_cluster_size=14, metric="euclidean").fit(properties)

# Cluster label predictions
labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_
# Save output files
print(len(labels_embed), len(labels_concat), len(labels_props))
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERTWEET_HDBSCAN_EMBED)
# save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERTWEET_HDBSCAN_CONCAT)
# save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERTWEET_HDBSCAN_PROPS)



15364 15364 15364


In [178]:
from sklearn.metrics import silhouette_score
import json
import pandas as pd
import numpy as np

def gini_coefficient(array):
    """Compute Gini coefficient of array of values."""
    # Based on mean absolute difference formula
    array = np.array(array, dtype=np.float64)
    if np.amin(array) < 0:
        array -= np.amin(array)  # Ensure non-negative
    array += 1e-10  # Avoid division by zero
    array = np.sort(array)
    n = array.size
    cumvals = np.cumsum(array)
    gini = (n + 1 - 2 * np.sum(cumvals) / cumvals[-1]) / n
    return gini

def evaluate_gini(bucketed):
    with open(bucketed, "r") as file:
        labels_data = pd.DataFrame.from_dict(json.load(file))
    labels = labels_data["bucket_label"].to_numpy()

    # Exclude noise points (-1)
    valid_labels = labels[labels >= 0]

    counts = pd.Series(valid_labels).value_counts().sort_index()
    gini = gini_coefficient(counts.values)

    print(f"{bucketed} Gini Coefficient of cluster sizes: {gini:.4f}")
    print(f"Cluster counts: {counts.to_dict()}")

def get_silhouette_score_embedding(bucketed, model_features):
    with open(bucketed, "r") as file:
        labels_data = pd.DataFrame.from_dict(json.load(file))
    with open(model_features, "r") as file:
        features_data = pd.DataFrame.from_dict([json.loads(doc) for doc in file])

    merged_df = pd.merge(labels_data, features_data, on='tweet_id', how='inner')

    features_embedding = np.stack(merged_df["embedding"].to_numpy())
    labels = merged_df["bucket_label"].to_numpy()

    embedding_score = silhouette_score(features_embedding, labels, metric='euclidean')
    print(f"{bucketed} Silhouette Score: {embedding_score:.4f}")

def get_silhouette_score_properties(bucketed, model_features):
    with open(bucketed, "r") as file:
        labels_data = pd.DataFrame.from_dict(json.load(file))
    with open(model_features, "r") as file:
        features_data = pd.DataFrame.from_dict([json.loads(doc) for doc in file])

    merged_df = pd.merge(labels_data, features_data, on='tweet_id', how='inner')

    features_properties = np.stack(merged_df["structural_property"].to_numpy())
    labels = merged_df["bucket_label"].to_numpy()

    # valid_mask = labels >= 0
    # features_properties_valid = features_properties[valid_mask]
    # labels_valid = labels[valid_mask]

    properties_score = silhouette_score(features_properties, labels, metric='euclidean')
    print(f"{bucketed} Silhouette Score: {properties_score:.4f}")

def get_silhouette_score_concat(bucketed, model_features):
    with open(bucketed, "r") as file:
        labels_data = pd.DataFrame.from_dict(json.load(file))
    with open(model_features, "r") as file:
        features_data = pd.DataFrame.from_dict([json.loads(doc) for doc in file])

    merged_df = pd.merge(labels_data, features_data, on='tweet_id', how='inner')

    features_concat = np.stack(merged_df["concatenated_features"].to_numpy())
    labels = merged_df["bucket_label"].to_numpy()

    concat_score = silhouette_score(features_concat, labels, metric='euclidean')
    print(f"{bucketed} Silhouette Score: {concat_score:.4f}")


tests = [
  ["IndoBERT HDBSCAN + Concat", INDOBERT_HDBSCAN_CONCAT, INDOBERT_NORMALIZED, get_silhouette_score_concat],
  ["IndoBERT HDBSCAN + Embed",  INDOBERT_HDBSCAN_EMBED, INDOBERT_NORMALIZED, get_silhouette_score_embedding],
  ["IndoBERT KMeans + Concat",  INDOBERT_KMEANS_CONCAT, INDOBERT_NORMALIZED, get_silhouette_score_concat],
  ["IndoBERT KMeans + Embed",   INDOBERT_KMEANS_EMBED,  INDOBERT_NORMALIZED, get_silhouette_score_embedding],
  ["IndoBERTweet HDBSCAN + Concat", INDOBERTWEET_HDBSCAN_CONCAT, TWEET_NORMALIZED, get_silhouette_score_concat],
  ["IndoBERTweet HDBSCAN + Embed",  INDOBERTWEET_HDBSCAN_EMBED,  TWEET_NORMALIZED, get_silhouette_score_embedding],
  ["IndoBERTweet KMeans + Concat",  INDOBERTWEET_KMEANS_CONCAT, TWEET_NORMALIZED, get_silhouette_score_concat],
  ["IndoBERTweet KMeans + Embed",   INDOBERTWEET_KMEANS_EMBED,  TWEET_NORMALIZED, get_silhouette_score_embedding],
  ["KMeans + Properties", INDOBERTWEET_KMEANS_PROPS, TWEET_NORMALIZED, get_silhouette_score_properties],
  ["HDBSCAN + Properties", INDOBERTWEET_HDBSCAN_PROPS, TWEET_NORMALIZED, get_silhouette_score_properties]
]

print("Running silhouette score and gini coefficient tests...\n")
for name, label_file, feature_file, scorer in tests:
    print(f"--- {name} ---")
    scorer(label_file, feature_file)
    evaluate_gini(label_file)
    print()


Running silhouette score and gini coefficient tests...

--- IndoBERT HDBSCAN + Concat ---
OUT/indobert/indobert-hdbscan-concat.json Silhouette Score: 0.1186
OUT/indobert/indobert-hdbscan-concat.json Gini Coefficient of cluster sizes: 0.8460
Cluster counts: {0: 22, 1: 33, 2: 220, 3: 460, 4: 30, 5: 347, 6: 65, 7: 191, 8: 13606, 9: 86}

--- IndoBERT HDBSCAN + Embed ---
OUT/indobert/indobert-hdbscan-embed.json Silhouette Score: 0.2125
OUT/indobert/indobert-hdbscan-embed.json Gini Coefficient of cluster sizes: 0.5369
Cluster counts: {0: 507, 1: 225, 2: 302, 3: 3339, 4: 1504, 5: 122, 6: 323, 7: 367, 8: 268, 9: 348, 10: 801, 11: 2795}

--- IndoBERT KMeans + Concat ---
OUT/indobert/indobert-kmeans-concat.json Silhouette Score: 0.2859
OUT/indobert/indobert-kmeans-concat.json Gini Coefficient of cluster sizes: 0.3392
Cluster counts: {0: 3242, 1: 3022, 2: 758, 3: 1531, 4: 1285, 5: 546, 6: 1486, 7: 476, 8: 776, 9: 2242}

--- IndoBERT KMeans + Embed ---
OUT/indobert/indobert-kmeans-embed.json Silho

In [179]:
from sklearn.metrics import davies_bouldin_score
import json
import pandas as pd
import numpy as np

# ──────────────────────────────────────────────────────────────────────────────
# DBI SCORERS
# ──────────────────────────────────────────────────────────────────────────────
def dbi_embedding(bucketed: str, model_features: str) -> None:
    """
    Compute the Davies–Bouldin Index using only the embedding vectors.
    """
    labels_df   = pd.read_json(bucketed)
    labels_df["tweet_id"] = labels_df["tweet_id"].astype(str)
    
    features_df = pd.DataFrame([json.loads(line) for line in open(model_features)])

    merged_df   = pd.merge(labels_df, features_df, on="tweet_id", how="inner")
    X           = np.stack(merged_df["embedding"].to_numpy())
    y           = merged_df["bucket_label"]

    score = davies_bouldin_score(X, y)
    print(f"{bucketed} DBI (Embedding): {score:.4f}")

def dbi_properties(bucketed: str, model_features: str) -> None:
    """
    Compute the Davies–Bouldin Index on structural-property feature vectors.
    """
    labels_df   = pd.read_json(bucketed)
    labels_df["tweet_id"] = labels_df["tweet_id"].astype(str)
    features_df = pd.DataFrame([json.loads(line) for line in open(model_features)])

    merged_df   = pd.merge(labels_df, features_df, on="tweet_id", how="inner")
    X           = np.stack(merged_df["structural_property"].to_numpy())
    y           = merged_df["bucket_label"]

    score = davies_bouldin_score(X, y)
    print(f"{bucketed} DBI (Properties): {score:.4f}")

def dbi_concat(bucketed: str, model_features: str) -> None:
    """
    Compute the Davies–Bouldin Index on concatenated feature vectors.
    """
    labels_df   = pd.read_json(bucketed)
    labels_df["tweet_id"] = labels_df["tweet_id"].astype(str)
    features_df = pd.DataFrame([json.loads(line) for line in open(model_features)])

    merged_df   = pd.merge(labels_df, features_df, on="tweet_id", how="inner")
    X           = np.stack(merged_df["concatenated_features"].to_numpy())
    y           = merged_df["bucket_label"]

    score = davies_bouldin_score(X, y)
    print(f"{bucketed} DBI (Concat): {score:.4f}")

# ──────────────────────────────────────────────────────────────────────────────
# TEST MATRIX
# ──────────────────────────────────────────────────────────────────────────────
tests = [
    ["IndoBERT HDBSCAN + Concat",  INDOBERT_HDBSCAN_CONCAT,  INDOBERT_NORMALIZED, dbi_concat],
    ["IndoBERT HDBSCAN + Embed",   INDOBERT_HDBSCAN_EMBED,   INDOBERT_NORMALIZED, dbi_embedding],
    ["IndoBERT KMeans  + Concat",  INDOBERT_KMEANS_CONCAT,   INDOBERT_NORMALIZED, dbi_concat],
    ["IndoBERT KMeans  + Embed",   INDOBERT_KMEANS_EMBED,    INDOBERT_NORMALIZED, dbi_embedding],
    ["IndoBERTweet HDBSCAN + Concat", INDOBERTWEET_HDBSCAN_CONCAT, TWEET_NORMALIZED, dbi_concat],
    ["IndoBERTweet HDBSCAN + Embed",  INDOBERTWEET_HDBSCAN_EMBED,  TWEET_NORMALIZED, dbi_embedding],
    ["IndoBERTweet KMeans  + Concat", INDOBERTWEET_KMEANS_CONCAT,  TWEET_NORMALIZED, dbi_concat],
    ["IndoBERTweet KMeans  + Embed",  INDOBERTWEET_KMEANS_EMBED,   TWEET_NORMALIZED, dbi_embedding],
    ["KMeans  + Properties",          INDOBERTWEET_KMEANS_PROPS,   TWEET_NORMALIZED, dbi_properties],
    ["HDBSCAN + Properties",          INDOBERTWEET_HDBSCAN_PROPS,  TWEET_NORMALIZED, dbi_properties],
]

print("Running Davies–Bouldin Index tests...\n")
for name, label_file, feature_file, scorer in tests:
    scorer(label_file, feature_file)


Running Davies–Bouldin Index tests...

OUT/indobert/indobert-hdbscan-concat.json DBI (Concat): 1.1871
OUT/indobert/indobert-hdbscan-embed.json DBI (Embedding): 1.0515
OUT/indobert/indobert-kmeans-concat.json DBI (Concat): 1.1228
OUT/indobert/indobert-kmeans-embed.json DBI (Embedding): 0.8676
OUT/indobertweet/indobertweet-hdbscan-concat.json DBI (Concat): 1.2532
OUT/indobertweet/indobertweet-hdbscan-embed.json DBI (Embedding): 0.4677
OUT/indobertweet/indobertweet-kmeans-concat.json DBI (Concat): 1.3443
OUT/indobertweet/indobertweet-kmeans-embed.json DBI (Embedding): 0.8790
OUT/model-agnostic/mixed-kmeans-props.json DBI (Properties): 1.1380
OUT/model-agnostic/mixed-hdbscan-props.json DBI (Properties): 0.8193


### Do a bit of bucket analization

In [183]:
# Utility functions
from collections import Counter
import re
def count_hashtags(text):
    return len(re.findall(r"#\w+", text))

def hashtag_ratio(text):
    hashtags = ''.join(re.findall(r"#\w+", text))
    return len(hashtags) / len(text) if len(text) > 0 else 0

def extract_emojis(text):
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
    return emoji_pattern.findall(text)

def emoji_ratio(text):
    emojis = extract_emojis(text)
    return len(''.join(emojis)) / len(text) if len(text) > 0 else 0

def url_ratio(tweets):
    return sum(1 for t in tweets if re.search(r"http\S+", t)) / len(tweets)

def mention_ratio(tweets):
    return sum(1 for t in tweets if re.search(r"@\w+", t)) / len(tweets)

def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0

def repeated_char_abuse(text):
    return bool(re.search(r"(.)\1{3,}", text))

def duplicate_ratio(tweets):
    freq = Counter(tweets)
    return sum(count for tweet, count in freq.items() if count > 1) / len(tweets)


In [184]:
results = {}
from collections import defaultdict
import json
buckets = defaultdict(list)

with open(INDOBERT_KMEANS_EMBED, "r") as file:
  documents = json.load(file)

for doc in documents:
  buckets[doc["bucket_label"]].append(doc)

for label, bucket_tweets in buckets.items():
    bucket_tweets = [doc["content"] for doc in bucket_tweets]
    
    hashtag_counts = [count_hashtags(t) for t in bucket_tweets]
    tweet_lengths = [len(t) for t in bucket_tweets]
    hashtag_ratios = [hashtag_ratio(t) for t in bucket_tweets]
    emoji_ratios = [emoji_ratio(t) for t in bucket_tweets]
    lexical_divs = [lexical_diversity(t) for t in bucket_tweets]
    repeated_abuse_count = sum(1 for t in bucket_tweets if repeated_char_abuse(t))

    avg_hashtags = sum(hashtag_counts) / len(bucket_tweets)
    avg_length = sum(tweet_lengths) / len(bucket_tweets)
    avg_hashtag_ratio = sum(hashtag_ratios) / len(bucket_tweets)
    avg_emoji_ratio = sum(emoji_ratios) / len(bucket_tweets)
    avg_lexical_div = sum(lexical_divs) / len(bucket_tweets)
    url_ratio_val = url_ratio(bucket_tweets)
    mention_ratio_val = mention_ratio(bucket_tweets)
    dup_ratio = duplicate_ratio(bucket_tweets)

    # Heuristic label tagging
    label_tags = []
    if avg_length > 200:
        label_tags.append("long tweets")
    if avg_hashtag_ratio > 0.4:
        label_tags.append("hashtag-heavy")
    if avg_emoji_ratio > 0.2:
        label_tags.append("emoji spam")
    if url_ratio_val > 0.3:
        label_tags.append("link drop")
    if mention_ratio_val > 0.3:
        label_tags.append("mention spam")
    if avg_lexical_div < 0.4:
        label_tags.append("low diversity (copypasta)")
    if repeated_abuse_count / len(bucket_tweets) > 0.3:
        label_tags.append("repeated char abuse")
    if dup_ratio > 0.3:
        label_tags.append("high duplication")

    longest = max(bucket_tweets, key=len)
    shortest = min(bucket_tweets, key=len)

    results[label] = {
        "label": ", ".join(label_tags) if label_tags else "generic",
        "avg_hashtags": avg_hashtags,
        "avg_length": avg_length,
        "longest_tweet": longest,
        "shortest_tweet": shortest,
        "hashtag_ratio": avg_hashtag_ratio,
        "emoji_ratio": avg_emoji_ratio,
        "url_ratio": url_ratio_val,
        "mention_ratio": mention_ratio_val,
        "lexical_diversity": avg_lexical_div,
        "duplication_ratio": dup_ratio,
        "repeated_char_abuse_count": repeated_abuse_count,
    }

# Output the results
for label, metrics in results.items():
    print(f"\nCluster {label}: {metrics['label']}")
    for k, v in metrics.items():
        if k != 'label':
            print(f"  {k}: {v}")


Cluster 0: generic
  avg_hashtags: 0.2755244755244755
  avg_length: 70.04568764568765
  longest_tweet: sorry bgt ye gw bukannya tone deaf atau apalah cuma badan gw sakit sakitan puki. semalem sakin overwhelmed nya seharian ngikutin berita ruu tni, ditambah timnas kalah, ditambah berita yang ga udah udah gw muntah muntah tengah malem sampe ga kuat makan lagi sakin lemesnya
  shortest_tweet: pap
  hashtag_ratio: 0.05821706207157399
  emoji_ratio: 0.0
  url_ratio: 0.0
  mention_ratio: 0.0
  lexical_diversity: 0.9720739969124486
  duplication_ratio: 0.0
  repeated_char_abuse_count: 0

Cluster 1: generic
  avg_hashtags: 0.8354037267080745
  avg_length: 188.0
  longest_tweet: selain kempen derma darah, turut diadakan demonstrasi pertolongan cemas cardiopulmonary resuscitation (cpr) serta penggunaan automated external defibrillator (aed) dalam program pada 8 disember lalu ini.nabalunews.comkerjasama komuniti, kerajaan dan swasta: kempen derma darah berjaya dilaksanakan di kampung..11 disembe

In [185]:
import json
import random
import math
from collections import defaultdict
# Parameters
INPUT_FILE = INDOBERT_KMEANS_EMBED
OUTPUT_FILE = 'out/labelstudio-training-sampled.json'

TOTAL_SAMPLE = 1100  # Change this as needed

# Load data
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    tweets = json.load(f)
    
bucket_content_count = defaultdict(int)
bucket_ratio_count = defaultdict(float)

total_tweet = 0
for tweet in tweets:
  label = tweet["bucket_label"]
  bucket_content_count[label] += 1
  total_tweet += 1

for label, bucket_tweet_count in bucket_content_count.items():
  bucket_ratio_count[label] = bucket_tweet_count / total_tweet

# Group tweets by bucket
buckets = defaultdict(list)
for tweet in tweets:
  bucket_label = str(tweet["bucket_label"])
  buckets[bucket_label].append(tweet)

# Sample tweets
sampled_tweets = []
for (bucket_label, tweets_in_bucket), ratio in zip(buckets.items(), bucket_ratio_count.values()):
  # ratiod_total = math.ceil(TOTAL_SAMPLE * ratio)
  ratiod_total = int(TOTAL_SAMPLE / bucket_content_count.__len__())
  print(f"Bucket {bucket_label} sampled with ratio {ratio} and total sample {ratiod_total}")
  if len(tweets_in_bucket) < ratiod_total:
    print(f"Warning: Bucket '{bucket_label}' has only {len(tweets_in_bucket)} tweets. Sampling all.")
    sampled = tweets_in_bucket
  else:
    sampled = random.sample(tweets_in_bucket, ratiod_total)
  sampled_tweets.extend(sampled)

# Save to output JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
  json.dump(sampled_tweets, f, ensure_ascii=False, indent=2)

print(f"Sampled tweets saved to {OUTPUT_FILE}")


Bucket 0 sampled with ratio 0.1396120801874512 and total sample 110
Bucket 1 sampled with ratio 0.041916167664670656 and total sample 110
Bucket 2 sampled with ratio 0.19617287164800834 and total sample 110
Bucket 3 sampled with ratio 0.10205675605311117 and total sample 110
Bucket 4 sampled with ratio 0.13362405623535537 and total sample 110
Bucket 5 sampled with ratio 0.16512626920072898 and total sample 110
Bucket 6 sampled with ratio 0.03306430617026816 and total sample 110
Bucket 7 sampled with ratio 0.021218432699817755 and total sample 110
Bucket 8 sampled with ratio 0.08532934131736528 and total sample 110
Bucket 9 sampled with ratio 0.08187971882322312 and total sample 110
Sampled tweets saved to out/labelstudio-training-sampled.json


### Check for data leakage

In [186]:
import json

def load_ids_from_json_or_jsonl(file_path, id_key="tweet_id"):
    ids = set()
    with open(file_path, "r", encoding="utf-8") as f:
        first_char = f.read(1)
        f.seek(0)
        if first_char == "[":  # JSON array
            data = json.load(f)
            ids = {entry[id_key] for entry in data if id_key in entry}
        else:  # JSONL
            for line in f:
                try:
                    obj = json.loads(line)
                    if id_key in obj:
                        ids.add(obj[id_key])
                except json.JSONDecodeError:
                    continue
    return ids

def check_data_leakage(file1, file2, id_key="tweet_id"):
    ids_1 = load_ids_from_json_or_jsonl(file1, id_key)
    ids_2 = load_ids_from_json_or_jsonl(file2, id_key)

    intersection = ids_1 & ids_2

    if intersection:
        print(f"⚠️ Data leakage detected! {len(intersection)} shared {id_key}s.")
    else:
        print("✅ No data leakage detected.")

    return intersection

# Example usage:
file_a = "out/labelstudio-training-sampled.json"
file_b = "out/golden_standard.json"
leaked_ids = check_data_leakage(file_a, file_b)


✅ No data leakage detected.


### Convert to a Label Studio Processable data

In [187]:
import json

def convert_to_label_studio_format(raw_data):
    converted = []
    for entry in raw_data:
        new_entry = {
            "data": {
                "text": entry["content"],
                "bucket_label" : entry["bucket_label"] if entry.get("bucket_label") is not None else -10
            },
            "meta": {k: v for k, v in entry.items() if k != "content" and k != "bucket_label"}
        }
        converted.append(new_entry)
    return converted


In [188]:
import json
with open("out/labelstudio-training-sampled.json", "r") as file:
  training_documents = json.load(file)
parsed_training_documents = convert_to_label_studio_format(training_documents)
with open("out/labelstudio/p1/p1_training_prepped.json", "w", encoding="utf-8") as file:
  json.dump(parsed_training_documents, file, ensure_ascii=False, indent=2)

with open("out/golden_standard.json", "r") as file:
  training_documents = json.load(file)
  
parsed_training_documents = convert_to_label_studio_format(training_documents)
with open("out/labelstudio/golden_standard_prepped.json", "w", encoding="utf-8") as file:
  json.dump(parsed_training_documents, file, ensure_ascii=False, indent=2)
  