In [225]:
import json
import pandas as pd
from sympy import O
OUT_DIR ="OUT/"
ASSET_DIR="assets/"
with open(ASSET_DIR + "dump-formatted.json", "r", encoding="utf-8") as file:
  RAW_DOCUMENTS = json.load(file)
with open(OUT_DIR + "golden-standard.json", "r", encoding="utf-8") as file:
  GOLDEN_STANDARD = json.load(file)

print("Loaded documents:", len(RAW_DOCUMENTS))
print("Loaded golden standard:", len(GOLDEN_STANDARD))

INDOBERT_OUT_FILE = OUT_DIR + "indobert_embeds.jsonl"
INDOBERTWEET_OUT_FILE = OUT_DIR + "indobertweet_embeds.jsonl"
INDOBERT_REDUCED_OUT_FILE = OUT_DIR + "indobert_reduced_embeds.jsonl"
TWEET_REDUCED_OUT_FILE = OUT_DIR + "indobertweet_reduced_embeds.jsonl"
INDOBERT_NORMALIZED = OUT_DIR + "indobert_normalized.jsonl"
TWEET_NORMALIZED = OUT_DIR + "indobertweet_normalized.jsonl"

INDOBERT_KMEANS_EMBED = OUT_DIR + "indobert/indobert-kmeans-embed.json"
INDOBERT_KMEANS_PROPS = OUT_DIR + "model-agnostic/mixed-kmeans-props.json"
INDOBERT_KMEANS_CONCAT = OUT_DIR + "indobert/indobert-kmeans-concat.json"
INDOBERTWEET_KMEANS_EMBED = OUT_DIR + "indobertweet/indobertweet-kmeans-embed.json"
INDOBERTWEET_KMEANS_PROPS = OUT_DIR + "model-agnostic/mixed-kmeans-props.json"
INDOBERTWEET_KMEANS_CONCAT = OUT_DIR + "indobertweet/indobertweet-kmeans-concat.json"

INDOBERT_HDBSCAN_EMBED = OUT_DIR + "indobert/indobert-hdbscan-embed.json"
INDOBERT_HDBSCAN_PROPS = OUT_DIR + "model-agnostic/mixed-hdbscan-props.json"
INDOBERT_HDBSCAN_CONCAT = OUT_DIR + "indobert/indobert-hdbscan-concat.json"
INDOBERTWEET_HDBSCAN_EMBED = OUT_DIR + "indobertweet/indobertweet-hdbscan-embed.json"
INDOBERTWEET_HDBSCAN_PROPS = OUT_DIR + "model-agnostic/mixed-hdbscan-props.json"
INDOBERTWEET_HDBSCAN_CONCAT = OUT_DIR + "indobertweet/indobertweet-hdbscan-concat.json"

Loaded documents: 201583
Loaded golden standard: 1100


### Text Cleaning Function

In [226]:
import re, unicodedata, jaconv, emoji

# ─── pre-compiled patterns ────────────────────────────────────────────────
_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_REPEAT   = re.compile(r'(.)\1{2,}')       # ≥3 of same char
_WS       = re.compile(r'\s+')

# remove from the first token that *begins* with “kutipan” (any case) to the string-end
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

def cleantext(text: str) -> str:
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)

    text = text.replace('\\n', ' ').replace('\\r', ' ')
    text = _URL.sub(' <url> ', text)
    text = _MENTION.sub(' ', text)
    text = re.sub(r'^rt\s+', '', text, flags=re.I)
    text = re.sub(r'(\b\d{4})(?=[a-zA-Z])', r'\1 ', text)

    # ⇣ one liner does all the “kutipan” work; the old _KUTI_BEF/_KUTI_AFT are no longer needed
    text = _KUTI_CUT.sub('', text)

    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = _REPEAT.sub(r'\1\1', text)
    text = _WS.sub(' ', text).strip().lower()
    return text

### Apply cleaning function

In [227]:
#Remove golden standard from the raw documents
GOLDEN_STANDARD_DF = pd.DataFrame(GOLDEN_STANDARD)
DOCUMENT_DF = pd.DataFrame(RAW_DOCUMENTS)

DOCUMENT_DF["content"] = DOCUMENT_DF["content"].apply(cleantext)
DOCUMENT_DF.drop_duplicates(subset=["content"], inplace=True)

#Iterate through document df and remove golden standard
for index, row in GOLDEN_STANDARD_DF.iterrows():
  if row["text"] in DOCUMENT_DF["content"].values:
    DOCUMENT_DF = DOCUMENT_DF[DOCUMENT_DF["content"] != row["text"]]
    
print("Remaining documents after removing golden standard:", len(DOCUMENT_DF))

Remaining documents after removing golden standard: 152551


#### Check for leaks between

In [228]:
## Check for leaks between golden standard and documents
GOLDEN_STANDARD_CONTENT = set(GOLDEN_STANDARD_DF["text"].values)
LEAKS = DOCUMENT_DF[DOCUMENT_DF["content"].isin(GOLDEN_STANDARD_CONTENT)]
if not LEAKS.empty:
    print("Leaks found between golden standard and documents:")
    print(LEAKS[["tweet_id", "content"]])
else:
    print("No leaks found between golden standard and documents.")

No leaks found between golden standard and documents.


### Generate splits and golden standard

In [229]:
from sklearn.model_selection import train_test_split
import json


DOCUMENT_DF = DOCUMENT_DF.drop_duplicates(subset=["content"]).reset_index(drop=True)
print(DOCUMENT_DF)
TRAIN_DF, TEST_DF = train_test_split(
  DOCUMENT_DF,
  test_size=0.90,
  random_state=42,
)
print("Final dataset size for model training:", len(TRAIN_DF))

with open("out/training_split_general.json", "w") as file:
  json.dump(TRAIN_DF.to_dict(orient="records"),file, ensure_ascii=False, indent=2)

                             _id             tweet_id                 time  \
0       6822dc79c7778784da9569f0  1911247544715514098  2025-04-13T02:38:18   
1       6822dc79c7778784da9569f1  1915217941312033078  2025-04-24T01:35:14   
2       6822dc83c7778784da9569f3  1912379519576731805  2025-04-16T05:36:22   
3       6822dc83c7778784da9569f4  1916851056434164088  2025-04-28T13:44:39   
4       6822dc8dc7778784da9569f6  1914923849457492430  2025-04-23T06:06:37   
...                          ...                  ...                  ...   
152546  68360fa9dc3752db9ae74d42  1669665839547822081  2023-06-16T11:18:51   
152547  68360fabdc3752db9ae74d47  1669660827341963270  2023-06-16T10:58:56   
152548  68360faddc3752db9ae74d4c  1669653586920484865  2023-06-16T10:30:10   
152549  68360faddc3752db9ae74d4d  1669650029014044674  2023-06-16T10:16:02   
152550  68360fb8dc3752db9ae74d51  1669645998317248519  2023-06-16T10:00:01   

                  author                                       

### Extract hashtags

In [230]:
import json
from collections import Counter
from typing import List, Hashable, Optional
with open("out/training_split_general.json", "r") as file:
  documents = json.load(file)
texts = [doc["content"] for doc in documents]

hashtags = []

for text in texts:
  text_split = text.split(" ")
  for token in text_split:
    if token.startswith("#"):
      hashtags.append(token)

def most_common_hashtags(
    tags: List[Hashable],
    *,
    top_n: Optional[int] = None,
    min_count: Optional[int] = None,
) -> List[Hashable]:
    if top_n is None and min_count is None:
        raise ValueError("Specify either top_n or min_count")

    freq = Counter(tags)
    # Sort once by (-count, tag) so result is deterministic for ties
    ranked = sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))

    if top_n is not None:
        selected = ranked[:top_n]
    else:
        selected = [kv for kv in ranked if kv[1] >= min_count]

    return [tag for tag, _ in selected]


cleaned_hashtags = most_common_hashtags(hashtags, min_count=15)

print("Total hashtags found:", len(hashtags))
print("Total cleaned hashtags:", len(cleaned_hashtags))

with open("out/hashtag_list.json", "w", encoding="utf-8") as file:
  json.dump(cleaned_hashtags, file, ensure_ascii=False, indent=2)

Total hashtags found: 30783
Total cleaned hashtags: 102


### Initialize all models and tokenizers from IndoBERT and IndoBERTweet

In [231]:
from transformers import AutoModel, AutoTokenizer
import torch
indobert_model = AutoModel.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")
indobert_tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p2", cache_dir="cache/")

tweet_model = AutoModel.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/")
tweet_tokenizer = AutoTokenizer.from_pretrained("indolem/indobertweet-base-uncased", cache_dir="cache/")

indobert_tokenizer.add_tokens(cleaned_hashtags)
tweet_tokenizer.add_tokens(cleaned_hashtags)

indobert_model.resize_token_embeddings(len(indobert_tokenizer))
tweet_model.resize_token_embeddings(len(tweet_tokenizer))

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

indobert_model = indobert_model.to(device)
tweet_model = tweet_model.to(device)

#Turn on evaluation mode as default
indobert_model.eval()
tweet_model.eval()


Using device: mps


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32025, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

### Create functions to get encodings for both indobert and indobertweet

In [232]:
import torch
def getEncodings(textArray):
  indobert_inputs = indobert_tokenizer(
    textArray,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt"
  )
  tweet_inputs = tweet_tokenizer(
    textArray,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors="pt"
  )
  indobert_inputs = {k: v.to("mps") for k, v in indobert_inputs.items()}
  tweet_inputs = {k: v.to("mps") for k, v in tweet_inputs.items()}
  with torch.no_grad():
    indobert_outputs = indobert_model(**indobert_inputs)
    tweet_outputs = tweet_model(**tweet_inputs)
    
    
  indobert_embeddings = indobert_outputs.last_hidden_state[:, 0, :]
  tweet_embeddings = tweet_outputs.last_hidden_state[:, 0, :]
  return (indobert_embeddings.cpu().numpy(), tweet_embeddings.cpu().numpy())
  

In [233]:
import os

def removeFile(file_path):
  if os.path.exists(file_path):
    os.remove(file_path)
    print(f"{file_path} deleted.")
  else:
    print(f"{file_path} does not exist.")

In [234]:
from tqdm import tqdm
import json
def get_batch_embeddings(documents, batch_size=32):
  #Document is in the shape of a Pandas Dataframe. Convert to a list first before processing
  removeFile(INDOBERT_OUT_FILE)
  removeFile(INDOBERTWEET_OUT_FILE)
  documents_list = documents.to_dict(orient="records")
  for i in tqdm(range(0, len(documents_list), batch_size), desc="Generating embeddings"):
    batched = documents_list[i:i+batch_size]
    texts = [doc["content"] for doc in batched]
    indobert_embedding, tweet_embedding = getEncodings(texts)
    
    with open(INDOBERT_OUT_FILE, "a", encoding="utf-8") as file:
      for doc, embed in zip(batched, indobert_embedding):
        doc_copy = doc.copy()
        doc_copy["embedding"] = embed.tolist()
        file.write(json.dumps(doc_copy, ensure_ascii=False)+ "\n")
        
    with open(INDOBERTWEET_OUT_FILE, "a", encoding="utf-8") as file:
      for doc, embed in zip(batched, tweet_embedding):
        doc_copy = doc.copy()
        doc_copy["embedding"] = embed.tolist()
        file.write(json.dumps(doc_copy, ensure_ascii=False)+ "\n")

In [235]:
get_batch_embeddings(TRAIN_DF)

OUT/indobert_embeds.jsonl deleted.
OUT/indobertweet_embeds.jsonl deleted.


Generating embeddings: 100%|██████████| 477/477 [03:18<00:00,  2.40it/s]


### Create function to reduce embed size

In [236]:
import umap
import numpy as np
def reduce_embed_size(embeds):
  umap_model = umap.UMAP(n_components=50)
  reduced_embedding = umap_model.fit_transform(np.array(embeds))
  return reduced_embedding

### Reduce embedding size to 50

In [237]:
import json
import pandas as pd

with open(INDOBERT_OUT_FILE, "r", encoding="utf-8") as file:
  embedded_documents = []
  for line in file:
    doc = json.loads(line)
    embedded_documents.append(doc)

embeddings = [doc["embedding"] for doc in embedded_documents]
reduced_embeddings = reduce_embed_size(embeddings)

for doc, reduced in zip(embedded_documents, reduced_embeddings):
  doc["embedding"] = reduced.tolist()
removeFile(INDOBERT_REDUCED_OUT_FILE)
with open(INDOBERT_REDUCED_OUT_FILE, "a", encoding="utf-8") as file:
  for doc in embedded_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")

with open(INDOBERTWEET_OUT_FILE, "r", encoding="utf-8") as file:
  embedded_documents = []
  for line in file:
    doc = json.loads(line)
    embedded_documents.append(doc)

embeddings = [doc["embedding"] for doc in embedded_documents]
reduced_embeddings = reduce_embed_size(embeddings)

for doc, reduced in zip(embedded_documents, reduced_embeddings):
  doc["embedding"] = reduced.tolist()
removeFile(TWEET_REDUCED_OUT_FILE)
with open(TWEET_REDUCED_OUT_FILE, "a", encoding="utf-8") as file:
  for doc in embedded_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")



OUT/indobert_reduced_embeds.jsonl deleted.




OUT/indobertweet_reduced_embeds.jsonl deleted.


#### Load stopwords

In [238]:
import nltk
from nltk.corpus import stopwords
try:
  stopwords.words('english')
except LookupError:
  nltk.download('stopwords')

# Read the javanese and sundanese stopwords from the assets folder
with open(ASSET_DIR + "local_languages_stopwords.csv", "r", encoding="utf-8") as file:
  #Read headers indonesian, javanese, sundanese
  local_stopwords = pd.read_csv(file, header=None, names=["indonesian", "javanese", "sundanese"])

javanese_stopwords = set(local_stopwords["javanese"].dropna().tolist())
sundanese_stopwords = set(local_stopwords["sundanese"].dropna().tolist())

stopwords_combined = set(stopwords.words("indonesian")) | set(stopwords.words("english")) | javanese_stopwords | sundanese_stopwords

#### Initialize the function to generate structural features

In [239]:
import re
import numpy as np
import emoji

# ── pre‑compiled patterns ───────────────────────────────────────────────────────
URL_RE   = re.compile(r"http\S+")
EMOJI_RE = re.compile("|".join(re.escape(e) for e in emoji.EMOJI_DATA))

def extract_structural_features(tweet: str, stopwords_set: set) -> list[float]:
    """
    Return a list of lightweight structural features for a single tweet.

    Features (18 → 10 after pruning):
      0. length                – total characters
      1. num_hashtags          – count of '#'
      2. num_mentions          – count of '@'
      3. num_urls              – URLs matched by URL_RE
      4. num_emojis            – emojis matched by EMOJI_RE
      5. num_upper             – uppercase characters
      6. num_punct             – non‑alphanum / non‑whitespace
      7. avg_word_len          – mean token length (0 if no tokens)
      8. word_count            – number of whitespace‑split tokens
      9. stopword_ratio        – fraction of tokens that are stopwords
    """
    # fast counters
    length        = len(tweet)
    num_hashtags  = tweet.count("#")
    num_mentions  = tweet.count("@")
    num_urls      = len(URL_RE.findall(tweet))
    num_emojis    = len(EMOJI_RE.findall(tweet))
    num_upper     = sum(c.isupper() for c in tweet)
    num_punct     = sum(1 for c in tweet if not c.isalnum() and not c.isspace())

    # token‑level stats
    words         = tweet.split()
    word_lengths  = [len(w) for w in words]
    avg_word_len  = np.mean(word_lengths) if word_lengths else 0.0
    word_count    = len(words)
    stopword_ratio= (
        np.mean([w.lower() in stopwords_set for w in words]) if words else 0.0
    )

    return [
        length, num_hashtags, num_mentions, num_urls,
        num_emojis, num_upper, num_punct, avg_word_len,
        word_count, stopword_ratio,
    ]


### Generate structural features of each cleaned content

In [240]:
import json
import numpy as np
from sklearn.preprocessing import StandardScaler
concat_scaler = StandardScaler()
props_scaler = StandardScaler()
with open(INDOBERT_REDUCED_OUT_FILE, "r", encoding="utf-8") as file:
  indobert_reduced_documents = [json.loads(line) for line in file]

structural_properties = [extract_structural_features(doc["content"], stopwords_combined) for doc in indobert_reduced_documents]

for doc, props in zip(indobert_reduced_documents, structural_properties):
  doc["structural_property"] = props
  doc["concatenated_features"] = np.concatenate([np.array(doc["embedding"]), np.array(props)])

props_scaled = props_scaler.fit_transform(np.array(structural_properties))
concat_scaled = concat_scaler.fit_transform(
    np.array([doc["concatenated_features"] for doc in indobert_reduced_documents])
)
for doc, scaled_prop, scaled_concat in zip(indobert_reduced_documents, props_scaled, concat_scaled):
  doc["structural_property"] = scaled_prop.tolist()
  doc["concatenated_features"] = scaled_concat.tolist()
  
removeFile(INDOBERT_NORMALIZED)
with open(INDOBERT_NORMALIZED, "a", encoding="utf-8") as file:
  for doc in indobert_reduced_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")

OUT/indobert_normalized.jsonl deleted.


In [241]:
import json
import numpy as np
from sklearn.preprocessing import StandardScaler
concat_scaler = StandardScaler()
props_scaler = StandardScaler()
with open(TWEET_REDUCED_OUT_FILE, "r", encoding="utf-8") as file:
  indobert_reduced_documents = [json.loads(line) for line in file]

structural_properties = [extract_structural_features(doc["content"], stopwords_combined) for doc in indobert_reduced_documents]

for doc, props in zip(indobert_reduced_documents, structural_properties):
  doc["structural_property"] = props
  doc["concatenated_features"] = np.concatenate([np.array(doc["embedding"]), np.array(props) * 2])

props_scaled = props_scaler.fit_transform(np.array(structural_properties))
concat_scaled = concat_scaler.fit_transform(
    np.array([doc["concatenated_features"] for doc in indobert_reduced_documents])
)
for doc, scaled_prop, scaled_concat in zip(indobert_reduced_documents, props_scaled, concat_scaled):
  doc["structural_property"] = scaled_prop.tolist()
  doc["concatenated_features"] = scaled_concat.tolist()
  
removeFile(TWEET_NORMALIZED)
with open(TWEET_NORMALIZED, "a", encoding="utf-8") as file:
  for doc in indobert_reduced_documents:
    file.write(json.dumps(doc, ensure_ascii=False) + "\n")

OUT/indobertweet_normalized.jsonl deleted.


#### Initialize method to save clustered docs

In [242]:
def save_clustered_docs(docs, cluster_labels, output_path):
    for doc, label in zip(docs, cluster_labels):
        doc["bucket_label"] = int(label)
    for doc in docs:
        del doc["__v"]
        del doc["_id"]
        if "embedding" in doc:
            del doc["embedding"]
        if "structural_property" in doc:
            del doc["structural_property"]
        if "concatenated_features" in doc:
            del doc["concatenated_features"]
            
    # Sort by label for better organization (optional)
    docs_sorted = sorted(docs, key=lambda x: x["bucket_label"])
    
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(docs_sorted, f, ensure_ascii=False, indent=2)


#### Generate code to evaluate

In [243]:
from pathlib import Path
from typing import Union

import json
import numpy as np
import pandas as pd
from sklearn.metrics import davies_bouldin_score, silhouette_score


# ── helpers ───────────────────────────────────────────────────────────────────
def _load_merged(bucket_file: Union[str, Path],
                 features_file: Union[str, Path]) -> pd.DataFrame:
    labels = pd.read_json(Path(bucket_file))
    labels["content"] = labels["content"].astype(str)

    with open(features_file) as f:
        feats = pd.DataFrame(json.loads(line) for line in f)

    return labels.merge(feats, on="content", how="inner")


def _gini(vals: np.ndarray) -> float:
    v = np.sort(vals.astype(np.float64))
    if v.size == 0:
        return np.nan
    v += 1e-10
    n = v.size
    cum = np.cumsum(v)
    return (n + 1 - 2 * np.sum(cum) / cum[-1]) / n


def _safe_score(func, X, y):
    return np.nan if len(np.unique(y)) < 2 else func(X, y)


# ── core ──────────────────────────────────────────────────────────────────────
def evaluate_views(bucket_embed: Union[str, Path],
                   bucket_props: Union[str, Path],
                   bucket_concat: Union[str, Path],
                   features_file: Union[str, Path]) -> None:
    """
    For each clustering run (embeds / props / concat) print Gini, DBI, Silhouette
    with its *own* labels and matching feature vectors.
    """
    spec = [
        (bucket_embed,  "Embedding",  "embedding"),
        (bucket_props,  "Properties", "structural_property"),
        (bucket_concat, "Concat",     "concatenated_features"),
    ]

    rows = []
    for bucket, label, col in spec:
        df = _load_merged(bucket, features_file)
        y  = df["bucket_label"].to_numpy()
        X  = np.stack(df[col].to_numpy())

        counts = pd.Series(y).value_counts().sort_index()
        rows.append({
            "View":           label,
            "Gini":           f"{_gini(counts.values):.4f}",
            "Clusters":       len(counts),
            "Noise":          (y == -1).sum(),
            "DBI":            f"{_safe_score(davies_bouldin_score, X, y):.4f}",
            "Silhouette":     f"{_safe_score(silhouette_score,       X, y):.4f}",
        })

    print(pd.DataFrame(rows).to_string(index=False))

### Utilize KMeans and generate buckets on indobert
1. KMeans + IndoBERT Embeddings
2. KMeans + IndoBERT Embeddings + Structure Properties
3. KMeans + Structure Properties

In [244]:
from sklearn.cluster import KMeans
import json
import copy

with open(INDOBERT_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
kmeans_embed = KMeans(n_clusters=10, random_state=42).fit(embeds)

# 2. Embeddings + Props
kmeans_concat = KMeans(n_clusters=10, random_state=42).fit(concats)

# 3. Props only
kmeans_props = KMeans(n_clusters=10, random_state=42).fit(properties)


# Cluster label predictions
labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERT_KMEANS_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERT_KMEANS_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERT_KMEANS_PROPS)

print("Evaluating KMeans on Indobert Normalized")
evaluate_views(INDOBERT_KMEANS_EMBED, INDOBERT_KMEANS_PROPS, INDOBERT_KMEANS_CONCAT, INDOBERT_NORMALIZED)


Evaluating KMeans on Indobert Normalized
      View   Gini  Clusters  Noise    DBI Silhouette
 Embedding 0.3544        10      0 0.8870     0.3831
Properties 0.5000        10      0 0.7502     0.2640
    Concat 0.3724        10      0 1.3181     0.3238


### Utilize KMeans and generate buckets on indobertweet
1. KMeans + IndoBERTweet Embeddings
2. KMeans + IndoBERTweet Embeddings + Structure Properties


In [245]:
from sklearn.cluster import KMeans
import json
import copy

with open(TWEET_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
kmeans_embed = KMeans(n_clusters=10, random_state=42).fit(embeds)

# 2. Embeddings + Props
kmeans_concat = KMeans(n_clusters=10, random_state=42).fit(concats)

# 3. Props only
kmeans_props = KMeans(n_clusters=10, random_state=42).fit(properties)

labels_embed = kmeans_embed.labels_
labels_concat = kmeans_concat.labels_
labels_props = kmeans_props.labels_

save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERTWEET_KMEANS_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERTWEET_KMEANS_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERTWEET_KMEANS_PROPS)

print("Evaluating KMeans on Indobertweet Normalized")
evaluate_views(INDOBERTWEET_KMEANS_EMBED, INDOBERTWEET_KMEANS_PROPS, INDOBERTWEET_KMEANS_CONCAT, TWEET_NORMALIZED)

Evaluating KMeans on Indobertweet Normalized
      View   Gini  Clusters  Noise    DBI Silhouette
 Embedding 0.3616        10      0 0.8610     0.3755
Properties 0.5000        10      0 0.7502     0.2640
    Concat 0.3310        10      0 1.2840     0.3026


### Utilize HDBSCAN and generate buckets on IndoBERT
1. HDBSCAN + IndoBERT Embeddings
2. HDBSCAN + IndoBERT Embeddings + Structure Properties


In [260]:
import json
import copy
import hdbscan

with open(INDOBERT_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only
hdbscan_embed = hdbscan.HDBSCAN(min_cluster_size=8, metric="euclidean").fit(embeds)

# 3. Props only
hdbscan_props = hdbscan.HDBSCAN(min_cluster_size=30, metric="euclidean").fit(properties)

# 2. Embeddings + Props
hdbscan_concat = hdbscan.HDBSCAN(min_cluster_size=3, metric="euclidean").fit(concats)

# Cluster label predictions
labels_embed = hdbscan_embed.labels_
labels_concat = hdbscan_concat.labels_
labels_props = hdbscan_props.labels_
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERT_HDBSCAN_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERT_HDBSCAN_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERT_HDBSCAN_PROPS)

evaluate_views(INDOBERT_HDBSCAN_EMBED, INDOBERT_HDBSCAN_PROPS, INDOBERT_HDBSCAN_CONCAT, INDOBERT_NORMALIZED)



      View   Gini  Clusters  Noise    DBI Silhouette
 Embedding 0.8575         8     12 1.1851     0.0830
Properties 0.7518        10   6682 1.4964    -0.0508
    Concat 0.8953        11     16 1.5901     0.2624


In [275]:
import json
import copy
import hdbscan

with open(TWEET_NORMALIZED, "r", encoding="utf-8") as file:
  normalized_indobert_documents = [json.loads(line) for line in file]

embeds = [doc["embedding"] for doc in normalized_indobert_documents]
properties = [doc["structural_property"] for doc in normalized_indobert_documents]
concats = [doc["concatenated_features"] for doc in normalized_indobert_documents]

# 1. Embeddings only #before waas  18
hdbscan_embed = hdbscan.HDBSCAN(min_cluster_size=29, metric="euclidean").fit(embeds)

# 3. Props only
hdbscan_props = hdbscan.HDBSCAN(min_cluster_size=27, metric="euclidean").fit(properties)

# 2. Embeddings + Props #before was 17
hdbscan_concat = hdbscan.HDBSCAN(min_cluster_size=20, metric="euclidean").fit(concats)

# Cluster label predictions
labels_embed = hdbscan_embed.labels_
labels_concat = hdbscan_concat.labels_
labels_props = hdbscan_props.labels_
  
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_embed, INDOBERTWEET_HDBSCAN_EMBED)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_concat, INDOBERTWEET_HDBSCAN_CONCAT)
save_clustered_docs(copy.deepcopy(normalized_indobert_documents), labels_props, INDOBERTWEET_HDBSCAN_PROPS)

evaluate_views(INDOBERTWEET_HDBSCAN_EMBED, INDOBERTWEET_HDBSCAN_PROPS, INDOBERTWEET_HDBSCAN_CONCAT, TWEET_NORMALIZED)



      View   Gini  Clusters  Noise    DBI Silhouette
 Embedding 0.8329         7     53 0.7990     0.2109
Properties 0.7696        11   6265 1.5427    -0.0434
    Concat 0.8771        11    157 1.3879     0.2162


### Do a bit of bucket analization

In [276]:
# Utility functions
from collections import Counter
import re
def count_hashtags(text):
    return len(re.findall(r"#\w+", text))

def hashtag_ratio(text):
    hashtags = ''.join(re.findall(r"#\w+", text))
    return len(hashtags) / len(text) if len(text) > 0 else 0

def extract_emojis(text):
    emoji_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF"
                               "\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]+", flags=re.UNICODE)
    return emoji_pattern.findall(text)

def emoji_ratio(text):
    emojis = extract_emojis(text)
    return len(''.join(emojis)) / len(text) if len(text) > 0 else 0

def url_ratio(tweets):
    return sum(1 for t in tweets if re.search(r"http\S+", t)) / len(tweets)

def mention_ratio(tweets):
    return sum(1 for t in tweets if re.search(r"@\w+", t)) / len(tweets)

def lexical_diversity(text):
    words = text.split()
    return len(set(words)) / len(words) if words else 0

def repeated_char_abuse(text):
    return bool(re.search(r"(.)\1{3,}", text))

def duplicate_ratio(tweets):
    freq = Counter(tweets)
    return sum(count for tweet, count in freq.items() if count > 1) / len(tweets)


In [277]:
results = {}
from collections import defaultdict
import json
buckets = defaultdict(list)

with open(INDOBERT_KMEANS_EMBED, "r") as file:
  documents = json.load(file)

for doc in documents:
  buckets[doc["bucket_label"]].append(doc)

for label, bucket_tweets in buckets.items():
    bucket_tweets = [doc["content"] for doc in bucket_tweets]
    
    hashtag_counts = [count_hashtags(t) for t in bucket_tweets]
    tweet_lengths = [len(t) for t in bucket_tweets]
    hashtag_ratios = [hashtag_ratio(t) for t in bucket_tweets]
    emoji_ratios = [emoji_ratio(t) for t in bucket_tweets]
    lexical_divs = [lexical_diversity(t) for t in bucket_tweets]
    repeated_abuse_count = sum(1 for t in bucket_tweets if repeated_char_abuse(t))

    avg_hashtags = sum(hashtag_counts) / len(bucket_tweets)
    avg_length = sum(tweet_lengths) / len(bucket_tweets)
    avg_hashtag_ratio = sum(hashtag_ratios) / len(bucket_tweets)
    avg_emoji_ratio = sum(emoji_ratios) / len(bucket_tweets)
    avg_lexical_div = sum(lexical_divs) / len(bucket_tweets)
    url_ratio_val = url_ratio(bucket_tweets)
    mention_ratio_val = mention_ratio(bucket_tweets)
    dup_ratio = duplicate_ratio(bucket_tweets)

    # Heuristic label tagging
    label_tags = []
    if avg_length > 200:
        label_tags.append("long tweets")
    if avg_hashtag_ratio > 0.4:
        label_tags.append("hashtag-heavy")
    if avg_emoji_ratio > 0.2:
        label_tags.append("emoji spam")
    if url_ratio_val > 0.3:
        label_tags.append("link drop")
    if mention_ratio_val > 0.3:
        label_tags.append("mention spam")
    if avg_lexical_div < 0.4:
        label_tags.append("low diversity (copypasta)")
    if repeated_abuse_count / len(bucket_tweets) > 0.3:
        label_tags.append("repeated char abuse")
    if dup_ratio > 0.3:
        label_tags.append("high duplication")

    longest = max(bucket_tweets, key=len)
    shortest = min(bucket_tweets, key=len)

    results[label] = {
        "label": ", ".join(label_tags) if label_tags else "generic",
        "avg_hashtags": avg_hashtags,
        "avg_length": avg_length,
        "longest_tweet": longest,
        "shortest_tweet": shortest,
        "hashtag_ratio": avg_hashtag_ratio,
        "emoji_ratio": avg_emoji_ratio,
        "url_ratio": url_ratio_val,
        "mention_ratio": mention_ratio_val,
        "lexical_diversity": avg_lexical_div,
        "duplication_ratio": dup_ratio,
        "repeated_char_abuse_count": repeated_abuse_count,
    }

# Output the results
for label, metrics in results.items():
    print(f"\nCluster {label}: {metrics['label']}")
    for k, v in metrics.items():
        if k != 'label':
            print(f"  {k}: {v}")


Cluster 0: long tweets
  avg_hashtags: 1.0710659898477157
  avg_length: 230.65989847715736
  longest_tweet: buat yang suka demo atau protes, mending kalian serang yang ini. ada puluhan ribu triliun di situ. rebut, kaya jaman penjajahan belanda duludaftar perusahaan asing yang menguasai sda indonesia dan jumlah keuntungannyaanswer by grokberikut adalah daftar beberapa perusahaan asing yang mengelola sumber daya alam (sda) di indonesia, khususnya di sektor pertambangan dan migas, berdasarkan informasi yang tersedia. namun, data spesifik mengenai jumlah keuntungan sering kali tidak diungkap secara rinci dalam sumber publik karena bersifat rahasia perusahaan atau hanya dilaporkan secara agregat. saya akan mencantumkan perusahaan-perusahaan yang dikenal memiliki peran besar dalam pengelolaan sda indonesia beserta informasi keuntungan yang tersedia dari sumber terpercaya. jika data keuntungan tidak tersedia, saya akan menjelaskan kontribusi atau operasi mereka. pt freeport indonesia (freepo

In [278]:
import json
import random
from collections import defaultdict
# Parameters
INPUT_FILE = INDOBERT_KMEANS_EMBED
OUTPUT_FILE = 'out/labelstudio-training-sampled.json'

TOTAL_SAMPLE = 1100  # Change this as needed

# Load data
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    tweets = json.load(f)
    
bucket_content_count = defaultdict(int)
bucket_ratio_count = defaultdict(float)

total_tweet = 0
for tweet in tweets:
  label = tweet["bucket_label"]
  bucket_content_count[label] += 1
  total_tweet += 1

for label, bucket_tweet_count in bucket_content_count.items():
  bucket_ratio_count[label] = bucket_tweet_count / total_tweet

# Group tweets by bucket
buckets = defaultdict(list)
for tweet in tweets:
  bucket_label = str(tweet["bucket_label"])
  buckets[bucket_label].append(tweet)

# Sample tweets
sampled_tweets = []
for (bucket_label, tweets_in_bucket), ratio in zip(buckets.items(), bucket_ratio_count.values()):
  # ratiod_total = math.ceil(TOTAL_SAMPLE * ratio)
  ratiod_total = int(TOTAL_SAMPLE / bucket_content_count.__len__())
  print(f"Bucket {bucket_label} sampled with ratio {ratio} and total sample {ratiod_total}")
  if len(tweets_in_bucket) < ratiod_total:
    print(f"Warning: Bucket '{bucket_label}' has only {len(tweets_in_bucket)} tweets. Sampling all.")
    sampled = tweets_in_bucket
  else:
    sampled = random.sample(tweets_in_bucket, ratiod_total)
  sampled_tweets.extend(sampled)

# Save to output JSON
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
  json.dump(sampled_tweets, f, ensure_ascii=False, indent=2)

print(f"Sampled tweets saved to {OUTPUT_FILE}")


Bucket 0 sampled with ratio 0.18079318256309407 and total sample 110
Bucket 1 sampled with ratio 0.09183874139626352 and total sample 110
Bucket 2 sampled with ratio 0.15319567354965585 and total sample 110
Bucket 3 sampled with ratio 0.03716814159292035 and total sample 110
Bucket 4 sampled with ratio 0.20603080957063258 and total sample 110
Bucket 5 sampled with ratio 0.07046869878728286 and total sample 110
Bucket 6 sampled with ratio 0.07079646017699115 and total sample 110
Bucket 7 sampled with ratio 0.13313667649950836 and total sample 110
Bucket 8 sampled with ratio 0.05473615208128482 and total sample 110
Bucket 9 sampled with ratio 0.0018354637823664373 and total sample 110
Sampled tweets saved to out/labelstudio-training-sampled.json


### Check for data leakage

In [279]:
import json

def load_ids_from_json_or_jsonl(file_path, id_key="tweet_id"):
    ids = set()
    with open(file_path, "r", encoding="utf-8") as f:
        first_char = f.read(1)
        f.seek(0)
        if first_char == "[":  # JSON array
            data = json.load(f)
            ids = {entry[id_key] for entry in data if id_key in entry}
        else:  # JSONL
            for line in f:
                try:
                    obj = json.loads(line)
                    if id_key in obj:
                        ids.add(obj[id_key])
                except json.JSONDecodeError:
                    continue
    return ids

def check_data_leakage(file1, file2, id_key="tweet_id"):
    ids_1 = load_ids_from_json_or_jsonl(file1, id_key)
    ids_2 = load_ids_from_json_or_jsonl(file2, id_key)

    intersection = ids_1 & ids_2

    if intersection:
        print(f"⚠️ Data leakage detected! {len(intersection)} shared {id_key}s.")
    else:
        print("✅ No data leakage detected.")

    return intersection

# Example usage:
file_a = "out/labelstudio-training-sampled.json"
file_b = "out/golden-standard.json"
leaked_ids = check_data_leakage(file_a, file_b)


✅ No data leakage detected.


### Convert to a Label Studio Processable data

In [280]:
def convert_to_label_studio_format(raw_data):
    converted = []
    for entry in raw_data:
        new_entry = {
            "data": {
                "text": entry["content"],
                "bucket_label" : entry["bucket_label"] if entry.get("bucket_label") is not None else -10
            },
            "meta": {k: v for k, v in entry.items() if k != "content" and k != "bucket_label"}
        }
        converted.append(new_entry)
    return converted


In [281]:
import json
with open("out/labelstudio-training-sampled.json", "r") as file:
  training_documents = json.load(file)
parsed_training_documents = convert_to_label_studio_format(training_documents)
with open("out/labelstudio/p1/p1_training_prepped.json", "w", encoding="utf-8") as file:
  json.dump(parsed_training_documents, file, ensure_ascii=False, indent=2)
  