train the pipeline, with a larger amount

mix in the training data with the labeled training data

then we can retrieve more samples of a certain type

in this case, we only care about relevant samples

In [1]:
from datasets import load_dataset, ClassLabel
class_labels = ClassLabel(3, ["Negative", "Neutral", "Positive"])
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

In [2]:
import re, unicodedata, jaconv, emoji

_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_WS       = re.compile(r'\s+')
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

# --- (MODIFIED) ---
# Catches "word" + "dari" + "domain.com" -> replaces with "word"
# Changed \w+ to \S+ to include punctuation like '!'
_DARI_URL_ATTACHED = re.compile(r'(\S+)dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Catches " dari " + "domain.com" -> replaces with empty string
_DARI_URL_SPACED = re.compile(r'\s+dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# --- (NEW) ---
# Catches any word ending in "dari" (e.g., "anarko!dari", "negaradari")
_DARI_STUCK = re.compile(r'(\S+)dari\b', re.I)

def cleantext(row: str):
    text = row["content"] #type: ignore
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")
    text = text.replace('\\n', ' ').replace('\\r', ' ')

    # Handle standard URLs first
    text = _URL.sub(' <url> ', text)
    text = text.replace('ini tidak tersedia', ' ')

    text = _MENTION.sub('@USER', text)
    text = re.sub(r'^rt\s+', '', text, flags=re.I)
    text = re.sub(r'(\b\d{4})(?=[a-zA-Z])', r'\1 ', text)
    text = _KUTI_CUT.sub('', text)

    # text = _DARI_URL_ATTACHED.sub(r'\1', text)
    # text = _DARI_URL_SPACED.sub('', text)
    # text = _DARI_STUCK.sub(r'\1', text)

    text = _WS.sub(' ', text).strip()
    row["content"] = text #type: ignore
    return row

In [3]:
from datasets import Dataset, concatenate_datasets
source_ds = dataset["source_labeled"]
source_ds = source_ds.map(cleantext, num_proc=16)

relevant_ds = source_ds.filter(lambda ex: ex["relevant"] == True)
relevant_df = relevant_ds.to_pandas()
relevant_df = relevant_df.drop_duplicates(subset="content", keep="first").reset_index(drop=True)
relevant_ds = Dataset.from_pandas(relevant_df)

train_ds = dataset["train_sentiment"]
test_ds = dataset["test_sentiment"]

train_ids = set(train_ds["content"])
test_ids = set(test_ds["content"])

filtered = relevant_ds.filter(
    lambda ex: ex["content"] not in train_ids and ex["content"] not in test_ids
)

dataset_list = []

dataset_list.append(train_ds)
unlabeled_ds = filtered.train_test_split(train_size=50000, seed=42)["train"]
def reset_sentiment(row):
    row["sentiment"] = -1
    return row
unlabeled_ds = unlabeled_ds.map(reset_sentiment)
dataset_list.append(unlabeled_ds)

concat_ds = concatenate_datasets(dataset_list)

Filter:   0%|          | 0/113876 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [4]:
import re
# We are NOT importing replace_word_elongation anymore
from indoNLP.preprocessing import emoji_to_words

def clean_tweet_for_nusabert(row):
    text = row['content']

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove mentions and RT
    # This regex is safe and does not affect 'ruu tni'
    text = re.sub(r'rt @\S+|@\S+', '', text)

    # 4. Remove hashtags (keep the word)
    text = re.sub(r'#(\S+)', r'\1', text)

    # 5. Convert emojis to words (Preserves sentiment)
    text = emoji_to_words(text)

    # 6. Normalize word elongation (CUSTOM, SAFER REGEX)
    # This replaces 3 or more repeated chars (e.g., 'bangeeet' -> 'banget')
    # It will NOT affect 'uu' or 'ruu', fixing your bug.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # 7. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    row["content"] = text
    return row

In [5]:
concat_ds = concat_ds.map(clean_tweet_for_nusabert, num_proc=16)

In [6]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer("LazarusNLP/all-nusabert-large-v4",
                                           cache_folder="/data/cache/",
                                           device="cuda",
                                           )
embeddings = sentence_transformer.encode(concat_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, device="cuda", batch_size=128)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [8]:
from dream_pipeline import DreamCluster
clusterer = DreamCluster("stability")
clusterer.fit(embeddings)

Finding intrinsic dimension with TwoNN
Found intrinsic dimension: 20
Running first stage reduction
Running second stage reduction
Tuning HDBSCAN with mode stability
  -> Using 'stability' mode (Bayesian Optimization on relative_validity)
Transforming full dataset with trained reducers...
Fitting final clusterer on full reduced dataset...


(np.int64(146), np.int64(32))

In [9]:
cluster_labels, cluster_probabilities, reduced_embeddings = clusterer.predict(embeddings=embeddings)

In [18]:
import numpy as np
unique_labels, unique_label_counts = np.unique(cluster_labels, return_counts=True)
for label, count in zip(unique_labels, unique_label_counts):
    indices = np.where(cluster_labels == label)[0]
    cluster_ds = concat_ds.select(indices)
    cluster_classes, cluster_class_counts = np.unique(cluster_ds["sentiment"], return_counts=True)
    print(f"CLUSTER {label}: {dict(zip(cluster_classes, cluster_class_counts))}")

CLUSTER -1: {np.int64(-1): np.int64(7318), np.int64(0): np.int64(3771), np.int64(1): np.int64(391), np.int64(2): np.int64(184)}
CLUSTER 0: {np.int64(-1): np.int64(260), np.int64(1): np.int64(17), np.int64(2): np.int64(165)}
CLUSTER 1: {np.int64(-1): np.int64(385), np.int64(0): np.int64(2), np.int64(1): np.int64(12), np.int64(2): np.int64(212)}
CLUSTER 2: {np.int64(-1): np.int64(451), np.int64(0): np.int64(244), np.int64(1): np.int64(6), np.int64(2): np.int64(1)}
CLUSTER 3: {np.int64(-1): np.int64(195), np.int64(1): np.int64(2), np.int64(2): np.int64(108)}
CLUSTER 4: {np.int64(-1): np.int64(142), np.int64(0): np.int64(90), np.int64(1): np.int64(4)}
CLUSTER 5: {np.int64(-1): np.int64(1677), np.int64(0): np.int64(777), np.int64(1): np.int64(138), np.int64(2): np.int64(84)}
CLUSTER 6: {np.int64(-1): np.int64(370), np.int64(0): np.int64(234), np.int64(1): np.int64(9), np.int64(2): np.int64(3)}
CLUSTER 7: {np.int64(-1): np.int64(89), np.int64(0): np.int64(28), np.int64(1): np.int64(27), np.i

In [23]:
import json
from datetime import datetime
from collections import Counter, defaultdict


def build_labeling_json(concat_ds, cluster_labels, cluster_probabilities, output_path):
    """
    Build labeling JSON for HTML labeling suite.

    Parameters:
    -----------
    concat_ds : HuggingFace Dataset
        Must contain columns: tweet_id, time, author, content, sentiment
    cluster_labels : Sequence[int]
        Cluster assignment per sample
    cluster_probabilities : Sequence[float]
        Cluster probability per sample
    output_path : str
        Where to save the output JSON file
    """

    # Convert to Python lists for safety
    cluster_labels = list(cluster_labels)
    cluster_probabilities = list(cluster_probabilities)

    # Basic checks
    assert len(concat_ds) == len(cluster_labels)
    assert len(concat_ds) == len(cluster_probabilities)

    # Organize samples by cluster
    clusters = defaultdict(list)
    for idx, row in enumerate(concat_ds):
        cid = int(cluster_labels[idx])
        prob = float(cluster_probabilities[idx])

        original_label = int(row["sentiment"])
        is_unlabeled = original_label == -1

        sample = {
            "tweet_id": row["tweet_id"],
            "content": row["content"],
            "author": row.get("author", None),
            "time": row.get("time", None),

            "cluster_probability": prob,
            "original_label": original_label,
            "current_label": original_label,
            "is_unlabeled": is_unlabeled,
            "needs_check": False  # Will fill below
        }

        clusters[cid].append(sample)

    # Build cluster stats + mark minority labels
    cluster_blocks = []
    for cid, samples in clusters.items():
        # Collect stats
        label_counts = Counter(s["original_label"] for s in samples if s["original_label"] != -1)
        size = len(samples)

        # Determine "dominant" class for minority detection
        if len(label_counts) > 0:
            dominant_label, dominant_count = label_counts.most_common(1)[0]
        else:
            dominant_label, dominant_count = None, 0  # All unlabeled cluster

        # Mark minority samples
        for s in samples:
            orig = s["original_label"]
            if orig != -1 and orig != dominant_label:
                s["needs_check"] = True

        cluster_blocks.append({
            "cluster_id": cid,
            "stats": {
                "size": size,
                "labels": {str(k): v for k, v in label_counts.items()}
            },
            "samples": samples
        })

    # Compose final JSON
    output_json = {
        "metadata": {
            "version": 1,
            "generated_at": datetime.utcnow().isoformat(),
            "num_clusters": len(cluster_blocks),
            "num_samples": len(concat_ds),

            "label_schema": {
                "-1": "Unlabeled",
                "0": "Negative",
                "1": "Neutral",
                "2": "Positive"
            }
        },
        "clusters": cluster_blocks
    }

    # Write to file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_json, f, indent=2, ensure_ascii=False)

    print(f"[OK] Labeling JSON exported to: {output_path}")


In [24]:

build_labeling_json(
    concat_ds=concat_ds,
    cluster_labels=cluster_labels,
    cluster_probabilities=cluster_probabilities,
    output_path="labeling_data.json"
)

[OK] Labeling JSON exported to: labeling_data.json
