In [1]:
from datasets import load_dataset
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

data/test-00000-of-00001.parquet:   0%|          | 0.00/755k [00:00<?, ?B/s]

Generating source_stage_1 split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating source_stage_2 split:   0%|          | 0/247820 [00:00<?, ? examples/s]

Generating cleaned split:   0%|          | 0/195952 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/19999 [00:00<?, ? examples/s]

In [2]:
from datasets import ClassLabel
class_labels = ClassLabel(2, ["Irrelevant", "Relevant"])
train_ds = dataset["train"].select_columns(["content", "relevant"])
train_ds = train_ds.rename_column("relevant", "label")
train_ds = train_ds.cast_column("label", class_labels)

train_ds

Casting the dataset:   0%|          | 0/19999 [00:00<?, ? examples/s]

Dataset({
    features: ['content', 'label'],
    num_rows: 19999
})

In [4]:
import re
# We are NOT importing replace_word_elongation anymore
from indoNLP.preprocessing import emoji_to_words

def clean_tweet_for_nusabert(row):
    text = row['content']

    # 1. Lowercase
    text = text.lower()

    # 2. Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove mentions and RT
    # This regex is safe and does not affect 'ruu tni'
    text = re.sub(r'rt @\S+|@\S+', '', text)

    # 4. Remove hashtags (keep the word)
    text = re.sub(r'#(\S+)', r'\1', text)

    # 5. Convert emojis to words (Preserves sentiment)
    text = emoji_to_words(text)

    # 6. Normalize word elongation (CUSTOM, SAFER REGEX)
    # This replaces 3 or more repeated chars (e.g., 'bangeeet' -> 'banget')
    # It will NOT affect 'uu' or 'ruu', fixing your bug.
    text = re.sub(r'(\w)\1{2,}', r'\1', text)

    # 7. Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    row["content"] = text
    return row

In [6]:
sentence_train_ds = train_ds.map(clean_tweet_for_nusabert, num_proc=30)

Map (num_proc=30):   0%|          | 0/19999 [00:00<?, ? examples/s]

In [7]:
sentence_train_ds["content"]

['hasil dr uu tni yg baru..',
 'sudah saatnya kita dukung ruu tni agar prajurit bisa cepat bertindak dan negara aman',
 'mari kita lihat bersama, kuat mana? ambisi atau anjloknya rupiah ininegrikriminal indonesiagelap darkindonesia darkpik2',
 'gini : revisi uu tni, berdasarkan wacana dan draf yang sedang dibahas hingga saat ini, tidak secara eksplisit bertujuan menjadikan tni sebagai lembaga berpolitik. tni tetap diatur sebagai alat negara di bidang pertahanan, sesuai dengan pasal 30 uud 1945 dan uu no. 34 tahun 2004',
 '"menyerang! menyerang! al-qassam. setiap petugas dan setiap pengecut.” situasi demonstrasi besar-besaran hari ini di ibu kota yordania, amman untuk mendukung gaza dan mujahidin.',
 'ksad mantune lbp anggotanya melakukan tindakan anarkis dan pidana kok bangga. tni di masa akhir kekuasaan dibawa kembali ke era dwifungsi abri era soeharto yg serba militeristik dan merasa plg benar. mental preman',
 '“emang mana pemilu yg ga curang?” minimal nggak sebanyak kali ini dan sa

In [10]:
from sentence_transformers import SentenceTransformer
sentence_transformer = SentenceTransformer("LazarusNLP/all-nusabert-large-v4",
                                           cache_folder="cache/",
                                           device="cuda",
                                           )
embeddings = sentence_transformer.encode(sentence_train_ds["content"], show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True, device="cuda", batch_size=256)



model.safetensors:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/297 [00:00<?, ?B/s]

Batches:   0%|          | 0/79 [00:00<?, ?it/s]

In [12]:
from dream_pipeline import DreamCluster
clusterer = DreamCluster("stability")
clusterer.fit(embeddings)

Finding intrinsic dimension with TwoNN
Found intrinsic dimension: 21
Running first stage reduction
Running second stage reduction
Tuning HDBSCAN with mode stability
  -> Using 'stability' mode (Bayesian Optimization on relative_validity)
Transforming full dataset with trained reducers...
Fitting final clusterer on full reduced dataset...


(np.int64(174), np.int64(500))

In [14]:
train_labels, train_probabilities, reduced_embeddings = clusterer.predict(embeddings)

In [15]:
import numpy as np
unique_train_labels, unique_train_labels_counts = np.unique(train_labels, return_counts=True)
for label, count in zip(unique_train_labels, unique_train_labels_counts):
    print(label, count)

-1 834
0 852
1 3901
2 6895
3 540
4 3405
5 1153
6 1265
7 1154


In [16]:
import numpy as np

# Assuming 'train_labels' is your full array of labels from the clusterer
unique_train_labels, unique_train_labels_counts = np.unique(train_labels, return_counts=True)

for cluster_id, count in zip(unique_train_labels, unique_train_labels_counts):

    current_cluster_indices = np.where(train_labels == cluster_id)[0]
    cluster_ds = train_ds.select(current_cluster_indices)

    print(f"CLUSTER {cluster_id}, COUNT {len(cluster_ds)}")

    # 1. Get masks/datasets FOR THE CURRENT CLUSTER
    relevant_mask = np.where(np.array(cluster_ds["label"]) == 0)[0]
    relevant_ds = cluster_ds.select(relevant_mask)

    irrelevant_mask = np.where(np.array(cluster_ds["label"]) == 1)[0]
    irrelevant_ds = cluster_ds.select(irrelevant_mask)

    # 2. Print the length of the LOCAL datasets
    print(f"\tRELEVANT IN CLUSTER {cluster_id}, COUNT {len(relevant_ds)}")

    for i, row in enumerate(relevant_ds):
        if i >= 10:
            break
        print(f"\t\tLABEL: {row['label']} CONTENT: {row['content']}")
    # -----------------------

    print(f"\tIRRELEVANT IN CLUSTER {cluster_id}, COUNT {len(irrelevant_ds)}")

    for i, row in enumerate(irrelevant_ds):
        if i >= 10:
            break
        print(f"\t\tLABEL: {row['label']} CONTENT: {row['content']}")

    print("="*50) # Added for readability

CLUSTER -1, COUNT 834
	RELEVANT IN CLUSTER -1, COUNT 422
		LABEL: 0 CONTENT: cuba pi buat demonstrasi kat jalan tar tu
		LABEL: 0 CONTENT: Ada kawan kongsi video ni. Ketika itu hadir di Dataran Merdeka menyokong demonstrasi mahasiswa untuk pendidikan percuma. Mungkin perlu lagi demonstrasi Pendidikan Percuma, Pendidikan Untuk Semua.
		LABEL: 0 CONTENT: Apanya yg aneh? Kau kira VAR d intimidasi ma #peringatandarurat ???
		LABEL: 0 CONTENT: Hasil Pengeluaran POIPET 12 Hari Ini Sabtu, 29-3-2025 Result : 3181,0575,1283 Selamat Kepada Pemenang #siapwd #CabutUUTNI #INDONESIAGELAP #TolakRUUPolri
		LABEL: 0 CONTENT: Kemeriahan suasana aktiviti Demonstrasi Mengarang Destar Versi Pendidikan di Pentas Utama Reruai Kementerian Perpaduan Negara yang telah diadakan sempena Program 2 Tahun Kerajaan MADANI dan Konvensyen Nasional Reformasi Perkhidmatan Awam Tahun 2024.
		LABEL: 0 CONTENT: *mau retweet *ada logo AD
		LABEL: 0 CONTENT: Dah ditutup yaa ges, terima kasih semuanya yg sudah ikut berpartisip

In [61]:
import pandas as pd
import numpy as np
import json
# from datasets import Dataset # Uncomment if your test_ds is a Hugging Face Dataset

def prepare_data_for_suite(dataset, cluster_labels, cluster_probabilities, output_filename="data_for_suite.json"):
    """
    Combines the dataset, cluster labels, and probabilities into a single
    JSON file and prints its content to the console.

    Args:
        dataset: Your test_ds (either a Pandas DataFrame or Hugging Face Dataset).
        cluster_labels: The NumPy array of cluster assignments.
        cluster_probabilities: The NumPy array of cluster probabilities.
        output_filename: The name of the JSON file to create.
    """
    print("Starting data preparation...")

    # --- Convert to Pandas DataFrame ---
    try:
        df = dataset.to_pandas()
        print(f"Successfully converted dataset to Pandas. Shape: {df.shape}")
    except Exception as e:
        print(f"Error converting dataset to Pandas: {e}")
        print("Please ensure 'dataset' is a Pandas DataFrame or Hugging Face Dataset.")
        return

    # --- Add cluster and probability data ---
    if len(df) != len(cluster_labels) or len(df) != len(cluster_probabilities):
        print("Error: Dataset length does not match cluster labels or probabilities length.")
        print(f"Dataset: {len(df)}, Clusters: {len(cluster_labels)}, Probs: {len(cluster_probabilities)}")
        return

    df['cluster'] = cluster_labels
    df['probability'] = cluster_probabilities

    # --- Add a unique ID and status columns for the app ---
    df['id'] = df.index
    df['original_label'] = df['label'] # Preserve the Llama-generated label
    df['status'] = 'uncorrected' # 'uncorrected' or 'corrected'

    # --- Reorder columns for clarity in the app ---
    try:
        cols = ['id', 'cluster', 'probability', 'content', 'label', 'original_label', 'status']
        other_cols = [col for col in df.columns if col not in cols]
        final_cols = cols + other_cols
        df = df[final_cols]
    except KeyError as e:
        print(f"Error: The dataset must contain 'content' and 'label' columns. Found: {df.columns}")
        return

    # --- Save to JSON on the kernel's filesystem ---
    json_data = []
    try:
        # We must use .to_dict('records') to ensure JSON compatibility
        json_data = df.to_dict('records')

        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(json_data, f, indent=2)

        print(f"\nSuccess! Data prepared and saved to kernel as '{output_filename}'.")
        print(f"Total records: {len(df)}")
    except Exception as e:
        print(f"Error saving JSON file: {e}")
        return

    # --- NEW: Print the raw JSON data to the console for copy-pasting ---
    print("\n" + "="*80)
    print("COPY THE JSON DATA BLOCK BELOW AND SAVE IT AS 'data_for_suite.json' ON YOUR LOCAL MACHINE")
    print("="*80 + "\n")

    # Print the JSON data directly
    print(json.dumps(json_data, indent=2))

    print("\n" + "="*80)
    print("END OF JSON DATA BLOCK")
    print("="*80)
    print("\nInstructions:")
    print("1. Copy all the text between the '====' markers (including the opening '[' and closing ']').")
    print("2. Open a text editor (like Notepad, VSCode, etc.) on your LOCAL machine.")
    print("3. Paste the text and save the file as 'data_for_suite.json'.")
    print("4. You can now load this local file into the 'label_suite.html' application.")

In [64]:
prepare_data_for_suite(train_ds, train_labels, train_probabilities, output_filename="data_for_suite.json", )

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

