In [6]:
from datasets import load_dataset
# Load every split of the tweet dataset, keeping downloaded files under cache/.
ds = load_dataset(path="tianharjuno/twitter-parse", cache_dir="cache/")

Generating train split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating cleaned split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating sampled_1000 split:   0%|          | 0/999 [00:00<?, ? examples/s]

Generating sampled_2000 split:   0%|          | 0/2002 [00:00<?, ? examples/s]

Generating sampled_3000 split:   0%|          | 0/2999 [00:00<?, ? examples/s]

Generating sampled_4000 split:   0%|          | 0/4000 [00:00<?, ? examples/s]

Generating sampled_5000 split:   0%|          | 0/4999 [00:00<?, ? examples/s]

Generating sampled_6000 split:   0%|          | 0/6002 [00:00<?, ? examples/s]

Generating sampled_7000 split:   0%|          | 0/7001 [00:00<?, ? examples/s]

Generating sampled_8000 split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating sampled_9000 split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating sampled_10000 split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating sampled_11000 split:   0%|          | 0/11000 [00:00<?, ? examples/s]

Generating sampled_12000 split:   0%|          | 0/12001 [00:00<?, ? examples/s]

Generating sampled_13000 split:   0%|          | 0/12998 [00:00<?, ? examples/s]

Generating sampled_14000 split:   0%|          | 0/14001 [00:00<?, ? examples/s]

Generating sampled_15000 split:   0%|          | 0/15000 [00:00<?, ? examples/s]

Generating sampled_16000 split:   0%|          | 0/16002 [00:00<?, ? examples/s]

Generating sampled_17000 split:   0%|          | 0/16999 [00:00<?, ? examples/s]

Generating sampled_18000 split:   0%|          | 0/18000 [00:00<?, ? examples/s]

Generating sampled_19000 split:   0%|          | 0/19001 [00:00<?, ? examples/s]

Generating sampled_20000 split:   0%|          | 0/19999 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [7]:
# Select the `sampled_20000` split as the set of tweets to be labelled.
ds_test = ds["sampled_20000"]

In [8]:
import ollama
from pydantic import BaseModel, Field, ValidationError
from typing import Literal
import json # We need this to handle potential string cleaning

# JSON output structure expected from the labelling model.
class TweetLabel(BaseModel):
    """Validated label parsed from the model's JSON reply for one tweet.

    SYSTEM_PROMPT also asks the model for a `reasoning` key; that key is not
    declared here, so it is not available on the parsed object.
    NOTE(review): this relies on pydantic ignoring undeclared keys by
    default -- confirm against the installed pydantic configuration.
    """
    # Whether the tweet is judged to be about the RUU TNI bill.
    is_related_to_ruu_tni: bool
    # Model-reported confidence; the prompt's rules use 1.0, 0.9 and 0.5.
    confidence: float

# System prompt sent with every tweet. It instructs the model to answer with a
# single JSON object and defines a prioritised rule set (Layers 0-4) based on
# keywords and hashtags, followed by worked examples of the expected output.
# NOTE(review): rule 3 requests a `reasoning` key that TweetLabel does not
# declare, so only `is_related_to_ruu_tni` and `confidence` are consumed.
SYSTEM_PROMPT = """
You are a high-speed, accurate data-labeling bot. Your ONLY task is to analyze a tweet and return a single, valid JSON object.

**CRITICAL RULES:**
1.  Your response MUST be ONLY the JSON object.
2.  DO NOT include any other text, explanations, apologies, or markdown formatting (like ```json).
3.  The JSON MUST have these three keys: `is_related_to_ruu_tni` (boolean), `confidence` (float), `reasoning` (string).
4.  The `confidence` value MUST be a float (e.g., 1.0, 0.5).

---

**LABELING LOGIC (Apply in this strict priority order):**

**Layer 0: Inviolable Spam & Bait Filter**
If the tweet text contains ANY of the following keywords, it is **ALWAYS `false`**. This rule overrides ALL other rules.
*   **Keywords:** `giveaway`, `ga`, `raffle`, `jualan`, `olshop`, `jual`, `wts`, `jastip`, `promo`, `diskon`, `murah`, `cek bio`, `link di bio`, `klik bio`, `linkonbio`, `cek pinned`, `kak`, `join`, `ikut`, `ikutan`, `wish me luck`, `wml`, `bismillah win`
*   `Reasoning`: "Spam/Engagement bait detected."
*   `Confidence`: 1.0

**Layer 1: Explicit "Smoking Gun" Filter**
If the text (not just hashtags) contains ANY of the following core concepts or actors, it is **ALWAYS `true`**.
*   **Core Bill:** `ruu tni`, `revisi uu tni`, `revisi uu 34 2004`, `uu tni`, `ruu tentara nasional indonesia`
*   **Core Concepts:** `dwifungsi abri`, `dwifungsi tni`, `jabatan sipil`, `perluasan jabatan sipil`, `omsp`, `operasi militer selain perang`, `tni berpolitik`, `militer masuk politik`, `peradilan militer`, `impunitas`, `kemunduran reformasi`, `ancaman demokrasi`
*   **Key Actors:** `imparsial`, `kontras`, `komnas ham`, `koalisi sipil`, `koalisi masyarakat sipil`
*   `Reasoning`: "Explicitly mentions RUU TNI or its core controversial concepts."
*   `Confidence`: 1.0

**Layer 2: "Package" Context Filter**
If the text mentions a related "package" bill AND has a relevant political hashtag, it is `true`.
*   **Text Keywords:** `ruu polri`, `revisi uu polri`, `ruu kejaksaan`, `polisi superbody`
*   **AND**
*   **Hashtags:** `#tolakruutni`, `#ruutni`, `#dwifungsiabri`, `#tolakdwifungsiabri`, `#kembalikantnipromiliter`, `#saveourdemocracy`, `#tolakrevisiuutni`, `#tolakuutni`, `#tolakruupolri`, `#indonesiagelap`, `#tolakruukejaksaan`
*   `Reasoning`: "Discusses related 'package' bills (RUU Polri/Kejaksaan) within the RUU TNI protest context."
*   `Confidence`: 1.0

**Layer 3: General Irrelevance Filter**
If the text is a general/neutral mention of the TNI institution AND lacks any Layer 1 keywords, it is `false` (even if it has a relevant hashtag).
*   **Text Keywords:** `dirgahayu tni`, `hut tni`, `tni hebat`, `prajurit`, `amankan perbatasan`, `tni bantu rakyat`, `tni jaya selalu`
*   `Reasoning`: "General/neutral mention of TNI institution, unrelated to the legislative bill."
*   `Confidence`: 1.0

**Layer 4: Final Adjudication (If no other layer triggered)**
*   **Case A (Context Match):** Text is a simple opinion (`ngeri banget`, `setuju`, `tolak`, `parah`, `enggak ada hati nuraninya`, `gila`) AND has a relevant hashtag (from Layer 2 list).
    *   `is_related_to_ruu_tni`: true
    *   `Reasoning`: "Relevant hashtag matches political opinion/sentiment in text."
    *   `Confidence`: 0.9
*   **Case B (Hashtag-Only):** Tweet has no significant text (or only the hashtag) AND has at least one relevant political hashtag (from Layer 2 list).
    *   `is_related_to_ruu_tni`: true
    *   `Reasoning`: "Hashtag-only tweet with relevant political hashtags."
    *   `Confidence`: 0.5
*   **Case C (Total Mismatch):** Text is *clearly and objectively* unrelated (e.g., `cuaca hari ini...`, `makan siang`, `jual hp`) AND has a relevant political hashtag.
    *   `is_related_to_ruu_tni`: false
    *   `Reasoning`: "Context-Hashtag Mismatch. Text is unrelated to the hashtag."
    *   `Confidence`: 1.0

---

**EXAMPLES (User Tweet -> Your JSON Output):**

User: "gila, baca draf ruu tni serem banget. mau balik ke orde baru?"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 1.0, "reasoning": "Explicitly mentions RUU TNI or its core controversial concepts. (Layer 1)"}

User: "Ngeri kalo dwifungsi abri dihidupkan lagi, militer jangan ikut politik."
Assistant: {"is_related_to_ruu_tni": true, "confidence": 1.0, "reasoning": "Explicitly mentions RUU TNI or its core controversial concepts. (Layer 1)"}

User: "Ayo menangkan giveaway hp baru! Cek bio! #tolakruutni #giveaway"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Spam/Engagement bait detected. (Layer 0)"}

User: "kak tara, terima kasih banyak untuk raffle-nya!! aku mau join ya ^^ #tolakrevisiuutni"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Spam/Engagement bait detected. (Layer 0)"}

User: "Pemerintah ngebut bahas RUU Polri, mau jadi superbody? Ngeri. #tolakruutni"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 1.0, "reasoning": "Discusses related 'package' bills (RUU Polri/Kejaksaan) within the RUU TNI protest context. (Layer 2)"}

User: "Dirgahayu TNI yang ke-70! Jaya selalu di darat, laut, dan udara. #tolakruutni"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "General/neutral mention of TNI institution, unrelated to the legislative bill. (Layer 3)"}

User: "Cuaca hari ini panas banget ya. #tolakruutni"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Context-Hashtag Mismatch. Text is unrelated to the hashtag. (Layer 4C)"}

User: "Gila sih ini. #tolakruutni #dwifungsiabri #saveourdemocracy"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 0.9, "reasoning": "Relevant hashtag matches political opinion/sentiment in text. (Layer 4A)"}

User: "#tolakruutni #dwifungsitni"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 0.5, "reasoning": "Hashtag-only tweet with relevant political hashtags. (Layer 4B)"}
---
You will now receive the user's tweet. Respond ONLY with the JSON object.
"""

In [9]:
def label_text(row):
    """Label one dataset row with the LLM and return the updated row.

    Sends ``row["content"]`` to the ``llama3:8b`` model through ``ollama.chat``
    together with ``SYSTEM_PROMPT``, parses the JSON reply into a
    ``TweetLabel`` and stores the result on the row.

    Args:
        row: Mapping for a single example; must contain a ``"content"`` key
            holding the tweet text.

    Returns:
        The same ``row`` with two extra keys:
        ``"related"`` (bool) and ``"confidence"`` (float). When the model call
        fails or the reply cannot be validated, both keys are set to ``None``
        so that every row carries the same columns and failures can be
        filtered or retried later.

    Raises:
        KeyError: If ``row`` has no ``"content"`` key (not caught here).
    """
    text = row["content"]

    # Start from "unlabelled" so that a failure below still yields a complete
    # row instead of falling through and returning None.
    # NOTE(review): if every row of the first written batch fails, the column
    # type may be inferred as null -- pass explicit `features` to map() if
    # that becomes a problem.
    row["related"] = None
    row["confidence"] = None

    try:
        response = ollama.chat(
            model="llama3:8b",
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            format='json'  # ask ollama to constrain the reply to JSON
        )
        content_string = response['message']['content'].strip()

        # Defensive: drop a markdown code fence if the model added one anyway.
        # Handles an optional "json" tag and a missing closing fence, neither
        # of which a fixed [7:-3] slice copes with.
        if content_string.startswith("```"):
            content_string = content_string.strip("`")
            if content_string[:4].lower() == "json":
                content_string = content_string[4:]
            content_string = content_string.strip()

        label = TweetLabel.model_validate_json(content_string)
        row["related"] = label.is_related_to_ruu_tni
        row["confidence"] = label.confidence
    except ValidationError as e:
        print(f"VALIDATION ERROR: LLM returned malformed JSON.\n{e}")
    except Exception as e:
        # Best-effort labelling: report the problem and keep the row unlabelled
        # rather than aborting the whole map() run.
        print(f"Error processing tweet: {e}\n")

    return row

In [None]:
# Apply the LLM labeller to each row of the selected split (one model call per row).
ds_test = ds_test.map(function=label_text)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)


Map:   0%|          | 0/19999 [00:00<?, ? examples/s]

In [None]:
print("Loading dataset 'tianharjuno/twitter-parse'...")
# Reload the complete DatasetDict so the labelled rows can be attached as a new split.
new_ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

# Add the labelled rows (ds_test, produced by mapping label_text over
# `sampled_20000`) under their own split name.
new_ds["sampled_20000_labeled"] = ds_test

# Upload all splits; the description names the split that was actually added.
new_ds.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_description="Labeled sampled_20000 split using llama3:8b",
)

Loading dataset 'tianharjuno/twitter-parse'...


README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/255k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating cleaned split:   0%|          | 0/201583 [00:00<?, ? examples/s]

Generating sampled_1000 split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating sampled_2000 split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating sampled_3000 split:   0%|          | 0/2988 [00:00<?, ? examples/s]

Generating sampled_4000 split:   0%|          | 0/3920 [00:00<?, ? examples/s]

Generating sampled_5000 split:   0%|          | 0/4840 [00:00<?, ? examples/s]

Generating sampled_6000 split:   0%|          | 0/5737 [00:00<?, ? examples/s]

Generating sampled_7000 split:   0%|          | 0/6617 [00:00<?, ? examples/s]

Generating sampled_8000 split:   0%|          | 0/7497 [00:00<?, ? examples/s]

Generating sampled_9000 split:   0%|          | 0/8365 [00:00<?, ? examples/s]

Generating sampled_10000 split:   0%|          | 0/9155 [00:00<?, ? examples/s]

Generating sampled_11000 split:   0%|          | 0/9906 [00:00<?, ? examples/s]

Generating sampled_12000 split:   0%|          | 0/10560 [00:00<?, ? examples/s]

Generating sampled_13000 split:   0%|          | 0/11120 [00:00<?, ? examples/s]

Generating sampled_14000 split:   0%|          | 0/11680 [00:00<?, ? examples/s]

Generating sampled_15000 split:   0%|          | 0/12213 [00:00<?, ? examples/s]

Generating sampled_16000 split:   0%|          | 0/12691 [00:00<?, ? examples/s]

Generating sampled_17000 split:   0%|          | 0/13131 [00:00<?, ? examples/s]

Generating sampled_18000 split:   0%|          | 0/13571 [00:00<?, ? examples/s]

Generating sampled_19000 split:   0%|          | 0/14011 [00:00<?, ? examples/s]

Generating sampled_20000 split:   0%|          | 0/14451 [00:00<?, ? examples/s]

Generating sampled_1000_labeled split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating sampled_2000_labeled split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating sampled_20000_labeled split:   0%|          | 0/14450 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/13 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Upload 0 LFS files: 0it [00:00, ?it/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/c68b0faa1d8ea51b2df3397630449451e5fb3bf6', commit_message='Upload dataset', commit_description='Labeled 1000 set data using llama', oid='c68b0faa1d8ea51b2df3397630449451e5fb3bf6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)

In [None]:
# Print each tweet of the `sampled_2000_labeled` split followed by its
# predicted `related` flag and a separator line, for manual inspection.
separator = "====================================================================================================================="
for row in new_ds["sampled_2000_labeled"]:
    print(row["content"], row["related"], separator, sep="\n")

berani maju? dukung ruu tni dan jadikan tni pelopor pertahananruu tni bakal jadi langkah maju buat militer kita, dukung penuh reformasi ini demi masa depan yang lebih aman dan sejahtera #cabutuutni #sinergitasuntukbangsa
True
reformasi politik suatu keharusan. kata "reformasi" menjadi khazanah dan diskursus masyarakat kita. tidak saja itu, tetapi balikan telah memotivasi kalangan birokrasi, tokoh kritis, maupun mahasiswa untuk menyelenggarakan dialog, musyawarah/ silaturahmi, bahkan demonstrasi.tampilkan lebih banyak
True
darurat demokrasi reformasi dihabisi #putusanmk #mahkamahkonstitusi #koalisiindonesiamaju #peringatandarurat #kawalputusanmk #kaesang #aniesbaswedan
True
dwifungsi abri dimasa orba mau dikembalikan.. tapi nyicil dulu.. main kecil2an dulu biar enggak frontal aja.. lama-lama jadi tuh barang.. ayo mahasiswa dan rakyat kita kembalikan ke reformasi yang lurus..
True
#indonesiagelap #adilijokowi #indonesiadarurat #turunkanfufufafa feb. revolusi atau reformasi.[lagi] ''yang 