In [119]:
from datasets import load_dataset
# Load the "tianharjuno/twitter-parse" dataset (all of its splits) and keep the
# downloaded/prepared files under the local "cache/" directory.
ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

Generating train split: 100%|██████████| 201583/201583 [00:00<00:00, 2316483.83 examples/s]
Generating cleaned split: 100%|██████████| 201583/201583 [00:00<00:00, 3313543.72 examples/s]
Generating sampled_1000 split: 100%|██████████| 1000/1000 [00:00<00:00, 356113.43 examples/s]
Generating sampled_2000 split: 100%|██████████| 2000/2000 [00:00<00:00, 872631.64 examples/s]
Generating sampled_3000 split: 100%|██████████| 2988/2988 [00:00<00:00, 1410849.98 examples/s]
Generating sampled_4000 split: 100%|██████████| 3920/3920 [00:00<00:00, 1195062.63 examples/s]
Generating sampled_5000 split: 100%|██████████| 4840/4840 [00:00<00:00, 1399643.64 examples/s]
Generating sampled_6000 split: 100%|██████████| 5737/5737 [00:00<00:00, 1239579.75 examples/s]
Generating sampled_7000 split: 100%|██████████| 6617/6617 [00:00<00:00, 1653876.98 examples/s]
Generating sampled_8000 split: 100%|██████████| 7497/7497 [00:00<00:00, 1681264.88 examples/s]
Generating sampled_9000 split: 100%|██████████| 8365/836

In [None]:
# Select the "sampled_2000" split; this is the subset that is labeled further down.
ds_2000 = ds["sampled_2000"]

In [98]:
import ollama
from pydantic import BaseModel, Field, ValidationError
from typing import Literal
import json # We need this to handle potential string cleaning

# 1. JSON output structure that each LLM reply is validated against
class TweetLabel(BaseModel):
    """Schema for one labeling result parsed from the model's JSON reply.

    SYSTEM_PROMPT also asks the model for a `reasoning` key; this model
    declares no field for it, so only the two values below are kept.
    """
    # True when the tweet is judged to be about the RUU TNI topic.
    is_related_to_ruu_tni: bool
    # Confidence value reported by the model (the prompt's rules use 0.5 or 1.0).
    confidence: float

# System prompt sent with every tweet: the labeling rules (in priority order)
# followed by few-shot examples. Every example must cite the rule it actually
# illustrates and use that rule's confidence value; contradictory examples
# teach the model the wrong mapping.
SYSTEM_PROMPT = """
You are a high-speed, accurate data-labeling bot. Your ONLY task is to analyze a tweet and return a single, valid JSON object.

**CRITICAL RULES:**
1.  Your response MUST be ONLY the JSON object.
2.  DO NOT include any other text, explanations, apologies, or markdown formatting (like ```json).
3.  The JSON MUST have these three keys: `is_related_to_ruu_tni` (boolean), `confidence` (float), `reasoning` (string).
4.  The `confidence` value MUST be a float (e.g., 1.0, 0.5).

---

**LABELING LOGIC (Apply in this priority):**

1.  **Spam Filter:** If the tweet contains spam/commercial keywords ('giveaway', 'jualan', 'olshop', '#giveaway', '#jual'), it is **ALWAYS `false`**.
    * `Reasoning`: "Spam/engagement bait detected."
    * `Confidence`: 1.0

2.  **Explicit/Core Concept Mention:** If the text (not hashtags) contains 'ruu tni', 'rancangan undang-undang tni', 'revisi uu tni', 'dwifungsi abri', 'tni berpolitik', or 'militer masuk politik', it is **ALWAYS `true`**.
    * `Reasoning`: "Explicitly mentions RUU TNI or its core concepts (dwifungsi)."
    * `Confidence`: 1.0

3.  **Hashtag + Context:** If the tweet has a relevant hashtag, analyze the text.
    * **Relevant Hashtags:** '#tolakruutni', '#ruutni', '#dwifungsiabri', '#tolakdwifungsiabri', '#kembalikantnipromiliter', '#saveourdemocracy', '#tolakrevisiuutni', '#tolakuutni', '#tolakruupolri', '#indonesiagelap', '#tolakruukejaksaan'
    * **Case A (Context Match):** If the text is a political statement, an opinion (e.g., "ngeri banget", "setuju"), or a general expression of sentiment (e.g., "enggak ada hati nuraninya"), it is `true`. **Assume the text is related unless it's obviously about something else.**
        * `Reasoning`: "Relevant hashtag matches political context/opinion in text."
        * `Confidence`: 1.0
    * **Case B (Mismatch/Bait):** If the text is *clearly and objectively* unrelated (e.g., "cuaca hari ini...", "jual hp", "makan siang"), it is `false`.
        * `Reasoning`: "Context-Hashtag Mismatch. Text is unrelated to the hashtag."
        * `Confidence`: 1.0

4.  **Hashtag-Only:** If the tweet has no significant text:
    * **Case A (Political):** If the tweet contains **AT LEAST ONE** relevant hashtag (from Rule 3's list) AND it does **NOT** contain **ANY** spam hashtags (from Rule 1's list), it is `true`.
        * `Reasoning`: "Hashtag-only tweet with relevant political hashtags."
        * `Confidence`: 0.5
    * **Case B (Mixed/Spam):** If the tweet contains **ANY** spam hashtags (from Rule 1's list), it is `false`.
        * `Reasoning`: "Hashtag-only tweet mixed with spam/bait hashtags."
        * `Confidence`: 1.0

5.  **General/Unrelated:** If the tweet is *only* general praise ("Dirgahayu TNI", "TNI hebat") and does **NOT** contain any of the Rule 2 keywords, it is `false`.
    * `Reasoning`: "General/neutral mention of TNI, unrelated to the bill."
    * `Confidence`: 1.0
---

**EXAMPLES (User Tweet -> Your JSON Output):**

User: "gila, baca draf ruu tni serem banget. mau balik ke orde baru?"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 1.0, "reasoning": "Explicitly mentions RUU TNI in the text. (Rule 2)"}

User: "Ayo menangkan giveaway hp baru! Cek bio! #tolakruutni #giveaway"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Spam/engagement bait detected. (Rule 1)"}

User: "Cuaca hari ini panas banget ya. #tolakruutni"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Context-Hashtag Mismatch. Text is unrelated to the hashtag. (Rule 3B)"}

User: "#tolakruutni #dwifungsitni #savedemokrasi"
Assistant: {"is_related_to_ruu_tni": true, "confidence": 0.5, "reasoning": "Hashtag-only tweet with purely relevant political hashtags. (Rule 4A)"}

User: "Ngeri kalo dwifungsi abri dihidupkan lagi, militer jangan ikut politik."
Assistant: {"is_related_to_ruu_tni": true, "confidence": 1.0, "reasoning": "Explicitly mentions core concepts (dwifungsi abri) related to RUU TNI. (Rule 2)"}

User: "Dirgahayu TNI yang ke-70! Jaya selalu di darat, laut, dan udara."
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "General/neutral mention of TNI, unrelated to the bill. (Rule 5)"}

User: "kak tara, terima kasih banyak untuk raffle-nya!! aku mau join ya ^^ #cabutuutni #tolakrevisiuutni #tolakuutni #tolakruupolri #tolakruukejaksaan"
Assistant: {"is_related_to_ruu_tni": false, "confidence": 1.0, "reasoning": "Context-Hashtag Mismatch. Text is unrelated to the hashtag. (Rule 3B)"}
---
You will now receive the user's tweet. Respond ONLY with the JSON object.
"""



In [100]:
def _strip_code_fence(raw):
    """Return `raw` without a surrounding Markdown code fence (``` or ```json)."""
    cleaned = raw.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned[3:]
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    # Only drop the closing fence when it is really there.
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    return cleaned.strip()


def label_text(row):
    """Label one dataset row with the LLM and return the row.

    Sends `row["content"]` to the `llama3:8b` model through `ollama.chat`
    together with SYSTEM_PROMPT, validates the JSON reply against TweetLabel
    and stores the result in two columns.

    Args:
        row: mapping for one example; must contain the key "content".

    Returns:
        The same row with "related" and "confidence" set. When the call or
        the validation fails, the error is printed and both values are None.
        The row is returned on every path so that `Dataset.map` receives a
        dict with the same columns for every example instead of an implicit
        None for the failed ones.
    """
    text = row["content"]
    # Defaults for the failure paths; overwritten below on success.
    row["related"] = None
    row["confidence"] = None
    try:
        response = ollama.chat(
            model="llama3:8b",
            messages=[
                {
                    "role": "system",
                    "content": SYSTEM_PROMPT
                },
                {
                    "role": "user",
                    "content": text
                }
            ],
            format='json'  # ask ollama for JSON-formatted output
        )
        # The prompt forbids Markdown fences, but strip them defensively.
        content_string = _strip_code_fence(response['message']['content'])
        label = TweetLabel.model_validate_json(content_string)
    except ValidationError as e:
        print(f"VALIDATION ERROR: LLM returned malformed JSON.\n{e}")
        return row
    except Exception as e:
        # Best effort: one failing tweet must not abort the whole map run.
        print(f"Error processing tweet: {e}\n")
        return row

    row["related"] = label.is_related_to_ruu_tni
    row["confidence"] = label.confidence
    return row

In [None]:
# Apply label_text to every row (one LLM call per row) and rebind ds_2000 to the result.
# NOTE: because the same name is rebound, re-running this cell maps over the
# already-labeled dataset rather than the original split.
ds_2000 = ds_2000.map(label_text)

  StockPickler.save(self, obj, save_persistent_id)
  StockPickler.save(self, obj, save_persistent_id)
Map: 100%|██████████| 1000/1000 [39:37<00:00,  2.38s/ examples]


In [None]:
from datasets import load_dataset

# Publish the labeled sample as an additional split of the original dataset.
# Every other split is given the same two label columns (all null) and cast to
# the labeled split's features so that all splits share one schema when pushed.

# Name of the new split that holds the LLM labels.
LABELED_SPLIT = "sampled_2000_labeled"

# 1. Load the original dataset (all splits)
print("Loading dataset 'tianharjuno/twitter-parse'...")
new_ds = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

# 2. Fail early if the labeling cell has not produced the label columns yet;
#    otherwise the cast further down fails with a much less obvious error.
missing_columns = {"related", "confidence"} - set(ds_2000.column_names)
if missing_columns:
    raise ValueError(
        f"ds_2000 is missing label columns {sorted(missing_columns)}; run the labeling cell first."
    )

# 3. Add the labeled data (`ds_2000`) to the DatasetDict as a new split
new_ds[LABELED_SPLIT] = ds_2000
print(f"Added new split: '{LABELED_SPLIT}'")

# 4. The labeled split defines the target schema for every other split
target_features = new_ds[LABELED_SPLIT].features
print(f"Target features set: {target_features}")

# 5. All remaining splits need to be brought to that schema
other_splits = [split for split in new_ds.keys() if split != LABELED_SPLIT]

# 6. Helper that adds the two label columns with null values
def add_default_labels(example):
    """Return `example` with 'related' and 'confidence' set to None."""
    example['related'] = None
    example['confidence'] = None
    return example

# ---
# STEP 1: Add the columns to all other splits using .map()
# They are created with a 'null' data type at this point.
# ---
print("\n--- Step 1: Adding missing columns (as 'null') ---")
for split in other_splits:
    print(f"Mapping new columns to split: '{split}'...")
    new_ds[split] = new_ds[split].map(add_default_labels)

# ---
# STEP 2: Cast all splits to the target features
# Now that the columns exist, .cast() can change the 'null' types
# to the types used by the labeled split.
# ---
print("\n--- Step 2: Casting columns to correct data types ---")
for split in other_splits:
    print(f"Casting split '{split}' to match target features...")
    new_ds[split] = new_ds[split].cast(target_features)

# 7. All splits now share the same features; push everything to the hub.
#    The commit description is derived from the split name so it cannot drift
#    from what is actually uploaded.
print("\nAll splits synchronized. Pushing to hub...")
new_ds.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_description=f"Add split '{LABELED_SPLIT}' labeled using llama",
)

print("Successfully pushed to hub!")

Loading dataset 'tianharjuno/twitter-parse'...
Added new split: 'sampled_1000_labeled'
Target features set: {'tweet_id': Value(dtype='string', id=None), 'time': Value(dtype='string', id=None), 'author': Value(dtype='string', id=None), 'content': Value(dtype='string', id=None), 'comment_count': Value(dtype='int64', id=None), 'repost_count': Value(dtype='int64', id=None), 'like_count': Value(dtype='int64', id=None), 'view_count': Value(dtype='int64', id=None), 'related': Value(dtype='bool', id=None), 'confidence': Value(dtype='float64', id=None)}

--- Step 1: Adding missing columns (as 'null') ---
Mapping new columns to split: 'train'...
Mapping new columns to split: 'cleaned'...
Mapping new columns to split: 'sampled_1000'...
Mapping new columns to split: 'sampled_2000'...
Mapping new columns to split: 'sampled_3000'...
Mapping new columns to split: 'sampled_4000'...
Mapping new columns to split: 'sampled_5000'...
Mapping new columns to split: 'sampled_6000'...
Mapping new columns to sp

Casting the dataset: 100%|██████████| 201583/201583 [00:00<00:00, 2763190.54 examples/s]


Casting split 'cleaned' to match target features...


Casting the dataset: 100%|██████████| 201583/201583 [00:00<00:00, 3230639.61 examples/s]


Casting split 'sampled_1000' to match target features...


Casting the dataset: 100%|██████████| 1000/1000 [00:00<00:00, 410642.65 examples/s]


Casting split 'sampled_2000' to match target features...


Casting the dataset: 100%|██████████| 2000/2000 [00:00<00:00, 649977.37 examples/s]


Casting split 'sampled_3000' to match target features...


Casting the dataset: 100%|██████████| 2988/2988 [00:00<00:00, 680342.02 examples/s]


Casting split 'sampled_4000' to match target features...


Casting the dataset: 100%|██████████| 3920/3920 [00:00<00:00, 988853.77 examples/s]


Casting split 'sampled_5000' to match target features...


Casting the dataset: 100%|██████████| 4840/4840 [00:00<00:00, 1196324.55 examples/s]


Casting split 'sampled_6000' to match target features...


Casting the dataset: 100%|██████████| 5737/5737 [00:00<00:00, 924280.63 examples/s]


Casting split 'sampled_7000' to match target features...


Casting the dataset: 100%|██████████| 6617/6617 [00:00<00:00, 1530226.03 examples/s]


Casting split 'sampled_8000' to match target features...


Casting the dataset: 100%|██████████| 7497/7497 [00:00<00:00, 1246618.18 examples/s]


Casting split 'sampled_9000' to match target features...


Casting the dataset: 100%|██████████| 8365/8365 [00:00<00:00, 1231065.02 examples/s]


Casting split 'sampled_10000' to match target features...


Casting the dataset: 100%|██████████| 9155/9155 [00:00<00:00, 600648.42 examples/s]


Casting split 'sampled_11000' to match target features...


Casting the dataset: 100%|██████████| 9906/9906 [00:00<00:00, 1888323.20 examples/s]


Casting split 'sampled_12000' to match target features...


Casting the dataset: 100%|██████████| 10560/10560 [00:00<00:00, 1531530.09 examples/s]


Casting split 'sampled_13000' to match target features...


Casting the dataset: 100%|██████████| 11120/11120 [00:00<00:00, 1678264.92 examples/s]


Casting split 'sampled_14000' to match target features...


Casting the dataset: 100%|██████████| 11680/11680 [00:00<00:00, 1741476.33 examples/s]


Casting split 'sampled_15000' to match target features...


Casting the dataset: 100%|██████████| 12213/12213 [00:00<00:00, 1471814.58 examples/s]


Casting split 'sampled_16000' to match target features...


Casting the dataset: 100%|██████████| 12691/12691 [00:00<00:00, 1771554.97 examples/s]


Casting split 'sampled_17000' to match target features...


Casting the dataset: 100%|██████████| 13131/13131 [00:00<00:00, 1659647.61 examples/s]


Casting split 'sampled_18000' to match target features...


Casting the dataset: 100%|██████████| 13571/13571 [00:00<00:00, 1790584.78 examples/s]


Casting split 'sampled_19000' to match target features...


Casting the dataset: 100%|██████████| 14011/14011 [00:00<00:00, 1752912.55 examples/s]


Casting split 'sampled_20000' to match target features...


Casting the dataset: 100%|██████████| 14451/14451 [00:00<00:00, 1387507.72 examples/s]



All splits synchronized. Pushing to hub...


Creating parquet from Arrow format: 100%|██████████| 202/202 [00:00<00:00, 1395.61ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:04<00:00,  4.90s/it]
Creating parquet from Arrow format: 100%|██████████| 202/202 [00:00<00:00, 1380.40ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:03<00:00,  3.78s/it]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 246.85ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.46s/it]
Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00, 191.79ba/s]
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 

Successfully pushed to hub!


In [112]:
# Show each tweet collected in `relevant_text`, followed by its label and a
# separator line, as one three-line record per tweet.
for single_text in relevant_text:
    print(
        single_text,
        f"RELATED: {True}",
        "---------------------------------------------------------------------",
        sep="\n",
    )
    

berani maju? dukung ruu tni dan jadikan tni pelopor pertahananruu tni bakal jadi langkah maju buat militer kita, dukung penuh reformasi ini demi masa depan yang lebih aman dan sejahtera #cabutuutni #sinergitasuntukbangsa
RELATED: True
---------------------------------------------------------------------
reformasi politik suatu keharusan. kata "reformasi" menjadi khazanah dan diskursus masyarakat kita. tidak saja itu, tetapi balikan telah memotivasi kalangan birokrasi, tokoh kritis, maupun mahasiswa untuk menyelenggarakan dialog, musyawarah/ silaturahmi, bahkan demonstrasi.tampilkan lebih banyak
RELATED: True
---------------------------------------------------------------------
darurat demokrasi reformasi dihabisi #putusanmk #mahkamahkonstitusi #koalisiindonesiamaju #peringatandarurat #kawalputusanmk #kaesang #aniesbaswedan
RELATED: True
---------------------------------------------------------------------
dwifungsi abri dimasa orba mau dikembalikan.. tapi nyicil dulu.. main kecil2an du

In [109]:
# Show each tweet collected in `irrelevant_text`, followed by its label and a
# separator line, as one three-line record per tweet.
for single_text in irrelevant_text:
    print(
        single_text,
        f"RELATED: {False}",
        "---------------------------------------------------------------------",
        sep="\n",
    )
    

demonstrasi di kota al-faqih ben saleh, maroko untuk mendukung palestina & menolak normalisasi dengan zionist yahudi. #gazagenocide‌ #freepalestinefromzionists #freepalestine
RELATED: False
---------------------------------------------------------------------
yahya sinwar di jalanan maroko! sebuah video yang beredar menunjukkan simulasi karakter yahya sinwar di maroko selama demonstrasi solidaritas untuk palestina al fatihah untuk as syahid yahya sinwar
RELATED: False
---------------------------------------------------------------------
para aktivis norwegia menggelar demonstrasi di depan norges bank di oslo, menuntut penghentian investasi dengan lembaga-lembaga zionist yahudi karena genosida di gaza. #gazagenocide‌ #freepalestinefromzionists #freepalestine
RELATED: False
---------------------------------------------------------------------
kak tara, terima kasih banyak untuk raffle-nya!! aku mau join ya ^^ #cabutuutni #tolakrevisiuutni #tolakuutni #tolakruupolri #tolakruukejaksaan
REL