In [1]:
from datasets import load_dataset
# Load the dataset (no split is requested, so the result is indexed by split
# name in later cells); files are cached under the relative "cache/" directory.
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 201583/201583 [00:00<00:00, 1336457.34 examples/s]
Generating cleaned split: 100%|██████████| 201583/201583 [00:00<00:00, 2207872.57 examples/s]
Generating sampled_1000 split: 100%|██████████| 999/999 [00:00<00:00, 148369.74 examples/s]
Generating sampled_2000 split: 100%|██████████| 2002/2002 [00:00<00:00, 495251.94 examples/s]
Generating sampled_3000 split: 100%|██████████| 2999/2999 [00:00<00:00, 615066.14 examples/s]
Generating sampled_4000 split: 100%|██████████| 4000/4000 [00:00<00:00, 1024312.60 examples/s]
Generating sampled_5000 split: 100%|██████████| 4999/4999 [00:00<00:00, 1225227.94 examples/s]
Generating sampled_6000 split: 100%|██████████| 6002/6002 [00:00<00:00, 801547.83 examples/s]
Generating sampled_7000 split: 100%|██████████| 7001/7001 [00:00<00:00, 1182186.17 examples/s]
Generating sampled_8000 split: 100%|██████████| 8000/8000 [00:00<00:00, 1060473.18 examples/s]
Generating

In [4]:
# Partition the "cleaned_labeled" split on its `related` column.
# Rows whose value equals neither True nor False end up in neither subset.
cleaned_dataset = dataset["cleaned_labeled"]
relevant_dataset = cleaned_dataset.filter(
    lambda example: example["related"] == True
)
irrelevant_dataset = cleaned_dataset.filter(
    lambda example: example["related"] == False
)

In [5]:
# `content` column of the rows flagged as related; this is the text that the
# hashtag / token analysis cells below operate on.
relevant_text = relevant_dataset["content"]

In [5]:
import re
from collections import Counter

def get_top_n_hashtags(list_of_texts, n=10):
    """
    Rank the hashtags occurring in a collection of texts by frequency.

    Matching is case-insensitive: each text is lower-cased before hashtags
    are extracted, so "#Tag" and "#tag" are counted together. Items that
    are not strings are skipped.

    Args:
        list_of_texts (list): Strings to scan for hashtags.
        n (int): How many of the most frequent hashtags to return.

    Returns:
        list: (hashtag, count) tuples ordered from most to least frequent.
    """
    # A hashtag is '#' immediately followed by one or more word characters
    # (letters, digits or underscore).
    hashtag_re = re.compile(r"#\w+")

    tally = Counter()
    for entry in list_of_texts:
        if not isinstance(entry, str):
            continue
        tally.update(hashtag_re.findall(entry.lower()))

    return tally.most_common(n)

# Number of most frequent hashtags to keep.
top_n = 100

# Rank the hashtags found in the relevant texts (result: list of
# (hashtag, count) tuples, most frequent first).
top_hashtags = get_top_n_hashtags(relevant_text, n=top_n)

In [7]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
import requests

def _fetch_stopwords(url, timeout=30):
    """Download a whitespace-separated stopword list and return it as a set.

    Args:
        url (str): Location of a plain-text stopword file.
        timeout (float): Seconds to wait for the server before giving up,
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        set: The whitespace-separated entries of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check an error page would be silently split into
            bogus "stopwords".
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return set(response.text.split())


# Sastrawi
factory = StopWordRemoverFactory()
stopwords_id = set(factory.get_stop_words())

# English
stopwords_en = set(stopwords.words('english'))

# IndoNLU
indonlu_url = "https://raw.githubusercontent.com/IndoNLP/indonlu/master/utils/stopwordbahasa.txt"
stopwords_indonlu = _fetch_stopwords(indonlu_url)

# IDN stopwords
idn_url = "https://raw.githubusercontent.com/Alir3z4/stop-words/master/indonesian.txt"
stopwords_idn = _fetch_stopwords(idn_url)

# Colloquial extensions (dialect + slang).
# Entries that were listed twice in the literal have been removed; the
# resulting set is unchanged.
extra_slang = {
    "ga","gak","nggak","ngga","aja","nih","dong","deh","lah","loh","kok",
    "kan","nya","ya","emang","tau","yg","pd","trs","pls","plis","thx","makasih",
    "makasi","terimakasih","bgt","bngt","bener","btw","rt","dm","gw","gue","lu",
    "loe","lo","gua","ny","jd","jadi","sih","kayak","kek","oke",
    "ok","bro","sis","min","kalo","kalau","dgn","dengan","bikin"
}

# Combine all sources and normalise to lowercase
combined_stopwords = {
    word.lower() for word in (
        stopwords_id |
        stopwords_en |
        stopwords_indonlu |
        stopwords_idn |
        extra_slang
    )
}

print(len(combined_stopwords), "total stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/christianharjuno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


621 total stopwords


In [8]:
import requests
import wordsegment
from keybert import KeyBERT
from langdetect import detect
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import os # Added for ProcessPoolExecutor

# --- 1. Set Global Constants ---
# Threshold used by process_line: a token is kept only if the average length
# (in characters) of the segments wordsegment produces for it is at least
# this value.
MIN_AVG_WORD_LEN = 2.5

# --- 2. Define Worker Function (MUST be at top level) ---
# This function will be sent to each CPU core.
def process_line(text_line):
    """
    Segment every whitespace-separated token of one text line and keep the
    tokens that pass the length and language filters.

    Note that every token produced by ``text_line.split()`` is examined,
    not only tokens that start with '#'.

    A token is kept when all of the following hold:
      * ``wordsegment.segment`` returns at least one segment for it;
      * the mean segment length is >= MIN_AVG_WORD_LEN;
      * ``detect`` labels the space-joined segments as 'id' or 'en'.
        If detection itself raises, the token is kept (best effort).

    Args:
        text_line: One line of text. Non-string values yield empty results.

    Returns:
        tuple: ``(doc_parts, candidates)``. ``doc_parts`` is a list with the
        space-joined segmentation of each kept token, in input order;
        ``candidates`` is the set of the kept original tokens.
    """
    local_doc_parts = []
    local_candidates = set()

    # Ensure input is a string
    if not isinstance(text_line, str):
        return local_doc_parts, local_candidates

    for token in text_line.split():
        # 1. Segment the token
        segmented_words = wordsegment.segment(token)
        if not segmented_words:
            continue

        # 2. Drop tokens whose segments are, on average, too short
        avg_len = sum(len(w) for w in segmented_words) / len(segmented_words)
        if avg_len < MIN_AVG_WORD_LEN:
            continue

        segmented_sentence = " ".join(segmented_words)

        # 3. Language check (best effort)
        try:
            lang = detect(segmented_sentence)
        except Exception:
            # Detection failed for this text: keep the token, as before.
            # `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt / SystemExit propagate, so a running worker
            # can still be interrupted.
            pass
        else:
            if lang not in ['id', 'en']:
                continue

        # All filters passed
        local_doc_parts.append(segmented_sentence)
        local_candidates.add(token)

    return local_doc_parts, local_candidates

# --- 3. Main execution block (REQUIRED for multiprocessing) ---
if __name__ == "__main__":

    # --- Load Wordsegment Model (once in main process) ---
    # NOTE(review): the model is loaded only here, while process_line calls
    # wordsegment.segment inside worker processes. Presumably the workers
    # inherit the loaded state from this process — confirm for the
    # multiprocessing start method in use.
    try:
        wordsegment.load()
        print("Wordsegment model loaded.")
    except Exception as e:
        print(f"Could not load wordsegment data: {e}")
        print("Please run 'pip install wordsegment' and ensure you have internet.")
        raise e

    print(f"Loaded {len(relevant_text)} lines to process.")

    # --- 4. Process Data in Parallel ---
    # Accumulators filled from the per-line results below.
    doc_for_embedding_parts = []
    hashtag_candidates = set()

    print("Processing and filtering hashtags in parallel...")

    # Use ProcessPoolExecutor to run CPU-bound tasks in parallel.
    # No max_workers is given, so the pool size is the library default.
    with ProcessPoolExecutor() as executor:
        # 'executor.map' runs 'process_line' on each item in 'relevant_text'.
        # Wrapping the map iterator in tqdm gives one progress bar; list()
        # collects every (doc_parts, candidates) pair before the pool closes.
        results = list(tqdm(executor.map(process_line, relevant_text), total=len(relevant_text)))

    # --- 5. Aggregate Results from All Processes ---
    print("Aggregating results...")
    for doc_parts, candidates in results:
        doc_for_embedding_parts.extend(doc_parts)
        hashtag_candidates.update(candidates)

    # --- 6. Check if We Have Any Data Left ---
    # All kept segmentations are concatenated into ONE document string.
    doc_for_embedding = " ".join(doc_for_embedding_parts)
    hashtag_candidates_list = list(hashtag_candidates)

    if not doc_for_embedding or not hashtag_candidates_list:
        print("\nError: After filtering, no valid hashtags or corpus content was found.")
        print("Your dataset might be 100% noise, or your MIN_AVG_WORD_LEN is too high.")
    else:
        print(f"\nFiltered candidate count: {len(hashtag_candidates_list)}")
        print(f"Filtered 'meaning' doc sample: '{doc_for_embedding[:200]}...'")

        # --- 7. Load KeyBERT and Extract Keywords ---
        model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        kw_model = KeyBERT(model=model_name)

        print("\nRunning KeyBERT on filtered data...")

        # The single concatenated string is passed through the `docs`
        # keyword (this is the call that produced the output below), and the
        # kept original tokens are supplied as the keyword candidates.
        keywords = kw_model.extract_keywords(
            docs=doc_for_embedding,
            candidates=hashtag_candidates_list,
            top_n=50
        )

        # --- 8. View the Results ---
        # Each entry of `keywords` is unpacked as a (keyword, score) pair.
        print("\n--- Top 50 Semantically-Ranked (and Filtered) Hashtags ---")
        for keyword, score in keywords:
            print(f"  - '{keyword}' (Score: {score:.4f})")

Wordsegment model loaded.
Loaded 111949 lines to process.
Processing and filtering hashtags in parallel...


100%|██████████| 111949/111949 [10:29<00:00, 177.78it/s]


Aggregating results...

Filtered candidate count: 29121
Filtered 'meaning' doc sample: 'dan memb uk tik an pertamina ber jalan seperti yang bukan se kedar kembali kan ten tara barak ken apa inst ansi yang mem per boleh kan tidak dengan ran ah masih engg ak men yang ka ten tara beberapa y...'

Running KeyBERT on filtered data...





--- Top 50 Semantically-Ranked (and Filtered) Hashtags ---
  - 'negara2an' (Score: 0.4902)
  - 'negara' (Score: 0.4331)
  - 'debat' (Score: 0.4330)
  - 'penjahat2' (Score: 0.4272)
  - 'negara2' (Score: 0.4215)
  - 'debate' (Score: 0.4155)
  - 'pemerintahyg' (Score: 0.4122)
  - 'kebangsaan' (Score: 0.4111)
  - 'kesewenangwenangan' (Score: 0.4081)
  - 'kerana' (Score: 0.4033)
  - 'treasonous' (Score: 0.4028)
  - 'masyarakat' (Score: 0.4001)
  - 'authoritarianism' (Score: 0.3983)
  - 'lagipemrintah' (Score: 0.3968)
  - 'pemerintahanpeneliti' (Score: 0.3909)
  - 'pemerintahnya' (Score: 0.3893)
  - 'penyelewengannya' (Score: 0.3876)
  - 'ada' (Score: 0.3859)
  - 'pokoknyaaw' (Score: 0.3856)
  - 'bandit' (Score: 0.3836)
  - 'kesewenang2an' (Score: 0.3825)
  - 'anarchism' (Score: 0.3816)
  - 'menyangkutpautkan' (Score: 0.3780)
  - 'menyebabkannya' (Score: 0.3779)
  - 'ketidakterbukaan' (Score: 0.3763)
  - 'ketidaktahuannya' (Score: 0.3743)
  - 'penyebabnya' (Score: 0.3725)
  - 'kenapa2' (Sco

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_tokens(list_of_texts, n=10, stop_words=None, ngram_range=(1, 5)):
    """
    Count word/hashtag n-grams across a list of texts and return the N most
    frequent ones.

    A token is either a hashtag ('#' followed by word characters) or a word
    of two or more word characters; text is lower-cased first. With the
    default ``ngram_range`` the result contains n-grams of one to five
    consecutive tokens, not only single tokens. Pass ``ngram_range=(1, 1)``
    to count single tokens only.

    Args:
        list_of_texts (list): Texts to process. Non-string items are ignored.
        n (int): The number of top n-grams to return.
        stop_words (list, optional): Stopwords handed to CountVectorizer.
        ngram_range (tuple): ``(min_n, max_n)`` handed to CountVectorizer.
            Defaults to ``(1, 5)``, the value that used to be hard-coded.

    Returns:
        list: (ngram, count) tuples sorted by count descending; an empty
        list when ``list_of_texts`` contains no strings.
    """
    # Hashtag alternative comes first so "#python" is not matched as the
    # plain word "python".
    token_pattern = r"(#\w+|\b\w\w+\b)"

    # Filter out any non-string items
    valid_texts = [text for text in list_of_texts if isinstance(text, str)]

    if not valid_texts:
        return []

    vectorizer = CountVectorizer(
        token_pattern=token_pattern,
        lowercase=True,
        stop_words=stop_words,
        ngram_range=ngram_range
    )

    # Document-term matrix: one row per text, one column per n-gram
    X = vectorizer.fit_transform(valid_texts)

    # Column labels (the n-grams themselves)
    tokens = vectorizer.get_feature_names_out()

    # Column sums = total occurrences of each n-gram over all texts
    counts = X.sum(axis=0).A1

    tag_counts = list(zip(tokens, counts))

    # Sort by count in descending order and keep the top N
    sorted_tokens = sorted(tag_counts, key=lambda item: item[1], reverse=True)
    return sorted_tokens[:n]

# --- Usage ---

# Number of most frequent entries to keep (reassigns the `top_n` set by an
# earlier cell).
top_n = 200

# Rank word/hashtag n-grams in the relevant texts. The stopword set is
# converted to a list before being handed to the function.
top_tokens = get_top_n_tokens(relevant_text, n=top_n, stop_words=list(combined_stopwords))

# Print one line per (token, count) pair, most frequent first.
print(f"Top {top_n} most used tokens (words and hashtags):")
for token, count in top_tokens:
    print(f"  {token}: {count} times")



Top 200 most used tokens (words and hashtags):
  #indonesiagelap: 54270 times
  #tolakruupolri: 45941 times
  #peringatandarurat: 39188 times
  tni: 37127 times
  #tolakdwifungsiabri: 36801 times
  #tolakrevisiuutni: 36717 times
  #peringatandarurat #indonesiagelap: 34013 times
  #tolakruutni: 33754 times
  ruu: 27235 times
  #cabutruutni: 25443 times
  #supremasisipil: 24765 times
  ruu tni: 23497 times
  #tolakruukejaksaan: 23470 times
  #cabutuutni: 23239 times
  #indonesiagelap #tolakdwifungsiabri: 23145 times
  #tolakrevisiuutni #peringatandarurat: 21731 times
  #peringatandarurat #indonesiagelap #tolakdwifungsiabri: 21497 times
  #tolakrevisiuutni #peringatandarurat #indonesiagelap: 21101 times
  #tolakruupolri #tolakruukejaksaan: 20598 times
  #tolakrevisiuutni #peringatandarurat #indonesiagelap #tolakdwifungsiabri: 20294 times
  #tolakdwifungsiabri #supremasisipil: 19547 times
  #tolakruutni #tolakrevisiuutni: 18881 times
  #indonesiagelap #tolakdwifungsiabri #supremasisipil: 1

In [15]:
import re
from collections import Counter

def reaggregate_hashtag_counts(top_tokens):
    """
    Fold a list of (token, count) tuples into per-key totals.

    - A token without any hashtag (e.g. 'ruu tni') keeps its own key and
      its count is added to that key.
    - A token containing hashtags (e.g. '#tolakruu #tolakpolri') has its
      count added to *each* hashtag occurrence found in it; any plain
      words in such a token are not counted.

    Note: when the input holds overlapping n-grams, a hashtag's total is
    the sum over every n-gram that contains it, so it can exceed the number
    of times the hashtag actually occurs in the texts.

    Returns:
        list: (key, total) tuples sorted by total, descending.
    """
    hashtag_re = re.compile(r"#\w+")
    totals = Counter()

    for ngram, freq in top_tokens:
        # With no hashtag present the n-gram itself is the key; otherwise
        # every hashtag occurrence receives the full count.
        keys = hashtag_re.findall(ngram) or [ngram]
        for key in keys:
            totals[key] += freq

    return totals.most_common()
# Fold the n-gram counts from `top_tokens` into per-hashtag / per-n-gram totals.
reaggregated_tokens = reaggregate_hashtag_counts(top_tokens)

# Print every re-aggregated entry, largest total first.
print("Re-aggregated Top Tokens:")

for token, count in reaggregated_tokens:
    print(f"  {token}: {count} times")

Re-aggregated Top Tokens:
  #indonesiagelap: 453502 times
  #peringatandarurat: 416376 times
  #tolakrevisiuutni: 349856 times
  #tolakdwifungsiabri: 330818 times
  #tolakruupolri: 247640 times
  #tolakruutni: 221208 times
  #supremasisipil: 217982 times
  #cabutruutni: 133066 times
  #tolakuutni: 113151 times
  #tolakruukejaksaan: 106603 times
  #cabutuutni: 101564 times
  #gagalkanruutni: 62699 times
  #gagalkanuutni: 42117 times
  tni: 37127 times
  ruu: 27235 times
  ruu tni: 23497 times
  #tolakruukejaksan: 18548 times
  #kembalikantnikebarak: 16008 times
  dwifungsi: 12759 times
  #makzulkanprabowogibran: 9164 times
  indonesia: 8280 times
  rakyat: 6692 times
  demo: 6204 times
  uu: 5911 times
  abri: 5148 times
  orang: 5076 times
  negara: 5040 times
  banget: 4683 times
  dwifungsi abri: 4643 times
  mahasiswa: 4039 times
  sipil: 4035 times
  dpr: 3928 times
  uu tni: 3818 times
  pemerintah: 3807 times
  aksi: 3716 times
  polri: 3707 times
  demonstrasi: 3386 times
  teru

In [18]:
import wordsegment
from collections import Counter

# --- Load Wordsegment Model (Run this once) ---
# Segmentation in the rest of this cell depends on the model, so a load
# failure is reported and then re-raised instead of being ignored.
try:
    wordsegment.load()
    print("Wordsegment model loaded.")
except Exception as exc:
    print(f"Could not load wordsegment data: {exc}")
    raise

def clean_and_segment_token(token):
    """
    Normalise one token string for the final frequency table.

    - Non-string input returns None.
    - A token starting with '#' is returned unchanged.
    - Every other token, including one that is already a spaced n-gram,
      is passed to wordsegment.segment and the resulting pieces are joined
      with single spaces.

    NOTE(review): this notebook's output shows some words coming back in
    fragments (e.g. 'dwi fun gsi', 'mah as is wa'), so check the results
    before relying on them.
    """
    is_text = isinstance(token, str)

    # Hashtags pass through untouched.
    if is_text and token.startswith("#"):
        return token

    # Plain words / n-grams are re-segmented.
    if is_text:
        pieces = wordsegment.segment(token)
        return " ".join(pieces)

    # Anything that is not a string is treated as bad data.
    return None
# Apply the cleaning function and re-aggregate counts.
# A fresh Counter is used because different input tokens can map to the same
# cleaned string (e.g. "demomahasiswa" and "demo mahasiswa" are both meant to
# become "demo mahasiswa"), and their counts should then be combined.
final_clean_counts = Counter()

print("\nCleaning and re-aggregating final list...")
for token, count in reaggregated_tokens:
    # Find hashtags inside the token. NOTE: `re` is not imported in this
    # cell; it relies on the import made by an earlier cell.
    hashtags_found = re.findall(r"#\w+", token)
    
    if hashtags_found:
        # If the token is '#tolakruu #tolakpolri', add its count to both
        for tag in hashtags_found:
            final_clean_counts[tag] += count
    else:
        # Not a hashtag, so clean and segment it
        clean_token = clean_and_segment_token(token)
        # Skip None / empty results returned by the cleaning function
        if clean_token:
            final_clean_counts[clean_token] += count

# Print the 50 largest totals of the final list
print("\n--- Final Clean Token List ---")
for token, count in final_clean_counts.most_common(50):
    print(f"  '{token}': {count} times")

Wordsegment model loaded.

Cleaning and re-aggregating final list...

--- Final Clean Token List ---
  '#indonesiagelap': 453502 times
  '#peringatandarurat': 416376 times
  '#tolakrevisiuutni': 349856 times
  '#tolakdwifungsiabri': 330818 times
  '#tolakruupolri': 247640 times
  '#tolakruutni': 221208 times
  '#supremasisipil': 217982 times
  '#cabutruutni': 133066 times
  '#tolakuutni': 113151 times
  '#tolakruukejaksaan': 106603 times
  '#cabutuutni': 101564 times
  '#gagalkanruutni': 62699 times
  '#gagalkanuutni': 42117 times
  'tni': 37127 times
  'ruu': 27235 times
  'ruu tni': 23497 times
  '#tolakruukejaksan': 18548 times
  '#kembalikantnikebarak': 16008 times
  'dwi fun gsi': 12759 times
  '#makzulkanprabowogibran': 9164 times
  'indonesia': 8280 times
  'rakyat': 6692 times
  'demo': 6204 times
  'uu': 5911 times
  'abri': 5148 times
  'orang': 5076 times
  'negara': 5040 times
  'banget': 4683 times
  'dwi fun gsi abri': 4643 times
  'mah as is wa': 4039 times
  'sip il': 4