In [1]:
from datasets import load_dataset
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

In [2]:
cleaned_dataset = dataset["cleaned_labeled"]
relevant_dataset = cleaned_dataset.filter(lambda row: row["related"] == True)
irrelevant_dataset = cleaned_dataset.filter(lambda row: row["related"] == False)

In [3]:
relevant_text = relevant_dataset["content"]

In [5]:
import re
from collections import Counter

def get_top_n_hashtags(list_of_texts, n=10):
    """
    Extracts all hashtags from a list of text strings, counts them,
    and returns the top N most frequent hashtags.

    Args:
        list_of_texts (list): A list of strings to process.
        n (int): The number of top hashtags to return.

    Returns:
        list: A list of (hashtag, count) tuples, sorted by count descending.
    """
    all_hashtags = []
    
    # Regex pattern: finds a '#' followed by one or more
    # "word" characters (letters, numbers, or underscore).
    pattern = r"#\w+"
    
    for text in list_of_texts:
        # Ensure the item is a string before processing
        if isinstance(text, str):
            # Find all matches and convert to lowercase for consistent counting
            found_hashtags = re.findall(pattern, text.lower())
            all_hashtags.extend(found_hashtags)
            
    # Count the frequency of all found hashtags
    hashtag_counts = Counter(all_hashtags)
    
    # Get the top N most common
    top_n = hashtag_counts.most_common(n)
    
    return top_n

# 2. Set how many top hashtags you want
top_n = 100

# 3. Run the function
top_hashtags = get_top_n_hashtags(relevant_text, n=top_n)

In [10]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")
import requests

# Sastrawi
factory = StopWordRemoverFactory()
stopwords_id = set(factory.get_stop_words())

# English
stopwords_en = set(stopwords.words('english'))
# IndoNLU
indonlu_url = "https://raw.githubusercontent.com/IndoNLP/indonlu/master/utils/stopwordbahasa.txt"
stopwords_indonlu = set(requests.get(indonlu_url).text.split())

# IDN stopwords
idn_url = "https://raw.githubusercontent.com/Alir3z4/stop-words/master/indonesian.txt"
stopwords_idn = set(requests.get(idn_url).text.split())

# Colloquial extensions (dialect + slang)
extra_slang = {
    "ga","gak","nggak","ngga","aja","nih","dong","deh","lah","loh","kok",
    "kan","nya","ya","emang","tau","yg","pd","trs","pls","plis","thx","makasih",
    "makasi","terimakasih","bgt","bngt","bener","btw","rt","dm","gw","gue","lu",
    "loe","lo","gua","ny","nya","jd","jadi","trs","sih","kayak","kek","oke",
    "ok","bro","sis","min","kalo","kalau","dgn","dengan","bikin","nih","dong"
}

# Combine and lowercase normalize
combined_stopwords = set(
    word.lower() for word in (
        stopwords_id |
        stopwords_en |
        stopwords_indonlu |
        stopwords_idn |
        extra_slang
    )
)

print(len(combined_stopwords), "total stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


621 total stopwords


In [8]:
import requests
import wordsegment
from keybert import KeyBERT
from langdetect import detect
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
import os # Added for ProcessPoolExecutor

# --- 1. Set Global Constants ---
MIN_AVG_WORD_LEN = 2.5

# --- 2. Define Worker Function (MUST be at top level) ---
# This function will be sent to each CPU core.
def process_line(text_line):
    """
    Processes a single line of text to find valid, non-noise hashtags
    and their segmented "meaning".
    """
    local_doc_parts = []
    local_candidates = set()
    
    # Ensure input is a string
    if not isinstance(text_line, str):
        return local_doc_parts, local_candidates

    hashtags = text_line.split()

    for tag in hashtags:
        # 1. Segment the hashtag
        segmented_words = wordsegment.segment(tag)
        
        if not segmented_words:
            continue
            
        # 2. Calculate average word length
        avg_len = sum(len(w) for w in segmented_words) / len(segmented_words)
        
        # 3. Apply the filter
        if avg_len >= MIN_AVG_WORD_LEN:
            segmented_sentence = " ".join(segmented_words)
            
            # 4. Optional: Language check
            try:
                lang = detect(segmented_sentence)
                if lang not in ['id', 'en']:
                    continue
            except:
                pass 
                
            # If all filters pass, add the data
            local_doc_parts.append(segmented_sentence)
            local_candidates.add(tag)
            
    return local_doc_parts, local_candidates

# --- 3. Main execution block (REQUIRED for multiprocessing) ---
if __name__ == "__main__":
    
    # --- Load Wordsegment Model (once in main process) ---
    try:
        wordsegment.load()
        print("Wordsegment model loaded.")
    except Exception as e:
        print(f"Could not load wordsegment data: {e}")
        print("Please run 'pip install wordsegment' and ensure you have internet.")
        raise e
    
    print(f"Loaded {len(relevant_text)} lines to process.")

    # --- 4. Process Data in Parallel ---
    doc_for_embedding_parts = []
    hashtag_candidates = set()

    print("Processing and filtering hashtags in parallel...")
    
    # Use ProcessPoolExecutor to run CPU-bound tasks in parallel
    # It will automatically use all available CPU cores
    with ProcessPoolExecutor() as executor:
        # 'executor.map' runs 'process_line' on each item in 'relevant_text'
        # We wrap the map iterator in tqdm to get a single, clean progress bar
        results = list(tqdm(executor.map(process_line, relevant_text), total=len(relevant_text)))

    # --- 5. Aggregate Results from All Processes ---
    print("Aggregating results...")
    for doc_parts, candidates in results:
        doc_for_embedding_parts.extend(doc_parts)
        hashtag_candidates.update(candidates)

    # --- 6. Check if We Have Any Data Left ---
    doc_for_embedding = " ".join(doc_for_embedding_parts)
    hashtag_candidates_list = list(hashtag_candidates)

    if not doc_for_embedding or not hashtag_candidates_list:
        print("\nError: After filtering, no valid hashtags or corpus content was found.")
        print("Your dataset might be 100% noise, or your MIN_AVG_WORD_LEN is too high.")
    else:
        print(f"\nFiltered candidate count: {len(hashtag_candidates_list)}")
        print(f"Filtered 'meaning' doc sample: '{doc_for_embedding[:200]}...'")

        # --- 7. Load KeyBERT and Extract Keywords ---
        model_name = "paraphrase-multilingual-MiniLM-L12-v2"
        kw_model = KeyBERT(model=model_name)

        print("\nRunning KeyBERT on filtered data...")
        
        # **FIX**: Use 'doc=' for a single string, not 'docs='
        keywords = kw_model.extract_keywords(
            docs=doc_for_embedding,      
            candidates=hashtag_candidates_list, 
            top_n=50
        )

        # --- 8. View Your (Now Correct) Results ---
        print("\n--- Top 50 Semantically-Ranked (and Filtered) Hashtags ---")
        for keyword, score in keywords:
            print(f"  - '{keyword}' (Score: {score:.4f})")

Wordsegment model loaded.
Loaded 111949 lines to process.
Processing and filtering hashtags in parallel...


100%|██████████| 111949/111949 [10:29<00:00, 177.78it/s]


Aggregating results...

Filtered candidate count: 29121
Filtered 'meaning' doc sample: 'dan memb uk tik an pertamina ber jalan seperti yang bukan se kedar kembali kan ten tara barak ken apa inst ansi yang mem per boleh kan tidak dengan ran ah masih engg ak men yang ka ten tara beberapa y...'

Running KeyBERT on filtered data...





--- Top 50 Semantically-Ranked (and Filtered) Hashtags ---
  - 'negara2an' (Score: 0.4902)
  - 'negara' (Score: 0.4331)
  - 'debat' (Score: 0.4330)
  - 'penjahat2' (Score: 0.4272)
  - 'negara2' (Score: 0.4215)
  - 'debate' (Score: 0.4155)
  - 'pemerintahyg' (Score: 0.4122)
  - 'kebangsaan' (Score: 0.4111)
  - 'kesewenangwenangan' (Score: 0.4081)
  - 'kerana' (Score: 0.4033)
  - 'treasonous' (Score: 0.4028)
  - 'masyarakat' (Score: 0.4001)
  - 'authoritarianism' (Score: 0.3983)
  - 'lagipemrintah' (Score: 0.3968)
  - 'pemerintahanpeneliti' (Score: 0.3909)
  - 'pemerintahnya' (Score: 0.3893)
  - 'penyelewengannya' (Score: 0.3876)
  - 'ada' (Score: 0.3859)
  - 'pokoknyaaw' (Score: 0.3856)
  - 'bandit' (Score: 0.3836)
  - 'kesewenang2an' (Score: 0.3825)
  - 'anarchism' (Score: 0.3816)
  - 'menyangkutpautkan' (Score: 0.3780)
  - 'menyebabkannya' (Score: 0.3779)
  - 'ketidakterbukaan' (Score: 0.3763)
  - 'ketidaktahuannya' (Score: 0.3743)
  - 'penyebabnya' (Score: 0.3725)
  - 'kenapa2' (Sco

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_tokens(list_of_texts, n=10, stop_words=None):
    """
    Extracts all words and hashtags from a list of text strings, 
    counts them, and returns the top N most frequent tokens.

    Args:
        list_of_texts (list): A list of strings to process.
        n (int): The number of top tokens to return.
        stop_words (list, optional): A list of stopwords to ignore.

    Returns:
        list: A list of (token, count) tuples, sorted by count descending.
    """
    
    # This regex pattern finds:
    # 1. (#\w+)   : A hashtag (e.g., #python)
    # 2. |         : OR
    # 3. (\b\w\w+\b): A regular word of 2 or more characters (e.g., python)
    # The order is important so "#python" isn't matched as just "python".
    token_pattern = r"(#\w+|\b\w\w+\b)"
    
    # Filter out any non-string items
    valid_texts = [text for text in list_of_texts if isinstance(text, str)]
    
    if not valid_texts:
        return []

    # Initialize the vectorizer with the new pattern and stopwords
    vectorizer = CountVectorizer(
        token_pattern=token_pattern, 
        lowercase=True,
        stop_words=stop_words 
    )
    
    # 1. Fit and transform the text
    X = vectorizer.fit_transform(valid_texts)
    
    # 2. Get the feature names (words and hashtags)
    tokens = vectorizer.get_feature_names_out()
    
    # 3. Sum the counts for each token
    counts = X.sum(axis=0).A1
    
    # 4. Zip tokens and counts
    tag_counts = list(zip(tokens, counts))
    
    # 5. Sort by count in descending order
    sorted_tokens = sorted(tag_counts, key=lambda item: item[1], reverse=True)
    
    # 6. Return the top N
    return sorted_tokens[:n]

# --- Example Usage ---

# 3. Set how many top tokens you want
top_n = 100

# 4. Run the function
top_tokens = get_top_n_tokens(relevant_text, n=top_n, stop_words=list(combined_stopwords))

# 5. Print the results
print(f"Top {top_n} most used tokens (words and hashtags):")
for token, count in top_tokens:
    print(f"  {token}: {count} times")

InvalidParameterError: The 'stop_words' parameter of CountVectorizer must be a str among {'english'}, an instance of 'list' or None. Got {'other', "she'll", 'gue', 'gw', 'mungkin', 'his', 'berapalah', 'sekarang', 'percuma', 'bener', 'me', 'nantinya', 'dapat', 'diantaranya', "we'll", 'beginian', 'sudahlah', 'seperti', 'by', 'hasn', 'haven', 'justru', 'sedangkan', 'meski', 'telah', 'them', 'tetapi', 'loh', 'sambil', 'who', 'bagaikan', 'dimana', 'khususnya', "won't", 'mustn', 'don', 'sana', 'yang', 'saat', 'bagaimana', "weren't", 'such', 'pun', 'bukannya', 'waduh', 'demikian', 'because', 'it', 'terhadapnya', 'dm', 'himself', 'biasa', 'couldn', 'she', 'sela', 'nanti', 'mightn', 'him', 'sebuah', 'sama', 'deh', 'atau', 'manakala', "didn't", 'bukanlah', 'sebegitu', 'has', 'kami', 'seraya', 'sepanjang', 'than', 'saatnya', 'dsb', 'they', 't', "you'd", 'cuma', 'dengan', "wouldn't", 'kemana', 'enggak', "i've", 'do', 'pantas', 'makasi', 'no', 'sedang', 'kepada', 'did', 'per', 'tidak', 'bila', 'hendak', 'hendaklah', 'sesaat', 'begitu', 'doesn', 'both', 'dekat', 'itself', 'was', 'on', 'between', 'themselves', 'sebagainya', 'semua', 'terlebih', 'lah', 'jikalau', 'masihkah', 'selain', 'setiap', 'only', 'bagi', 'just', 'mengapa', "he's", 'hadn', 'dari', 'meskipun', 'lo', "shan't", 'nyaris', 'disinilah', 'nah', 'selamanya', 'mampukah', 'an', 'dini', "i'm", 'yg', 'antar', 'weren', 'your', 'dua', 'demikianlah', 've', 'oleh', 'memang', 'apa', 'up', 'walau', 'is', 'pasti', 'of', 'ga', 'yakni', 'but', 'aren', 'while', 'lalu', 'hanya', "it'll", 'malah', 'dialah', 'tidaklah', "they've", 'bagaimanapun', 'di', 'sekalian', 'sekaligus', 'seringnya', 'been', 'dst', 'begitulah', 'having', 'sepertinya', "we've", 'yourself', 'dan', '404:', 'yaitu', 'be', 'rt', 'sesama', 'tanpa', 'siapakah', 'we', 'anu', 'seluruhnya', 'antara', "we're", 'sudah', "that'll", "it's", 'boleh', 'what', 'entahlah', 'nih', 'tadinya', 'haruslah', 'kecuali', 'bahkan', 'bukankah', 'does', 'will', 'loe', 'hal', 'sering', 'seorang', 'didn', 'selagi', 'tapi', 'sesudah', 'isn', 'banyak', 'kiranya', 'sih', 'nor', 'wong', "he'd", 'sebelum', 'bilakah', 'ours', 'selaku', 'about', 'serupa', 'sehingga', 'those', 'kamilah', 'terdiri', 'sesuatunya', 'sekalipun', 'siapa', 'sis', 'kenapa', 'kan', 'sebanyak', 'kapan', 'kini', 'o', 'sampai', "you've", 'demi', "doesn't", "he'll", 'nggak', 'saling', 'semuanya', 'once', 'secara', "should've", 'seolah', 'ini', "mightn't", 'amat', 'pada', 'under', 'sudahkah', 'off', "needn't", 'buat', 'ngga', 'aku', 'ada', 'bagaimanakah', 'sedikitnya', 'ma', 'wasn', 'toh', 'more', 'bagai', 'these', 'any', 'apabila', 'll', 'bro', 'against', 'm', 'apakah', 'again', "haven't", 'you', 'dia', 'pernah', 'para', "mustn't", 'which', 'ataukah', 'jika', 'not', 'wah', 'over', 'berapakah', 'bisa', 'wahai', 'jd', 'sayalah', 'begini', 'kalau', 'dahulu', 'kemudian', 'se', 'seketika', 'where', 'whom', 'seharusnya', 'sepantasnya', 'sesegera', 'being', 'sedemikian', 'down', 'kala', 'apatah', 'seseorang', 'guna', 'adalah', 'he', 'below', 'tentang', 'during', 'bisakah', 'manalagi', 'pd', 'kek', 'lu', 'hampir', 'sebetulnya', 'nya', 'bahwa', 'bermacam', 'semasih', 'then', 'sekitar', 'akan', 'begitupun', 'all', 'that', "you'll", 'padahal', 'its', 'semaunya', 'kinilah', 'masing', 'with', 'bngt', 'depan', 'aja', 'can', 'oh', 'untuk', 'i', 'setidaknya', 'mampu', 'dikarenakan', 'dgn', 'kok', 'the', 'sebegini', 'pula', 'sebelumnya', 'begitukah', 'hingga', 'after', 'too', 'lainnya', 'terimakasih', 'suatu', 'their', 'sebenarnya', "don't", 'am', 'juga', 'few', 'setelah', "she's", 'ibarat', 'adanya', 'were', 'apaan', 'gua', 'agar', 'tiap', 'for', 'karenanya', 'hendaknya', 'inilah', 'kamulah', 'and', 'oke', 'karena', 'serta', 'my', 'ingin', 'her', 'sendirinya', 'namun', "they'd", 'apalagi', 'kalian', 'kepadanya', 'tersebutlah', 'trs', 'dulu', 'btw', 'bolehkah', 'sesuatu', 'inginkan', 'kitalah', 'rupanya', 'beginikah', 'tolong', "hadn't", 'plis', 'dong', 'bukan', 'jangan', 'merupakan', 'terhadap', 'andalah', 'bersama', 'ke', 'tentunya', "they're", 'sementara', 'or', 'sewaktu', 'herself', 'enggaknya', 'biasanya', 'ny', 'in', 'from', 'makanya', 'jadi', 'as', 'into', 'siapapun', 'kapankah', 'our', 'thx', 'gak', 'belum', 'bahwasanya', 'above', 'diri', 'kecil', 'lamanya', 'disini', 'myself', 'selama', 'betulkah', 'berapa', "shouldn't", 'lain', 'kalaupun', 'walaupun', 'have', 'how', 'some', 'why', 'dirinya', 'maka', 'wouldn', 'tertentu', 'sebagaimana', 'ourselves', 'min', "couldn't", "you're", 'maupun', 'kalo', 'sedikit', 'yours', 'tadi', 'a', 'tidakkah', 'needn', 'out', 'tentulah', 'tau', 'kah', 'saja', "aren't", 'kapanpun', 'merekalah', 's', "she'd", 'bgt', 'own', 'entah', 'adapun', 'sekiranya', 'seluruh', 'sesekali', 'ok', 'padanya', 'pastilah', 'sesudahnya', 'found', 'd', "we'd", 'to', 'kamu', 'terlalu', 'sekali', 'tak', 'ya', 'itu', 'shan', 'each', 'tersebut', 'there', 'anda', 'makin', 'sangatlah', 'segala', "it'd", 'ialah', 'inginkah', 'beberapa', 'seterusnya', 'if', 'until', 'mungkinkah', 'sangat', 'segalanya', 'shouldn', 'semacam', 'further', 'dulunya', 'yourselves', 'tentu', 'kita', 'y', 'akhirnya', 'mereka', 'daripada', 'sinilah', 'supaya', 'most', 'kembali', "isn't", 'at', 'kayak', 'sejenak', 'when', 're', 'lagi', 'ataupun', 'won', 'sini', 'agaknya', 'paling', 'belumlah', 'saya', 'sendiri', 'sepantasnyalah', 'same', 'harusnya', 'sebaliknya', 'through', 'ain', 'semula', 'berapapun', 'agak', 'melalui', 'akulah', 'melainkan', 'lama', 'sebisanya', 'macam', 'olehnya', 'before', 'lagian', 'mau', 'mana', 'here', 'lebih', 'now', 'janganlah', 'amatlah', 'sempat', 'malahan', 'mari', 'dalam', 'sebabnya', 'ketika', "i'd", "wasn't", 'antaranya', 'sejak', 'sebab', 'so', "hasn't", 'pls', 'theirs', 'are', 'very', 'segera', "they'll", 'sekitarnya', 'semakin', 'this', 'harus', 'itulah', 'makasih', 'seberapa', 'akankah', 'selalu', 'masih', 'should', 'had', 'sebagai', 'kalaulah', 'beginilah', 'sajalah', 'ia', 'dll', 'bolehlah', 'menurut', 'emang', 'diantara', 'hanyalah', 'hers', 'jangankan', 'bikin', 'doing', 'inikah', "i'll", 'itukah'} instead.