In [1]:
import re, unicodedata, jaconv, emoji

# Precompiled patterns for the tweet-cleaning step.

# An http:// or https:// link, running up to the next whitespace character.
_URL      = re.compile(r'https?://\S+')
# "@" followed by one or more word characters.
_MENTION  = re.compile(r'@\w+')
# Any run of whitespace.
_WS       = re.compile(r'\s+')
# Case-insensitive "kutipan" plus everything after it up to the end of the
# string (DOTALL lets "." cross newlines).
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

# The three _DARI_* patterns below target a "dari <domain>" attribution and
# stray "dari" suffixes. NOTE: cleantext does not currently apply any of them.

# Non-whitespace text glued directly onto "dari", then whitespace and a
# domain-like token ("word" + "dari" + "domain.com"). Group 1 holds the text
# before "dari", so substituting r'\1' keeps only that text.
# \S+ (rather than \w+) is used so punctuation such as '!' is included.
_DARI_URL_ATTACHED = re.compile(r'(\S+)dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Whitespace, "dari", whitespace, then a domain-like token (" dari domain.com");
# meant to be replaced with the empty string.
_DARI_URL_SPACED = re.compile(r'\s+dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# Any non-whitespace run that ends in "dari" at a word boundary
# (e.g. "anarko!dari", "negaradari"); group 1 is the part before "dari".
# NOTE(review): this also matches ordinary words that merely end in "dari".
_DARI_STUCK = re.compile(r'(\S+)dari\b', re.I)

# Leading retweet marker ("rt "/"RT "), tolerating whitespace in front of it.
_RT_PREFIX = re.compile(r'^\s*rt\s+', re.I)
# A four-digit number glued to a following ASCII letter (e.g. "2024abc").
_YEAR_GLUED = re.compile(r'(\b\d{4})(?=[a-zA-Z])')


def cleantext(row: dict) -> dict:
    """
    Normalize the ``content`` field of one dataset row.

    Written for ``Dataset.map``: the row is updated in place and returned.

    Steps, in order:
      1. NFKC-normalize, then convert full-width digits/ASCII to half-width.
      2. Blank out fixed boilerplate fragments and literal ``\\n`` / ``\\r``
         escape sequences.
      3. Replace URLs with `` <url> `` and mentions with ``@USER``.
      4. Drop a leading retweet marker, separate a four-digit number from a
         letter glued to it, and cut everything from "kutipan" onwards.
      5. Collapse whitespace runs to single spaces and strip the ends.

    Args:
        row: Mapping with a ``content`` key holding the tweet text.

    Returns:
        The same ``row`` object with ``content`` replaced by the cleaned
        text. A row whose ``content`` is ``None`` is returned unchanged.
    """
    raw = row["content"]
    if raw is None:
        # Nothing to clean; leave the missing value as-is instead of crashing
        # inside unicodedata.normalize.
        return row

    text = unicodedata.normalize('NFKC', raw)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)

    # Fixed fragments to blank out (presumably scraped UI text; verify
    # against the raw data).
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")

    # These are the literal two-character sequences backslash+n and
    # backslash+r; real newline characters are handled by the final
    # whitespace collapse.
    text = text.replace('\\n', ' ').replace('\\r', ' ')

    # Handle standard URLs first
    text = _URL.sub(' <url> ', text)
    text = text.replace('ini tidak tersedia', ' ')

    text = _MENTION.sub('@USER', text)
    # Whitespace is only collapsed at the very end, so the retweet marker may
    # still be preceded by blanks at this point; the pattern allows for that.
    text = _RT_PREFIX.sub('', text)
    text = _YEAR_GLUED.sub(r'\1 ', text)
    text = _KUTI_CUT.sub('', text)

    # The _DARI_URL_ATTACHED / _DARI_URL_SPACED / _DARI_STUCK substitutions
    # are currently disabled and therefore not applied here.

    text = _WS.sub(' ', text).strip()
    row["content"] = text
    return row

In [11]:
from datasets import load_dataset, Dataset
# Load every split of the Hub dataset, caching downloads under cache/.
dataset = load_dataset("tianharjuno/twitter-parse", cache_dir="cache/")

# Clean the labeled source split with cleantext, using 30 worker processes.
source_ds = dataset["source_labeled"].map(cleantext, num_proc=30)

# Round-trip through pandas to keep only the first row for each distinct
# cleaned text, then rebuild a Dataset from the de-duplicated frame.
source_df = (
    source_ds.to_pandas()
    .drop_duplicates(subset="content", keep="first")
    .reset_index(drop=True)
)
source_ds = Dataset.from_pandas(source_df)

In [15]:
# Keep only rows flagged as relevant, then draw fixed-size train/test splits
# with a fixed seed so the sampling is repeatable.
relevant_only = source_ds.filter(lambda example: example["relevant"] == True)
relevant_sampled_split = relevant_only.train_test_split(
    train_size=30000,
    test_size=10000,
    seed=42,
)

# Register the sampled splits on the DatasetDict under *_sentiment names.
dataset["train_sentiment"] = relevant_sampled_split["train"]
dataset["test_sentiment"] = relevant_sampled_split["test"]
dataset

Filter:   0%|          | 0/195952 [00:00<?, ? examples/s]

DatasetDict({
    source_stage_1: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 201583
    })
    source_stage_2: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 247820
    })
    cleaned: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 195952
    })
    test: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 5000
    })
    train: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
 

In [17]:
import numpy as np
from datasets import Dataset

# Assuming 'dataset' is your Hugging Face DatasetDict containing the splits:
# dataset["train_sentiment"]
# dataset["test_sentiment"]

def check_for_leakage(train_ds: "Dataset", test_ds: "Dataset", id_column: str = "tweet_id"):
    """
    Report values of ``id_column`` that occur in both splits.

    Prints a short summary to stdout: the number of overlapping values, their
    share of the training rows, and up to five example values.

    Args:
        train_ds: Training split. Must expose ``num_rows`` and support
            ``train_ds[id_column]`` returning an iterable of hashable values.
        test_ds: Testing split, with the same requirements.
        id_column: Name of the column whose values are compared.

    Returns:
        The set of values present in both splits (empty when there is no
        overlap), or ``None`` when a split has no column named ``id_column``.
    """
    print(f"Checking {train_ds.num_rows} training samples against {test_ds.num_rows} testing samples...")

    # 1. Collect the column values into sets for O(1) membership tests.
    try:
        train_ids = set(train_ds[id_column])
        test_ids = set(test_ds[id_column])
    except KeyError:
        print(f"Error: The datasets do not contain a column named '{id_column}'. Please verify the ID column name.")
        return None

    # 2. Values present on both sides are the leaked samples.
    overlapping_ids = train_ids & test_ids
    overlap_count = len(overlapping_ids)

    print("\n--- Leakage Detection Results ---")

    if overlap_count > 0:
        print("🚨🚨 **CRITICAL LEAKAGE DETECTED!** 🚨🚨")
        print(f"Found **{overlap_count}** samples present in BOTH the training and testing sets.")
        # overlap_count > 0 implies the training split is non-empty, so this
        # division cannot hit zero.
        print(f"This represents {overlap_count / train_ds.num_rows * 100:.4f}% of the training data.")
        print("This overlap must be removed to ensure valid model evaluation.")

        # Show a handful of examples; set order is arbitrary, so "first" only
        # means "some five".
        print("\nFirst 5 Overlapping IDs:")
        for tid in list(overlapping_ids)[:5]:
            print(f"- {tid}")

    else:
        print("✅ **No sample overlap detected.** The splits are unique by ID.")

    return overlapping_ids
# Compare the two sentiment splits on the "content" column, so the check
# looks for identical tweet text shared between train and test.
overlapping_ids = check_for_leakage(
    dataset["train_sentiment"],
    dataset["test_sentiment"],
    id_column="content",
)

Checking 30000 training samples against 10000 testing samples...

--- Leakage Detection Results ---
✅ **No sample overlap detected.** The splits are unique by ID.


In [18]:
# Publish the DatasetDict, which now also carries the train_sentiment and
# test_sentiment splits, back to the Hub repository.
dataset.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_message="Created splits for sentiment classifier training",
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/94adc90dba7861ea837ddb3dac78a2afb3795eb8', commit_message='Created splits for sentiment classifier training', commit_description='', oid='94adc90dba7861ea837ddb3dac78a2afb3795eb8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)