## Upload all of the initially parsed data to Hugging Face

In [None]:
import json

def normalize_mongo_json(data):
    """Flatten a MongoDB Extended JSON export into plain JSON records.

    For every record in ``data``:

    * the fields ``_id``, ``__v`` and ``created_at`` are dropped;
    * any value of the form ``{"$date": ...}`` is replaced by an ISO 8601
      string without a trailing ``Z``;
    * every other value is copied through unchanged.

    The ``$date`` payload may be an ISO string (relaxed Extended JSON), a
    number of milliseconds since the Unix epoch (legacy exports), or a
    ``{"$numberLong": "<ms>"}`` wrapper (canonical Extended JSON). Numeric
    forms are rendered as ``YYYY-MM-DDTHH:MM:SS.mmm``.

    Args:
        data: Iterable of dict records as loaded from the export file.

    Returns:
        A new list of dicts; the input records are not modified.
    """
    from datetime import datetime, timedelta

    dropped_fields = {"_id", "__v", "created_at"}
    epoch = datetime(1970, 1, 1)

    def unwrap_date(raw):
        # Canonical Extended JSON nests the millisecond count one level deeper.
        if isinstance(raw, dict) and "$numberLong" in raw:
            raw = int(raw["$numberLong"])
        if isinstance(raw, str):
            # Already ISO 8601: only remove the trailing UTC designator.
            return raw.rstrip("Z")
        if isinstance(raw, (int, float)) and not isinstance(raw, bool):
            # Adding a timedelta to the epoch also handles negative
            # (pre-1970) values.
            moment = epoch + timedelta(milliseconds=raw)
            return moment.isoformat(timespec="milliseconds")
        # Unknown payload shape: keep it rather than crash on .rstrip().
        return raw

    cleaned = []
    for item in data:
        obj = {}
        for k, v in item.items():
            if k in dropped_fields:
                continue
            if isinstance(v, dict) and "$date" in v:
                obj[k] = unwrap_date(v["$date"])
            else:
                obj[k] = v
        cleaned.append(obj)

    return cleaned


# Normalize each raw MongoDB export and write the flattened records to
# parsed_dump_stage_<n>.json. The ``with`` blocks close every file handle
# (the previous ``open(...).read()`` left the input handle to the garbage
# collector), and the input is read explicitly as UTF-8 rather than with the
# platform's default encoding.
for stage in (1, 2):
    with open(f"raw_dump_stage_{stage}.json", encoding="utf-8") as f:
        raw_json = json.load(f)
    clean_json = normalize_mongo_json(raw_json)

    with open(f"parsed_dump_stage_{stage}.json", "w") as f:
        json.dump(clean_json, f, indent=2)


FileNotFoundError: [Errno 2] No such file or directory: 'out/raw_dump_stage_1.json'

In [24]:
from datasets import Dataset

# Load both normalized dumps as datasets and report their row counts.
stage_1_dataset, stage_2_dataset = (
    Dataset.from_json(f"parsed_dump_stage_{stage}.json") for stage in (1, 2)
)
print(len(stage_1_dataset), len(stage_2_dataset))

201583 247820


In [None]:
import json
from datasets import load_dataset, Dataset, DatasetDict
def _default_labels(example):
    """Return the placeholder annotation columns added to every row.

    ``example`` is the row passed in by ``Dataset.map``; it is not inspected.
    A fresh dict is returned on each call.
    """
    return {
        "relevant": False,  # default boolean
        "sentiment": 0,  # default integer (-1, 0, 1 allowed)
    }


# Load each parsed dump and attach the same default label columns to both,
# using one shared function instead of two copies of the same lambda.
dataset_1 = Dataset.from_json("parsed_dump_stage_1.json").map(_default_labels)
dataset_2 = Dataset.from_json("parsed_dump_stage_2.json").map(_default_labels)

Map:   0%|          | 0/201583 [00:00<?, ? examples/s]

Map:   0%|          | 0/247820 [00:00<?, ? examples/s]

In [None]:
# Bundle both stages as named splits and publish them to the Hub.
dataset_dict = DatasetDict(
    {
        "source_stage_1": dataset_1,
        "source_stage_2": dataset_2,
    }
)
dataset_dict.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_message="Initial Commit",
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/619 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/dbf93f69b6513a82701a40b76a6e91b8a84dc3b6', commit_message='Initial Commit', commit_description='', oid='dbf93f69b6513a82701a40b76a6e91b8a84dc3b6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)

In [1]:
from datasets import load_dataset
# Pull the published dataset (cached under ./cache/) and work on stage 2.
ds = load_dataset(
    "tianharjuno/twitter-parse",
    cache_dir="cache/",
)
raw_ds = ds["source_stage_2"]

In [2]:
# Preview the first 50 tweets. Slicing rows first reads only those 50 rows
# instead of materializing the whole "content" column and then slicing it.
raw_ds[:50]["content"]

['Kembalikan Tentara ke barak\n#TolakRUUTNI',
 'Masih ga nyangka suamiku tentara #tolakruutni',
 'Beberapa zine yg akan saya bawa saat melapak nanti\n\n#TolakRUUTNI \n#SupremasiSipil',
 'Sebagai mahasiswa hukum, kami memiliki tanggung jawab akademik untuk mengawal proses pembentukan undang-undang agar memiliki proses partisipasi yg bermakna!\n\nberjuang turun ke jalan, bahkan berjuang di ruang sidang, semua sah dilakukan. yg penting #tolakruutni',
 'sekali-kali ga nyebelin ye kite #FallingInLoveEra #TolakRUUTNI',
 'Presiden kita ini udah KOSONG dari pas zaman debat pilpres. Dan skrg negara kita di pimpin oleh 2 manusia KOSONG bin DONGO \n\n#TolakRUUTNI #SupremasiSipil',
 'Kalo bisa menteri dan pejabat negara lainnya ikut disandera juga.. wkwkwkkwk\n\n#TolakRUUTNI QRIS',
 'Great \nECCO CHI Ses\nBingx3\n#shailenzo #zelena #yulioli #grandefratello #zeudiners #ENHYPEN #ホットスポット #Trump #TolakRUUTNI',
 'Kalau mau bertani jangan jadi tentara.\n#TolakRUUTNI',
 '"Whenever I lay down to close my 

In [3]:
import re, unicodedata, jaconv, emoji

_URL      = re.compile(r'https?://\S+')
_MENTION  = re.compile(r'@\w+')
_WS       = re.compile(r'\s+')
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)

# --- (MODIFIED) ---
# Catches "word" + "dari" + "domain.com" -> replaces with "word"
# Changed \w+ to \S+ to include punctuation like '!'
_DARI_URL_ATTACHED = re.compile(r'(\S+)dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I) 

# Catches " dari " + "domain.com" -> replaces with empty string
_DARI_URL_SPACED = re.compile(r'\s+dari\s+([a-z0-9.-]+\.[a-z]{2,})\b', re.I)

# --- (NEW) ---
# Catches any word ending in "dari" (e.g., "anarko!dari", "negaradari")
_DARI_STUCK = re.compile(r'(\S+)dari\b', re.I)

# Precompiled like the other patterns in this cell, instead of passing the
# pattern source to ``re.sub`` on every row.
_RT_PREFIX = re.compile(r'^rt\s+', re.I)
_DIGITS_GLUED_TO_LETTER = re.compile(r'(\b\d{4})(?=[a-zA-Z])')


def cleantext(row: dict) -> dict:
    """Normalize the ``"content"`` text of one dataset row.

    The steps run in this order:

    1. Unicode NFKC normalization, then ``jaconv.z2h`` for digits and ASCII
       (full-width to half-width per jaconv's documentation; kana excluded).
    2. Remove the literal phrases ``"tanya grok"`` and
       ``"grokproductivitypasang"`` (presumably scraped UI text; confirm
       against the scraper) and literal backslash-n / backslash-r sequences.
    3. Replace URLs with `` <url> `` and drop ``"ini tidak tersedia"``.
    4. Replace ``@mentions`` with ``@USER``.
    5. Strip a leading ``rt`` marker (case-insensitive).
    6. Insert a space after four digits that follow a word boundary and are
       directly followed by a letter.
    7. Cut everything from the first ``kutipan`` (case-insensitive) onward.
    8. Collapse whitespace runs to single spaces and trim both ends.

    Args:
        row: Mapping with a string ``"content"`` entry, as passed by
            ``Dataset.map``.

    Returns:
        The same ``row`` object, with ``"content"`` replaced by the cleaned
        text (the input is updated in place).
    """
    text = row["content"]
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)
    text = text.replace("tanya grok", " ")
    text = text.replace("grokproductivitypasang", " ")
    # These match the two-character sequences backslash+n / backslash+r;
    # real newline characters are handled by the whitespace collapse below.
    text = text.replace('\\n', ' ').replace('\\r', ' ')

    # Handle standard URLs first, before mentions and the other rewrites.
    text = _URL.sub(' <url> ', text)
    text = text.replace('ini tidak tersedia', ' ')

    text = _MENTION.sub('@USER', text)
    text = _RT_PREFIX.sub('', text)
    text = _DIGITS_GLUED_TO_LETTER.sub(r'\1 ', text)
    text = _KUTI_CUT.sub('', text)

    # The "dari <domain>" stripping steps are currently disabled:
    # text = _DARI_URL_ATTACHED.sub(r'\1', text)
    # text = _DARI_URL_SPACED.sub('', text)
    # text = _DARI_STUCK.sub(r'\1', text)

    text = _WS.sub(' ', text).strip()
    row["content"] = text
    return row

In [4]:
from datasets import Dataset
# Clean every tweet, then keep only the first row for each distinct cleaned
# text. Deduplication goes through pandas and back to a Dataset.
raw_ds = raw_ds.map(cleantext)

raw_df = (
    raw_ds.to_pandas()
    .drop_duplicates(subset="content", keep="first")
    .reset_index(drop=True)
)
raw_ds_dedup = Dataset.from_pandas(raw_df)


Map:   0%|          | 0/247820 [00:00<?, ? examples/s]

In [5]:
# Show the deduplicated dataset's columns and row count.
raw_ds_dedup

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
    num_rows: 195952
})

In [6]:
# Dump the "content" column of raw_ds to reader.txt, one entry per "\n"
# separator, with no trailing newline.
# NOTE(review): this reads raw_ds, not raw_ds_dedup. Confirm that the
# non-deduplicated text is what should be inspected.
content_list = raw_ds["content"]
reader_text = "\n".join(content_list)

# UTF-8 is set explicitly so non-ASCII characters are written the same way
# on every platform.
with open("reader.txt", mode="w", encoding="utf-8") as file:
    file.write(reader_text)

In [14]:
# Register the cleaned, deduplicated tweets as the "cleaned" split.
ds["cleaned"] = raw_ds_dedup

In [15]:

# Publish the DatasetDict, including the new "cleaned" split, to the Hub.
ds.push_to_hub("tianharjuno/twitter-parse", commit_message="cleaned up text")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/71d4a3692c4929d47ba0b545f276e8a46adb879e', commit_message='cleaned up text', commit_description='', oid='71d4a3692c4929d47ba0b545f276e8a46adb879e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)

In [16]:
# Show the "cleaned" split's columns and row count.
ds["cleaned"]

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
    num_rows: 195952
})

In [17]:
# Carve a fixed-size sample out of the cleaned tweets (seeded, so repeatable).
train_ds = ds["cleaned"]
train_ds_split = train_ds.train_test_split(
    train_size=5000,
    seed=42,
)
# Naming caveat: the 5000-row "train" side of this split is used as the TEST
# set, and the remaining "test" side is the pool for later sampling.
test_ds, leftover_ds = train_ds_split["train"], train_ds_split["test"]

In [18]:
# Draw the 20000-row training set from the rows left over after the test
# sample was removed (seeded, so repeatable).
leftover_ds_split = leftover_ds.train_test_split(
    train_size=20000,
    seed=42,
)
final_train_ds = leftover_ds_split["train"]
final_train_ds

Dataset({
    features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
    num_rows: 20000
})

In [19]:
# Show every split currently held in the DatasetDict.
ds

DatasetDict({
    source_stage_1: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 201583
    })
    source_stage_2: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 247820
    })
    cleaned: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 195952
    })
    test: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
        num_rows: 5000
    })
    train: Dataset({
        features: ['tweet_id', 'time', 'author', 'content', 'comment_count', 'repost_count', 'like_count', 'view_count', 'relevant', 'sentiment'],
 

In [20]:
# Register the sampled splits: "test" is the 5000-row sample and "train" is
# the 20000-row sample drawn from the remaining rows.
ds["test"] = test_ds
ds["train"] = final_train_ds

# Publish the DatasetDict with the new splits to the Hub.
ds.push_to_hub("tianharjuno/twitter-parse", commit_message="Created a test dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/202 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/248 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/196 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.


README.md: 0.00B [00:00, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/2fee1a3524a90baca823df9239ba213cd9e1f1ef', commit_message='Created a test dataset', commit_description='', oid='2fee1a3524a90baca823df9239ba213cd9e1f1ef', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)