## Upload all of the initially parsed data to Hugging Face

In [None]:
import json
from datasets import Dataset
import pandas as pd

# Directory prefixes used by this notebook.
OUT_DIR = "OUT/"
ASSET_DIR = "assets/"

# Read the raw JSON dump into memory.
with open(ASSET_DIR + "dump-formatted.json", mode="r", encoding="utf-8") as dump_file:
    RAW_DOCUMENTS = json.load(dump_file)

# Build a frame from the dump, discard the three listed columns, and convert
# the result into a `datasets.Dataset`.
dataframe = pd.DataFrame(RAW_DOCUMENTS).drop(columns=["_id", "__v", "created_at"])
dataset = Dataset.from_pandas(dataframe)

# Publish the still-uncleaned data to the Hub repository.
dataset.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_description="Initial commit. not cleaned yet",
)


In [2]:
from datasets import load_dataset
# Fetch the uploaded dataset (cached under "cache/") and keep a handle on
# its "train" split for the cleaning steps that follow.
ds = load_dataset(
    "tianharjuno/twitter-parse",
    cache_dir="cache/",
)
raw_ds = ds["train"]

In [4]:
import re, unicodedata, jaconv, emoji

# ─── pre-compiled patterns ────────────────────────────────────────────────
_URL         = re.compile(r'https?://\S+')
_MENTION     = re.compile(r'@\w+')
# ≥3 of the same char, digits excluded: collapsing digit runs would silently
# rewrite numbers such as "2000" or "1000000" to "200" / "100"
_REPEAT      = re.compile(r'([^\d\n])\1{2,}')
_WS          = re.compile(r'\s+')
_RT_PREFIX   = re.compile(r'^rt\s+', re.I)           # "rt" + whitespace at the very start
_DIGITS_GLUE = re.compile(r'(\b\d{4})(?=[a-zA-Z])')  # 4-digit run directly followed by a letter

# remove from the first occurrence of "kutipan" (any case, also when it sits
# inside a longer token) to the string end
_KUTI_CUT = re.compile(r'(?i)kutipan.*$', re.DOTALL)


def cleantext(row: dict) -> dict:
    """Normalise ``row["content"]`` in place and return the row.

    The steps run in this order:

    1. Unicode NFKC normalisation, then ``jaconv.z2h`` for digits and ASCII
       (kana conversion is switched off).
    2. Two-character backslash-n / backslash-r sequences, URLs and @mentions
       are replaced (URLs by the token ``<url>``, the others by a space).
    3. A leading "rt" prefix is dropped and a space is inserted between a
       4-digit run and a letter that directly follows it.
    4. Everything from the first "kutipan" onwards is removed.
    5. Emoji are passed through ``emoji.demojize`` with spaces as delimiters.
    6. Runs of three or more identical non-digit characters are shortened
       to two, whitespace is collapsed, and the text is stripped and
       lower-cased.

    Args:
        row: a mapping with a string under the ``"content"`` key.

    Returns:
        The same ``row`` object with ``"content"`` overwritten.
    """
    text = row["content"]
    text = unicodedata.normalize('NFKC', text)
    text = jaconv.z2h(text, kana=False, digit=True, ascii=True)

    # these are the literal two-character escapes "\n" / "\r"; real line
    # breaks are taken care of by the whitespace collapse at the end
    text = text.replace('\\n', ' ').replace('\\r', ' ')
    text = _URL.sub(' <url> ', text)
    text = _MENTION.sub(' ', text)
    text = _RT_PREFIX.sub('', text)
    text = _DIGITS_GLUE.sub(r'\1 ', text)

    # must run before lower-casing is applied, hence the (?i) flag in the pattern
    text = _KUTI_CUT.sub('', text)

    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = _REPEAT.sub(r'\1\1', text)
    text = _WS.sub(' ', text).strip().lower()
    row["content"] = text
    return row

In [5]:
# Run the text normalisation over every row of the split.
raw_ds = raw_ds.map(function=cleantext)

In [7]:
import pandas as pd

# Load the lexicon CSV; only its "slang" and "formal" columns are used below,
# the other listed columns are dropped when they are present.
kamus_alay_read = pd.read_csv(
    "kamus-alay/colloquial-indonesian-lexicon.csv"
).drop(
    columns=["In-dictionary", "context", "category1", "category2", "category3"],
    errors="ignore",
)

# slang → formal lookup table (for a repeated slang key the last row wins).
alay_dict = {
    slang: formal
    for slang, formal in zip(kamus_alay_read["slang"], kamus_alay_read["formal"])
}

def formalize(example):
    """Swap every word of ``example["content"]`` for its ``alay_dict`` entry.

    The text is split on whitespace; each word is looked up in lower case and
    kept unchanged when the lexicon has no entry for it. The words are joined
    back with single spaces, written to ``example["content"]``, and the same
    ``example`` object is returned.
    """
    tokens = example["content"].split()
    example["content"] = " ".join(
        alay_dict.get(token.lower(), token) for token in tokens
    )
    return example
# Apply the slang → formal replacement to every row of the split.
raw_ds = raw_ds.map(function=formalize)

Map: 100%|██████████| 201583/201583 [00:07<00:00, 27251.98 examples/s]


In [8]:
# Store the processed rows under a new "cleaned" key of `ds`; the existing
# "train" entry is left as loaded.
ds["cleaned"] = raw_ds

In [10]:
# Upload `ds` (including the newly added "cleaned" entry) to the same repository.
ds.push_to_hub(
    "tianharjuno/twitter-parse",
    commit_description="cleaned up text",
)

Creating parquet from Arrow format: 100%|██████████| 202/202 [00:00<00:00, 1284.26ba/s]
Uploading files as bytes or binary IO objects is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.06it/s]
Creating parquet from Arrow format: 100%|██████████| 202/202 [00:00<00:00, 1447.31ba/s]
Uploading files as bytes or binary IO objects is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|██████████| 1/1 [00:29<00:00, 29.65s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/tianharjuno/twitter-parse/commit/d82e9073cf51fde0abf36badbba88e79d15989a6', commit_message='Upload dataset', commit_description='cleaned up text', oid='d82e9073cf51fde0abf36badbba88e79d15989a6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/tianharjuno/twitter-parse', endpoint='https://huggingface.co', repo_type='dataset', repo_id='tianharjuno/twitter-parse'), pr_revision=None, pr_num=None)