# Notebook: Uji Cepat Leksara
Notebook ini menguji library `leksara` end-to-end: cek lingkungan, instal, import, uji fungsi/kualitas output, pipeline kustom, I/O, benchmark, hingga pembersihan.

In [1]:
# Section 1 — Cek Lingkungan & Set Random Seed
import sys, platform, os, random, warnings, logging, time
try: import numpy as np
except Exception: np = None
try: import torch
except Exception: torch = None
print({"python": sys.version, "platform": platform.platform(), "cwd": os.getcwd()})
random.seed(1337)
if np is not None: np.random.seed(1337)
if torch is not None: torch.manual_seed(1337)
logging.basicConfig(level=logging.INFO)
warnings.filterwarnings("ignore")

{'python': '3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'cwd': 'c:\\Users\\Rhendy Saragih\\Downloads\\Kerjaan\\Leksara\\notebooks'}


In [6]:
# Section 2 — Instal / Upgrade Library (editable)
%pip install -e .
import importlib, pkgutil, sys, os, subprocess
from pathlib import Path

# Ensure runtime deps exist in this kernel (regex, pandas, emoji, Sastrawi)
try:
    import regex as _rx  # type: ignore
    import pandas as _pd  # type: ignore
    import emoji as _emoji  # type: ignore
    import Sastrawi as _sastrawi  # type: ignore
except Exception:
    print("Installing runtime dependencies into current kernel ...")
    subprocess.run([sys.executable, "-m", "pip", "install", "regex", "pandas", "emoji", "Sastrawi"], check=False)


def _find_repo_root(start: str | None = None) -> str:
    p = Path(start or os.getcwd()).resolve()
    for _ in range(10):
        if (p / "pyproject.toml").exists() or (p / "setup.py").exists():
            return str(p)
        if p.parent == p:
            break
        p = p.parent
    return str(Path(os.getcwd()).resolve())


def _ensure_import():
    try:
        import leksara as _test_import  # sanity
        print("Imported leksara from:", _test_import.__file__)
        return _test_import
    except ModuleNotFoundError:
        # Fallback: add discovered repo root to sys.path for local development
        repo_root = _find_repo_root()
        if repo_root not in sys.path:
            sys.path.insert(0, repo_root)
        _test_import = importlib.import_module("leksara")
        print("Imported leksara via sys.path fallback from:", _test_import.__file__)
        return _test_import


leksara_mod = _ensure_import()
# Tampilkan info distribusi terinstal (jika ada)
try:
    out = subprocess.run([sys.executable, "-m", "pip", "show", "leksara"], capture_output=True, text=True)
    print(out.stdout.strip() or "pip show leksara: (no metadata)")
except Exception:
    pass

Obtaining file:///C:/Users/Rhendy%20Saragih/Downloads/Kerjaan/Leksara/notebooks
Note: you may need to restart the kernel to use updated packages.
Installing runtime dependencies into current kernel ...


ERROR: file:///C:/Users/Rhendy%20Saragih/Downloads/Kerjaan/Leksara/notebooks does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.


Imported leksara from: C:\Users\Rhendy Saragih\Downloads\Kerjaan\Leksara\leksara\__init__.py
pip show leksara: (no metadata)
pip show leksara: (no metadata)


In [7]:
# Section 3 — Import Library & Verifikasi
import leksara
from leksara import basic_clean, review_cleaner, user_brush, review_miner, patterns, leksara as leksara_run
from leksara.cartboard import build_frame, build_frame_from_df, annotate_flags
from leksara.presets import ecommerce_review_preset
print("leksara available symbols:", [n for n in dir(leksara) if not n.startswith('_')][:30])

leksara available symbols: ['Any', 'Callable', 'Iterable', 'annotate_flags', 'annotations', 'apply_preset', 'basic_clean', 'build_frame', 'build_frame_from_df', 'cartboard', 'clean', 'get_preset', 'leksara', 'patterns', 'pd', 'presets', 'review_chain', 'review_cleaner', 'review_miner', 'user_brush', 'utils']


In [17]:
# Section 4 — Siapkan Data/Input Uji
import pandas as pd
df = pd.DataFrame({
    'chat_id': [1,2,3],
    'chat_message': [
        'Halo gan, hubungi saya di 0812-3456-7890 ya!',
        'Barangnya BAAAguuusss banget!!!',
        'Email saya: test@example.com, alamat: Jl. Mawar No. 12'
    ]
,})
df.head()

Unnamed: 0,chat_id,chat_message
0,1,"Halo gan, hubungi saya di 0812-3456-7890 ya!"
1,2,Barangnya BAAAguuusss banget!!!
2,3,"Email saya: test@example.com, alamat: Jl. Mawa..."


In [9]:
# Section 5 — Uji Fungsi Utama
assert basic_clean('Ini   Teks   KOTOR!!!') == 'ini teks kotor!!!'
masked = patterns.MASK_PHONE('hubungi 0812 3456 7890')
assert '<PHONE>' in masked
print('basic_clean & mask phone OK')

basic_clean & mask phone OK


In [10]:
# Section 6 — Uji Kelas/Objek Utama (ReviewChain & Preset)
from leksara.presets import ecommerce_review_preset
from leksara.cartboard import build_frame
chain = ecommerce_review_preset()
out_df = chain.run_on_dataframe(build_frame(df['chat_message']))
assert 'refined_text' in out_df.columns
out_df.head()

Unnamed: 0,original_text,refined_text,rating,rating_flag,pii_flag,non_alphabetical_flag,lang_mix_flag
0,"Halo gan, hubungi saya di 0812-3456-7890 ya!","Halo gan, hubungi saya di 0812-3456-7890 ya!",,False,False,False,False
1,Barangnya BAAAguuusss banget!!!,Barangnya BAAAguuusss banget!!!,,False,False,False,False
2,"Email saya: test@example.com, alamat: Jl. Mawa...","Email saya: test@example.com, alamat: Jl. Mawa...",,False,False,False,False


In [11]:
# Section 7 — Uji Kasus Tepi & Validasi Error
try:
    _ = basic_clean(123)
except TypeError:
    print('TypeError tertangkap untuk basic_clean angka')
else:
    print('Peringatan: basic_clean seharusnya error untuk tipe non-string/non-iterable')

TypeError tertangkap untuk basic_clean angka


In [12]:
# Section 8 — Uji I/O: Simpan & Muat
import tempfile, json, os
tmpdir = tempfile.mkdtemp(prefix='leksara_demo_')
json_path = os.path.join(tmpdir, 'output.json')
# Simpan refined_text hasil preset
out_df[['refined_text']].to_json(json_path, orient='records', force_ascii=False)
loaded = json.loads(open(json_path, 'r', encoding='utf-8').read())
assert isinstance(loaded, list) and len(loaded) == len(out_df)
json_path

'C:\\Users\\RHENDY~1\\AppData\\Local\\Temp\\leksara_demo_cjtne6e8\\output.json'

In [13]:
# Section 9 — Uji Kinerja (Benchmark)
from time import perf_counter
big = df['chat_message'].tolist() * 2000
t0 = perf_counter()
_ = [patterns.TO_LOWER(patterns.MASK_PHONE(t)) for t in big]
t1 = perf_counter()
print(f"Mask+Lower {len(big)} teks: {t1 - t0:.3f}s")

Mask+Lower 6000 teks: 0.007s


In [14]:
# Section 10 — Visualisasi/Inspeksi Output
import matplotlib.pyplot as plt
lens = out_df['refined_text'].fillna('').map(len)
lens.plot(kind='bar', title='Panjang refined_text')
plt.show()
out_df[['refined_text']].head()

ModuleNotFoundError: No module named 'matplotlib'

In [15]:
# Section 11 — Eksekusi CLI (Jika Ada)
import subprocess, sys
res = subprocess.run([sys.executable, '-c', 'import leksara,sys;print(\'ok\')'], capture_output=True, text=True)
print('CLI-simulated import returncode:', res.returncode)
print('stdout:', res.stdout.strip())
print('stderr:', res.stderr.strip())


# Section 12 — Pembersihan Artefak
import shutil
shutil.rmtree(tmpdir, ignore_errors=True)
print('Removed:', tmpdir)

CLI-simulated import returncode: 1
stdout: 
stderr: Traceback (most recent call last):
  File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'leksara'
Removed: C:\Users\RHENDY~1\AppData\Local\Temp\leksara_demo_cjtne6e8


In [19]:
# Section 4 — Siapkan Data/Input Uji
import pandas as pd
df = pd.DataFrame({
    'chat_id': [1,2,3],
    'chat_message': [
        'Halo gan, hubungi saya di 0812-3456-7890 ya!',
        'Barangnya BAAAguuusss banget!!!',
        'Email saya: test@example.com, alamat: Jl. Mawar No. 12'
    ]
,})
df.head()

Unnamed: 0,chat_id,chat_message
0,1,"Halo gan, hubungi saya di 0812-3456-7890 ya!"
1,2,Barangnya BAAAguuusss banget!!!
2,3,"Email saya: test@example.com, alamat: Jl. Mawa..."


In [None]:
# Demo: Gaya pemakaian mirip contoh (CamelCase + pipeline kustom)
# Fallback ke lowercase package
from leksara import leksara
from leksara.functions import to_lowercase
from leksara.patterns import MASK_PHONE as replace_phone

custom_pipeline = {
    'patterns': [replace_phone],  # buat sensor nomor telepon
    'functions': [to_lowercase],  # fungsi spesifik yang dipakai
}

df['safe_message'] = leksara(df['chat_message'], pipeline=custom_pipeline)
print(df[['chat_id', 'safe_message']])

   chat_id                                       safe_message
0        1               halo gan, hubungi saya di <phone>ya!
1        2                    barangnya baaaguuusss banget!!!
2        3  email saya: test@example.com, alamat: jl. mawa...


In [21]:
# Section 13 — User Brush (PII): mask or remove sensitive info
from leksara import user_brush as ub

# Create masked and fully redacted variants
df_ub = df.copy()
df_ub['masked'] = df_ub['chat_message'].map(
    lambda t: ub.replace_phone(ub.replace_email(ub.replace_address(ub.replace_id(t))))
)
df_ub['redacted'] = df_ub['chat_message'].map(
    lambda t: ub.remove_phone(ub.remove_email(ub.remove_address(ub.remove_id(t))))
)
df_ub[['chat_id', 'chat_message', 'masked', 'redacted']]

Unnamed: 0,chat_id,chat_message,masked,redacted
0,1,"Halo gan, hubungi saya di 0812-3456-7890 ya!","Halo gan, hubungi saya di <PHONE>ya!","Halo gan, hubungi saya di ya!"
1,2,Barangnya BAAAguuusss banget!!!,Barangnya BAAAguuusss banget!!!,Barangnya BAAAguuusss banget!!!
2,3,"Email saya: test@example.com, alamat: Jl. Mawa...","Email saya: <EMAIL>, alamat: <ADDRESS>","Email saya: , alamat:"


In [22]:
# Section 14 — Review Miner: normalize ratings, elongations, acronyms, slangs, contractions, and stemming
from leksara import review_miner as rm
from importlib.resources import files
import json

# Load built-in maps (acronyms, slangs)
acronyms = json.loads(files('leksara.functions.data').joinpath('acronyms.json').read_text(encoding='utf-8'))
slang_map = json.loads(files('leksara.functions.data').joinpath('slang_map.json').read_text(encoding='utf-8'))

examples = pd.Series([
    'BINTANG 5 nih, barangnya BAAAguuusss bgtu!!!',
    'CS ramah, tp OTS gak bisa. COD boleh?',
])

step1 = examples.map(rm.replace_rating)
step2 = step1.map(rm.shorten_elongation)
step3 = step2.map(lambda t: rm.replace_acronym(t, acronyms))
step4 = step3.map(lambda t: rm.normalize_slangs(t, slang_map))
step5 = step4.map(lambda t: rm.expand_contraction(t, {"gak": "tidak", "nggak": "tidak"}))
step6 = step5.map(lambda t: rm.normalize_word(t))

pd.DataFrame({'input': examples, 'rating': step1, 'short': step2, 'acronym': step3, 'slang': step4, 'contract': step5, 'stem': step6})

Unnamed: 0,input,rating,short,acronym,slang,contract,stem
0,"BINTANG 5 nih, barangnya BAAAguuusss bgtu!!!","rating_5 nih, barangnya BAAAguuusss bgtu!!!","rating_5 nih, barangnya BAgus bgtu!!!","rating_5 nih, barangnya BAgus bgtu!!!","rating_5 nih, barangnya BAgus begitu!!!","rating_5 nih, barangnya BAgus begitu!!!",rating 5 nih barang bagus begitu
1,"CS ramah, tp OTS gak bisa. COD boleh?","CS ramah, tp OTS gak bisa. COD boleh?","CS ramah, tp OTS gak bisa. COD boleh?","customer service ramah, tp on the spot gak bis...","customer service ramah, tapi on the spot gak b...","customer service ramah, tapi on the spot tidak...",customer service ramah tapi on the spot tidak...


In [26]:
# Section 16 — End-to-end Stopwords: built-in list, whitelist, and pipeline integration
import importlib, leksara.functions.stopwords as _sw_mod
importlib.reload(_sw_mod)
from leksara.functions.stopwords import remove_stopwords_id, load_id_stopwords
from leksara import leksara as run_pipeline
from leksara.patterns import TO_LOWER

# Show built-in size and a preview
id_sw = load_id_stopwords()
print('Jumlah stopwords ID:', len(id_sw))
print('Contoh:', sorted(list(id_sw))[:12])

reviews = pd.Series([
    'Ini adalah produk yang sangat bagus dan berkualitas',
    'Barang ini tidak sesuai dengan yang diharapkan',
    'Kualitasnya bagus tapi pengiriman lama',
])

# 1) Basic removal using built-in list
no_sw = reviews.map(lambda t: remove_stopwords_id(t))

# 2) With whitelist (keep certain tokens) and extra words
no_sw_wl = reviews.map(lambda t: remove_stopwords_id(t, whitelist={'yang', 'dengan'}, extra={'sangat'}))

# 3) Integrate in a custom pipeline after lowercasing
from functools import partial
rm_id = partial(remove_stopwords_id, whitelist=None)
pipe = {
    'patterns': [],
    'functions': [TO_LOWER, rm_id],
}
no_sw_pipe = run_pipeline(reviews, pipeline=pipe)

pd.DataFrame({
    'input': reviews,
    'no_sw': no_sw,
    'no_sw_wl': no_sw_wl,
    'no_sw_pipe': no_sw_pipe,
})

Jumlah stopwords ID: 237
Contoh: ['adalah', 'agar', 'aja', 'akan', 'akhirnya', 'aku', 'anda', 'andai', 'andaikan', 'ane', 'antar', 'antara']


Unnamed: 0,input,no_sw,no_sw_wl,no_sw_pipe
0,Ini adalah produk yang sangat bagus dan berkua...,produksangatbagusberkualitas,produkyangbagusberkualitas,produksangatbagusberkualitas
1,Barang ini tidak sesuai dengan yang diharapkan,Barangsesuaidiharapkan,Barangsesuaidenganyangdiharapkan,barangsesuaidiharapkan
2,Kualitasnya bagus tapi pengiriman lama,Kualitasnyabaguspengirimanlama,Kualitasnyabaguspengirimanlama,kualitasnyabaguspengirimanlama


## Section 16A — Satu-panggilan: gaya clean[df] untuk banyak versi hasil

In [27]:
# Build several cleaned variants at once, similar to `clean[df]` idea
from leksara import basic_clean
from leksara.functions.stopwords import remove_stopwords_id

# Source df (re-use earlier df)
variants = {}
variants['lower_ws'] = basic_clean(df['chat_message'], remove_punct=False, reduce_repeat=True)
variants['lower_ws_no_punct'] = basic_clean(df['chat_message'], remove_punct=True, reduce_repeat=True)
variants['lower_ws_no_punct_no_digits'] = basic_clean(df['chat_message'], remove_punct=True, remove_digits=True, reduce_repeat=True)
variants['lower_ws_no_punct_stopwords'] = basic_clean(df['chat_message'], remove_punct=True, reduce_repeat=True, remove_stopwords=True)

# Extra: stopwords using expanded list function
import pandas as _pd
variants['no_stopwords_id'] = _pd.Series(df['chat_message']).map(remove_stopwords_id)

multi = df[['chat_id', 'chat_message']].copy()
for k, s in variants.items():
    multi[k] = s

multi.head()

Unnamed: 0,chat_id,chat_message,lower_ws,lower_ws_no_punct,lower_ws_no_punct_no_digits,lower_ws_no_punct_stopwords,no_stopwords_id
0,1,"Halo gan, hubungi saya di 0812-3456-7890 ya!","halo gan, hubungi saya di 0812-3456-7890 ya!",halo gan hubungi saya di 0812 3456 7890 ya,halo gan hubungi saya di ya,haloganhubungisaya081234567890ya,"Halogan,hubungi0812-3456-7890!"
1,2,Barangnya BAAAguuusss banget!!!,barangnya baaguuss banget!!!,barangnya baaguuss banget,barangnya baaguuss banget,barangnyabaaguussbanget,BarangnyaBAAAguuusssbanget!!!
2,3,"Email saya: test@example.com, alamat: Jl. Mawa...","email saya: test@example.com, alamat: jl. mawa...",email saya test example com alamat jl mawar no 12,email saya test example com alamat jl mawar no,emailsayatestexamplecomalamatjlmawarno12,"Email:test@example.com,alamat:Jl.MawarNo.12"


## Section 16B — Helper: fungsi `clean_variants(series)` untuk keluaran multi-versi sekali jalan

In [28]:
import pandas as pd
from typing import Mapping
from leksara import basic_clean
from leksara.functions.stopwords import remove_stopwords_id


def clean_variants(series: pd.Series) -> pd.DataFrame:
    """Return multiple cleaned variants of a Series in one call."""
    out = pd.DataFrame({'original': series})
    out['lower_ws'] = basic_clean(series, remove_punct=False, reduce_repeat=True)
    out['lower_ws_no_punct'] = basic_clean(series, remove_punct=True, reduce_repeat=True)
    out['lower_ws_no_punct_no_digits'] = basic_clean(series, remove_punct=True, remove_digits=True, reduce_repeat=True)
    out['lower_ws_no_punct_stopwords'] = basic_clean(series, remove_punct=True, reduce_repeat=True, remove_stopwords=True)
    out['no_stopwords_id'] = series.map(remove_stopwords_id)
    return out

clean_variants(df['chat_message']).head()

Unnamed: 0,original,lower_ws,lower_ws_no_punct,lower_ws_no_punct_no_digits,lower_ws_no_punct_stopwords,no_stopwords_id
0,"Halo gan, hubungi saya di 0812-3456-7890 ya!","halo gan, hubungi saya di 0812-3456-7890 ya!",halo gan hubungi saya di 0812 3456 7890 ya,halo gan hubungi saya di ya,haloganhubungisaya081234567890ya,"Halogan,hubungi0812-3456-7890!"
1,Barangnya BAAAguuusss banget!!!,barangnya baaguuss banget!!!,barangnya baaguuss banget,barangnya baaguuss banget,barangnyabaaguussbanget,BarangnyaBAAAguuusssbanget!!!
2,"Email saya: test@example.com, alamat: Jl. Mawa...","email saya: test@example.com, alamat: jl. mawa...",email saya test example com alamat jl mawar no 12,email saya test example com alamat jl mawar no,emailsayatestexamplecomalamatjlmawarno12,"Email:test@example.com,alamat:Jl.MawarNo.12"


In [None]:
# Section 13 — User Brush (PII): mask or remove sensitive info
from leksara import user_brush as ub

# Create masked and fully redacted variants
df_ub = df.copy()
df_ub['masked'] = df_ub['chat_message'].map(
    lambda t: ub.replace_phone(ub.replace_email(ub.replace_address(ub.replace_id(t))))
)
df_ub['redacted'] = df_ub['chat_message'].map(
    lambda t: ub.remove_phone(ub.remove_email(ub.remove_address(ub.remove_id(t))))
)
df_ub[['chat_id', 'chat_message', 'masked', 'redacted']]

In [None]:
from Leksara import user_brush
from Leksara.functions import user_brush import to_lowercase
from Leksara.patterns import user_brush import replace_phone

custom_pipeline = {
             ‘patterns’: [replace_phone], # buat sensor nomor telepon
             ‘functions’: [to_lowercase] # hanya fungsi ini saja yang spesifik dia pakai buat datanya
}

df[‘safe_message’] = Leksara(df[‘chat_message’], pipeline=custom_pipeline)

print(df[[‘chat_id’, ‘safe_message’]])

In [34]:
# Demo: Pemakaian sederhana (CamelCase-style)
from leksara import user_brush
from leksara.functions import to_lowercase
from leksara.patterns import MASK_PHONE as replace_phone

custom_pipeline = {
    'patterns': [replace_phone],  # sensor nomor telepon
    'functions': [to_lowercase],  # fungsi spesifik yang dipakai
}

df['safe_message'] = leksara(df['chat_message'], pipeline=custom_pipeline)
print(df[['chat_id', 'safe_message']])

   chat_id                                       safe_message
0        1               halo gan, hubungi saya di <phone>ya!
1        2                    barangnya baaaguuusss banget!!!
2        3  email saya: test@example.com, alamat: jl. mawa...
