# Notebook: Uji Cepat Leksara
Notebook ini menguji library `leksara` end-to-end: cek lingkungan, instal, import, uji fungsi/kualitas output, pipeline kustom, I/O, benchmark, hingga pembersihan.

In [1]:
from leksara import leksara
from leksara import case_normal, remove_punctuation, remove_whitespace
from leksara import replace_phone, replace_email, replace_address, replace_id
import pandas as pd

# Build a small sample frame: one row per chat message, each containing
# a phone number, an e-mail address and a city mention.
df = pd.DataFrame(
    [
        {
            "chat_id": 1,
            "chat_message": "Halo! Nomor saya 0812-3456-7890. Email: x@y.com, Alamat: Jakarta",
        },
        {
            "chat_id": 2,
            "chat_message": "Hubungi +6281234567890 ya — EMAIL saya: test@mail.co.id! Alamat saya di Bandung",
        },
    ]
)

# Custom pipeline with PII cleaning: every replacer is configured with
# mode="replace", followed by the plain text-cleaning functions.
custom_pipeline = {
    "patterns": [
        (replacer, {"mode": "replace"})
        for replacer in (replace_phone, replace_email, replace_address, replace_id)
    ],
    "functions": [case_normal, remove_punctuation, remove_whitespace],
}

# Apply the pipeline to the chat_message column and store the result.
df["safe_message"] = leksara(df["chat_message"], pipeline=custom_pipeline)

# Last expression of the cell: show the ids next to the cleaned text.
df[["chat_id", "safe_message"]]

Unnamed: 0,chat_id,safe_message
0,1,halo nomor saya [PHONE_NUMBER] email [EMAIL] a...
1,2,hubungi [PHONE_NUMBER] ya email saya [EMAIL] a...


In [2]:
# Run leksara without a `pipeline` argument (i.e. its default pipeline) on the
# raw messages. NOTE: this overwrites the "safe_message" column that the
# custom-pipeline cell wrote earlier.
df["safe_message"] = leksara(df.loc[:, "chat_message"])

# Last expression of the cell: show the ids next to the cleaned text.
df.loc[:, ["chat_id", "safe_message"]]

Unnamed: 0,chat_id,safe_message
0,1,halo nomor saya 081234567890 email xycom alama...
1,2,hubungi 6281234567890 ya email saya testmailco...


In [3]:
from leksara import word_normalization

data = [
    "Produk Bagus sekali",
    "Saya membeli peralatan rumah tangga",
]

# Pipeline with no pattern steps and a single configured function step:
# word_normalization is handed word_list=["Bagus"] and mode="keep".
custom_pipeline = dict(
    patterns=[],
    functions=[
        (word_normalization, dict(word_list=["Bagus"], mode="keep")),
    ],
)

print(leksara(data, pipeline=custom_pipeline))


['produk Bagus sekali', 'saya beli alat rumah tangga']


In [4]:
# Import every leksara step this cell uses. `remove_punctuation` was previously
# only in scope because an earlier cell happened to import it.
from leksara import (
    case_normal,
    remove_punctuation,
    remove_stopwords,
    remove_whitespace,
)

# --- sample data ---
df = pd.DataFrame({
    "chat_id": [1, 2, 3],
    "chat_message": [
        "Saya sangat suka produk ini, dan saya akan beli lagi!",
        "Produk ini bagus sekali untuk dipakai di rumah.",
        "Namun, harga-nya agak mahal ya..."
    ]
})

# --- simple pipeline ---
# Only plain functions are listed (no "patterns" key); they are given in this
# order: lowercase, drop stopwords, strip punctuation, tidy whitespace.
custom_pipeline = {
    "functions": [
        case_normal,
        remove_stopwords,
        remove_punctuation,
        remove_whitespace
    ]
}

# Store the cleaned text next to the original so both can be compared.
df["cleaned"] = leksara(df["chat_message"], pipeline=custom_pipeline)

print("=== Data Asli ===")
print(df[["chat_id", "chat_message"]])

print("\n=== Setelah Cleaning ===")
print(df[["chat_id", "cleaned"]])

=== Data Asli ===
   chat_id                                       chat_message
0        1  Saya sangat suka produk ini, dan saya akan bel...
1        2    Produk ini bagus sekali untuk dipakai di rumah.
2        3                  Namun, harga-nya agak mahal ya...

=== Setelah Cleaning ===
   chat_id                     cleaned
0        1            suka produk beli
1        2  produk bagus dipakai rumah
2        3           harganya mahal ya


In [5]:
from leksara import ReviewChain
from leksara.core.presets import get_preset

# Fetch the "ecommerce_review" preset; it is indexed below by its
# "patterns" and "functions" entries.
pipeline_steps = get_preset("ecommerce_review")

# Build a ReviewChain from the preset's two step lists.
review_chain = ReviewChain.from_steps(
    **{key: pipeline_steps[key] for key in ("patterns", "functions")}
)

# Sample reviews to process.
data = [
    "Produk baru saya: iphone12, harga 12 juta. Hubungi 0812-3456-7890.",
    "Email saya: test@example.com. Produk sangat berkualitas!"
]

# Run the texts through the chain.
processed_data = review_chain.transform(data)

# Print each input next to its processed form.
for raw_text, clean_text in zip(data, processed_data):
    print(f"Original: {raw_text}")
    print(f"Processed: {clean_text}\n")

# Same transform with benchmark=True: the result is unpacked into the
# processed texts and a metrics object, which are then displayed in turn.
out, metrics = review_chain.transform(data, benchmark=True)
display(metrics, out)


Original: Produk baru saya: iphone12, harga 12 juta. Hubungi 0812-3456-7890.
Processed: produk iphone12 harga 12 juta hubung [PHONE_NUMBER]

Original: Email saya: test@example.com. Produk sangat berkualitas!
Processed: email [EMAIL] produk kualitas



{'n_steps': 15,
 'total_time_sec': 0.0002873999619623646,
 'per_step': [('word_normalization', 9.190000128000975e-05),
  ('remove_stopwords', 3.2199997804127634e-05),
  ('remove_emoji', 2.6599998818710446e-05),
  ('mask_whitelist', 2.5699991965666413e-05),
  ('unmask_whitelist', 1.719998545013368e-05),
  ('replace_phone', 1.619999238755554e-05),
  ('replace_url', 1.4499993994832039e-05),
  ('replace_address', 1.3700002455152571e-05),
  ('remove_punctuation', 1.0800009476952255e-05),
  ('replace_email', 8.899995009414852e-06),
  ('shorten_elongation', 8.699993486516178e-06),
  ('replace_id', 7.599999662488699e-06),
  ('remove_whitespace', 6.499991286545992e-06),
  ('remove_tags', 5.1000097300857306e-06),
  ('case_normal', 1.7999991541728377e-06)]}

['produk iphone12 harga 12 juta hubung [PHONE_NUMBER]',
 'email [EMAIL] produk kualitas']

In [6]:
# Print a 1-based numbered list of the chain's steps, in the iteration order
# of `named_steps`; only the mapping's values are shown.
steps = review_chain.named_steps
for position, (_name, step) in enumerate(steps.items(), start=1):
    print(f"{position}. {step}")

1. replace_phone
2. replace_email
3. replace_address
4. replace_id
5. mask_whitelist
6. remove_tags
7. case_normal
8. replace_url
9. remove_emoji
10. word_normalization
11. remove_stopwords
12. shorten_elongation
13. remove_punctuation
14. remove_whitespace
15. unmask_whitelist
