In [1]:
# === 1. Import library ===
import re
import glob
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display, HTML


In [2]:
# === 2. Ambil semua file teks dalam folder ===
files = glob.glob("datatext/*.txt")  # ganti sesuai folder kamu

punct_pattern = re.compile(r"[!\"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~¡«»…‘’“”–—´]")

all_data = []
summary = []


In [9]:
# === 3. Proses setiap file ===
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    sentences = re.split(r'(?<=[.!?])\s+|\n+', text.strip())
    sentences = [s for s in sentences if s]

    for i, sentence in enumerate(sentences, start=1):
        words = sentence.split()
        word_count = sum(1 for w in words if not punct_pattern.fullmatch(w))
        punct_count = sum(1 for w in words if punct_pattern.fullmatch(w))
        all_data.append({
            "File": file.split("\\")[-1],  # hanya nama file
            "Kalimat ke": i,
            "Jumlah Kata": word_count,
            "Jumlah Tanda Baca": punct_count,
            "Total Token": len(words)
        })

    df_file = pd.DataFrame([d for d in all_data if d["File"] == file.split("\\")[-1]])
    summary.append({
        "File": file.split("\\")[-1],
        "Jumlah Kalimat": len(sentences),
        "Jumlah Kata": df_file["Jumlah Kata"].sum(),
        "Jumlah Tanda Baca": df_file["Jumlah Tanda Baca"].sum(),
        "Rata-rata Kata/Kalimat": df_file["Jumlah Kata"].mean()
    })



In [10]:
# === 4. Buat DataFrame ===
df = pd.DataFrame(all_data)
df_summary = pd.DataFrame(summary)


In [11]:
# === 5. Tabel ringkasan yang lebih cantik ===
styled_summary = df_summary.style.set_table_styles(
    [{'selector': 'th', 'props': [('background-color', '#1f77b4'),
                                  ('color', 'white'),
                                  ('font-weight', 'bold')]}]
).bar(subset=["Jumlah Kata", "Jumlah Tanda Baca"], color='#ff7f0e')
display(HTML("<h3>📊 Ringkasan Statistik per File</h3>"))
display(styled_summary)


Unnamed: 0,File,Jumlah Kalimat,Jumlah Kata,Jumlah Tanda Baca,Rata-rata Kata/Kalimat
0,babasan-paribasa.txt,953,9643,904,10.118573
1,mantra-ajian-kabedasan.txt,1,19,0,19.0
2,mantra-jampe-nganjang.txt,1,84,0,84.0
3,mantra-jangjawokan-paranti-dipupur.txt,1,47,0,47.0
4,mantra-rajah-citra-kasunyian.txt,1,27,0,27.0
5,mantra-singlar-ka-musuh.txt,1,26,0,26.0
6,mantra-mantra-asihan-si-burung-pundung.txt,1,29,0,29.0
7,mantra-jampe-dicoco-kala.txt,1,31,0,31.0
8,mantra-jampe-nyimpen-beas.txt,2,92,0,46.0
9,wawacan-nabi-yusuf.txt,164,3820,0,23.292683


In [15]:
# === 6. Grafik interaktif ===
# Bar chart jumlah kalimat per dokumen
fig1 = px.bar(
    df_summary, x="Jumlah Kalimat", y="File", color="File",
    title="Jumlah Kalimat per Dokumen",
    labels={"Jumlah Kalimat": "Jumlah Kalimat", "File": "Dokumen"},
    text="Jumlah Kalimat"
)
fig1.update_traces(textposition="outside")
fig1.show()

# Pie chart global
total_words = df["Jumlah Kata"].sum()
total_punct = df["Jumlah Tanda Baca"].sum()
fig2 = go.Figure(data=[go.Pie(
    labels=["Kata", "Tanda Baca"],
    values=[total_words, total_punct],
    hole=0.4
)])
fig2.update_layout(title_text="Distribusi Global Kata vs Tanda Baca")
fig2.show()


In [7]:
# === 7. Analisis tambahan: kata paling sering muncul ===
from collections import Counter

words_all = []
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()
    tokens = re.findall(r"\w+", text.lower())
    words_all.extend(tokens)

word_freq = Counter(words_all).most_common(15)
df_wordfreq = pd.DataFrame(word_freq, columns=["Kata", "Frekuensi"])

fig4 = px.bar(
    df_wordfreq, x="Kata", y="Frekuensi",
    title="15 Kata Paling Sering Muncul (Semua File)",
    text="Frekuensi"
)
fig4.update_traces(textposition="outside")
fig4.show()


In [None]:
import glob
import re
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff

# ==== 1. Fungsi untuk load corpus POS ====
def load_corpus(path):
    corpus = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or "/" not in line:
                continue
            word, pos = line.split("/", 1)
            word = word.lower().replace("é", "e")  # anggap é = e
            corpus[word] = pos
    return corpus

# ==== 2. Load corpus universal & custom ====
corpus_universal = load_corpus("corpus/pos-uni.txt")
corpus_custom = load_corpus("corpus/pos-sun.txt")

# ==== 3. Ambil semua file teks ====
files = glob.glob("datatext/*.txt")

all_data = []
oov_words = set()  # simpan kata yang tidak ada di corpus

# ==== 4. Proses setiap file ====
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # Pisahkan kalimat (akhiran .!? atau baris baru)
    sentences = re.split(r'(?<=[.!?])\s+|\n+', text.strip())
    sentences = [s for s in sentences if s]

    for i, sentence in enumerate(sentences, start=1):
        # Tokenisasi: pisahkan tanda baca dari kata
        tokens = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)

        for w in tokens:
            w_norm = w.lower().replace("é", "e")  # normalisasi huruf

            # ==== Deteksi otomatis ====
            if w_norm.isdigit():
                pos_uni, pos_cus = "NUM", "ANGKA"
            elif re.match(r"^\W+$", w_norm):  # tanda baca murni
                pos_uni, pos_cus = "PUNCT", "TANDA_BACA"
            else:
                pos_uni = corpus_universal.get(w_norm, "UNK")
                pos_cus = corpus_custom.get(w_norm, "UNK")

                if pos_uni == "UNK" and pos_cus == "UNK":
                    oov_words.add(w_norm)  # masukkan ke daftar OOV

            all_data.append({
                "File": file,
                "Kalimat ke": i,
                "Kata": w,
                "POS Universal": pos_uni,
                "POS Custom": pos_cus
            })

# ==== 5. DataFrame hasil tagging ====
df_pos = pd.DataFrame(all_data)

# ==== 6. Statistik kata yang diproses vs OOV ====
total_kata = len(df_pos)
kata_oov = len(oov_words)
kata_terproses = total_kata - kata_oov

print("=== Statistik Kata ===")
print(f"Total kata        : {total_kata}")
print(f"Kata terproses    : {kata_terproses}")
print(f"Kata OOV (tidak ditemukan) : {kata_oov}")

# Simpan kata OOV ke file
with open("oov.txt", "w", encoding="utf-8") as f:
    for w in sorted(oov_words):
        f.write(w + "\n")

# ==== 7. Tabel interaktif (preview 50 baris pertama) ====
fig_table = ff.create_table(df_pos.head(50))
fig_table.show()

# ==== 8. Distribusi POS Universal ====
pos_counts_uni = df_pos["POS Universal"].value_counts().reset_index()
pos_counts_uni.columns = ["POS", "Jumlah"]

fig_uni = px.bar(
    pos_counts_uni, x="POS", y="Jumlah",
    title="Distribusi POS (Universal)", color="POS",
    text="Jumlah"
)
fig_uni.update_traces(textposition="outside")
fig_uni.show()

# ==== 9. Distribusi POS Custom ====
pos_counts_cus = df_pos["POS Custom"].value_counts().reset_index()
pos_counts_cus.columns = ["POS", "Jumlah"]

fig_cus = px.bar(
    pos_counts_cus, x="POS", y="Jumlah",
    title="Distribusi POS (Custom)", color="POS",
    text="Jumlah"
)
fig_cus.update_traces(textposition="outside")
fig_cus.show()

# ==== 10. Perbandingan Universal vs Custom ====
comparison = df_pos.groupby(["POS Universal", "POS Custom"]).size().reset_index(name="Count")

fig_comp = px.scatter(
    comparison, x="POS Universal", y="POS Custom", size="Count",
    title="Perbandingan POS Universal vs Custom", color="Count"
)
fig_comp.show()


=== Statistik Kata ===
Total kata        : 33048
Kata terproses    : 27549
Kata OOV (tidak ditemukan) : 5499


In [None]:
import glob
import re
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff

# ==== 1. Fungsi untuk load corpus POS ====
def load_corpus(path):
    corpus = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or "/" not in line:
                continue
            word, pos = line.rsplit("/", 1)  # pakai rsplit supaya lebih aman
            word = word.lower().replace("é", "e")  # anggap é = e
            corpus[word] = pos
    return corpus

# ==== 2. Load corpus universal & custom ====
corpus_universal = load_corpus("corpus/pos-uni.txt")
corpus_custom = load_corpus("corpus/pos-sun.txt")

# ==== 3. Fungsi bantu untuk cek ke corpus ====
def _get_word_tag(word: str) -> str:
    """Cek tag dari corpus custom dulu, lalu universal"""
    return corpus_custom.get(word, corpus_universal.get(word, "UNK"))

def _check_infiks_ar(word: str):
    """Check for -ar- infix pattern (huruf ke-2 'a' dan ke-3 'r')"""
    if len(word) >= 4 and word[1] == 'a' and word[2] == 'r':
        base_word = word[0] + word[3:]
        return True, base_word
    return False, word

def _apply_kb_rules(word: str):
    """Apply KB rules untuk kata OOV"""
    word_lower = word.lower()

    # Rule: (-ar- jika huruf ke-2 a dan ke-3 r) + KB
    has_ar_infix, word_without_ar = _check_infiks_ar(word_lower)
    if has_ar_infix:
        base_tag = _get_word_tag(word_without_ar)
        if base_tag == 'KB':
            return 'KB'
        word_lower = word_without_ar

    # Rule: ka- + KS
    if word_lower.startswith('ka'):
        base_word = word_lower[2:]
        if base_word and _get_word_tag(base_word) == 'KS':
            return 'KB'

    # Rule: pa- + [KB / KP / KS]
    if word_lower.startswith('pa'):
        base_word = word_lower[2:]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS']:
            return 'KB'

    # Rule: pang- + [KB / KS]
    if word_lower.startswith('pang'):
        base_word = word_lower[4:]
        if base_word and _get_word_tag(base_word) in ['KB', 'KS']:
            return 'KB'

    # Rule: ar- + KB
    if word_lower.startswith('ar'):
        base_word = word_lower[2:]
        if base_word and _get_word_tag(base_word) == 'KB':
            return 'KB'

    # Rule: [KB / KP / KS] + -an
    if word_lower.endswith('an'):
        base_word = word_lower[:-2]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS']:
            return 'KB'

    # Rule: [KB / KP] + -eun
    if word_lower.endswith('eun'):
        base_word = word_lower[:-3]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP']:
            return 'KB'

    # Rule: [KB / KP / KS / WIL] + -na
    if word_lower.endswith('na'):
        base_word = word_lower[:-2]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS', 'WIL']:
            return 'KB'

    # Rule: ka- + [KB / KP / KS] + -an
    if word_lower.startswith('ka') and word_lower.endswith('an'):
        base_word = word_lower[2:-2]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS']:
            return 'KB'

    # Rule: pa- + [KB / KP / KS] + -an
    if word_lower.startswith('pa') and word_lower.endswith('an'):
        base_word = word_lower[2:-2]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS']:
            return 'KB'

    # Rule: per- + KP + -an
    if word_lower.startswith('per') and word_lower.endswith('an'):
        base_word = word_lower[3:-2]
        if base_word and _get_word_tag(base_word) == 'KP':
            return 'KB'

    # Rule: pi- + KB + -an
    if word_lower.startswith('pi') and word_lower.endswith('an'):
        base_word = word_lower[2:-2]
        if base_word and _get_word_tag(base_word) == 'KB':
            return 'KB'

    # Rule: pi- + [KB / KP / KS] + -eun
    if word_lower.startswith('pi') and word_lower.endswith('eun'):
        base_word = word_lower[2:-3]
        if base_word and _get_word_tag(base_word) in ['KB', 'KP', 'KS']:
            return 'KB'

    return None

# ==== 4. Ambil semua file teks ====
files = glob.glob("datatext/*.txt")

all_data = []
oov_words = set()

# ==== 5. Proses setiap file ====
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    # Pisahkan kalimat (akhiran .!? atau baris baru)
    sentences = re.split(r'(?<=[.!?])\s+|\n+', text.strip())
    sentences = [s for s in sentences if s]

    for i, sentence in enumerate(sentences, start=1):
        tokens = re.findall(r"\w+|[^\w\s]", sentence, re.UNICODE)

        for w in tokens:
            w_norm = w.lower().replace("é", "e")  # normalisasi huruf
            sumber_tag = "Corpus"

            if w_norm.isdigit():
                pos_uni, pos_cus = "NUM", "ANGKA"
            elif re.match(r"^\W+$", w_norm):  # tanda baca
                pos_uni, pos_cus = "PUNCT", "TANDA_BACA"
            else:
                pos_uni = corpus_universal.get(w_norm, "NOUN")  # default = NOUN
                pos_cus = corpus_custom.get(w_norm, "KB")       # default = KB (kata benda)

                # Kalau OOV, coba rule-based
                if pos_uni == "NOUN" and pos_cus == "KB":
                    rule_tag = _apply_kb_rules(w_norm)
                    if rule_tag:
                        pos_cus = rule_tag
                        sumber_tag = "Rule"
                    else:
                        oov_words.add(w_norm)
                        sumber_tag = "OOV"


            all_data.append({
                "File": file,
                "Kalimat ke": i,
                "Kata": w,
                "POS Universal": pos_uni,
                "POS Custom": pos_cus,
                "Sumber Tag": sumber_tag
            })

# ==== 6. DataFrame hasil tagging ====
df_pos = pd.DataFrame(all_data)

# ==== 7. Statistik kata ====
total_kata = len(df_pos)
kata_oov = len([w for w in df_pos["Sumber Tag"] if w == "OOV"])
kata_rule = len([w for w in df_pos["Sumber Tag"] if w == "Rule"])
kata_corpus = total_kata - kata_oov - kata_rule

print("=== Statistik Kata ===")
print(f"Total kata        : {total_kata}")
print(f"Kata dari Corpus  : {kata_corpus}")
print(f"Kata dari Rule    : {kata_rule}")
print(f"Kata OOV          : {kata_oov}")

# Simpan kata OOV ke file
import os

# Pastikan file disimpan di folder yang valid
output_file = os.path.join(os.getcwd(), "oov.txt")

with open(output_file, "w", encoding="utf-8") as f:
    for w in sorted(oov_words):
        f.write(w + "\n")

print(f"Daftar OOV berhasil disimpan ke: {output_file}")

# ==== 8. Tabel interaktif (preview 50 baris pertama) ====
fig_table = ff.create_table(df_pos.head(50))
fig_table.show()

# ==== 9. Distribusi POS Universal ====
pos_counts_uni = df_pos["POS Universal"].value_counts().reset_index()
pos_counts_uni.columns = ["POS", "Jumlah"]

fig_uni = px.bar(
    pos_counts_uni, x="POS", y="Jumlah",
    title="Distribusi POS (Universal)", color="POS",
    text="Jumlah"
)
fig_uni.update_traces(textposition="outside")
fig_uni.show()

# ==== 10. Distribusi POS Custom ====
pos_counts_cus = df_pos["POS Custom"].value_counts().reset_index()
pos_counts_cus.columns = ["POS", "Jumlah"]

fig_cus = px.bar(
    pos_counts_cus, x="POS", y="Jumlah",
    title="Distribusi POS (Custom)", color="POS",
    text="Jumlah"
)
fig_cus.update_traces(textposition="outside")
fig_cus.show()

# ==== 11. Perbandingan Universal vs Custom ====
comparison = df_pos.groupby(["POS Universal", "POS Custom"]).size().reset_index(name="Count")

fig_comp = px.scatter(
    comparison, x="POS Universal", y="POS Custom", size="Count",
    title="Perbandingan POS Universal vs Custom", color="Count"
)
fig_comp.show()


=== Statistik Kata ===
Total kata        : 33048
Kata dari Corpus  : 16015
Kata dari Rule    : 510
Kata OOV          : 16523
Daftar OOV berhasil disimpan ke: d:\PENELITIAN\2025\PDP POS TAGGING BASA SUNDA\Code POS\sunda-pos\oov.txt


In [28]:
import re
from typing import List, Tuple, Optional

class SundaPOSTagger:
    def __init__(self, corpus_path: str):
        """
        Initialize Sunda POS Tagger with corpus text file
        Format: kata/POS per line
        """
        self.dataset = self._load_dataset(corpus_path)
        self.word_dict = self._create_word_dict()

    def _load_dataset(self, corpus_path: str) -> list:
        """Load dataset from text file format kata/POS"""
        data = []
        try:
            with open(corpus_path, "r", encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line or line.startswith("#"):  # skip kosong/komentar
                        continue
                    if "/" in line:
                        word, tag = line.split("/", 1)
                        word, tag = word.strip(), tag.strip()
                        data.append((word, tag))
            return data
        except Exception as e:
            raise Exception(f"Error loading dataset: {e}")

    def _create_word_dict(self) -> dict:
        """Create dictionary for fast word lookup"""
        word_dict = {}
        for word, tag in self.dataset:
            word = word.lower()
            if "," in word:
                words = [w.strip() for w in word.split(",")]
                for w in words:
                    word_dict[w] = tag
            else:
                word_dict[word] = tag
        return word_dict

    def _get_word_tag(self, word: str) -> Optional[str]:
        """Get tag for a word from dataset"""
        return self.word_dict.get(word.lower())

    def _check_infiks_ar(self, word: str) -> Tuple[bool, str]:
        """Check for -ar- infix pattern"""
        if len(word) >= 4 and word[1] == "a" and word[2] == "r":
            base_word = word[0] + word[3:]
            return True, base_word
        return False, word

    def _apply_kb_rules(self, word: str) -> Optional[str]:
        """Apply KB rules"""
        word_lower = word.lower()

        # Rule: -ar- infix
        has_ar_infix, word_without_ar = self._check_infiks_ar(word_lower)
        if has_ar_infix:
            base_tag = self._get_word_tag(word_without_ar)
            if base_tag == "KB":
                return "KB"
            word_lower = word_without_ar

        # Rule: ka- + KS
        if word_lower.startswith("ka"):
            base_word = word_lower[2:]
            if base_word and self._get_word_tag(base_word) == "KS":
                return "KB"

        # Rule: pa- + [KB/KP/KS]
        if word_lower.startswith("pa"):
            base_word = word_lower[2:]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS"]:
                    return "KB"

        # Rule: pang- + [KB/KS]
        if word_lower.startswith("pang"):
            base_word = word_lower[4:]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KS"]:
                    return "KB"

        # Rule: ar- + KB
        if word_lower.startswith("ar"):
            base_word = word_lower[2:]
            if base_word and self._get_word_tag(base_word) == "KB":
                return "KB"

        # Rule: [KB/KP/KS] + -an
        if word_lower.endswith("an"):
            base_word = word_lower[:-2]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS"]:
                    return "KB"

        # Rule: [KB/KP] + -eun
        if word_lower.endswith("eun"):
            base_word = word_lower[:-3]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP"]:
                    return "KB"

        # Rule: [KB/KP/KS/WIL] + -na
        if word_lower.endswith("na"):
            base_word = word_lower[:-2]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS", "WIL"]:
                    return "KB"

        # Rule: ka- + [KB/KP/KS] + -an
        if word_lower.startswith("ka") and word_lower.endswith("an"):
            base_word = word_lower[2:-2]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS"]:
                    return "KB"

        # Rule: pa- + [KB/KP/KS] + -an
        if word_lower.startswith("pa") and word_lower.endswith("an"):
            base_word = word_lower[2:-2]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS"]:
                    return "KB"

        # Rule: per- + KP + -an
        if word_lower.startswith("per") and word_lower.endswith("an"):
            base_word = word_lower[3:-2]
            if base_word and self._get_word_tag(base_word) == "KP":
                return "KB"

        # Rule: pi- + KB + -an
        if word_lower.startswith("pi") and word_lower.endswith("an"):
            base_word = word_lower[2:-2]
            if base_word and self._get_word_tag(base_word) == "KB":
                return "KB"

        # Rule: pi- + [KB/KP/KS] + -eun
        if word_lower.startswith("pi") and word_lower.endswith("eun"):
            base_word = word_lower[2:-3]
            if base_word:
                base_tag = self._get_word_tag(base_word)
                if base_tag in ["KB", "KP", "KS"]:
                    return "KB"

        return None

    def tag_word(self, word: str) -> str:
        """Tag a single word"""
        exact_tag = self._get_word_tag(word)
        if exact_tag:
            return exact_tag
        kb_tag = self._apply_kb_rules(word)
        if kb_tag:
            return kb_tag
        return "UNK"

    def tag_sentence(self, sentence: str) -> List[Tuple[str, str]]:
        """Tag all words in a sentence"""
        words = re.findall(r"\b\w+\b", sentence)
        return [(word, self.tag_word(word)) for word in words]

    def tag_sentence_string(self, sentence: str) -> str:
        tagged_words = self.tag_sentence(sentence)
        return " ".join([f"{w}/{t}" for w, t in tagged_words])


# === Main ===
if __name__ == "__main__":
    # Load corpus kata/POS
    tagger = SundaPOSTagger("corpus/pos-sun.txt")

    print("=== Sunda POS Tagger ===\n")

    # Baca kalimat uji dari file multi-dokumen
    with open("datatext/babasan-paribasa.txt", "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f if line.strip()]

    print("Tag kalimat dari file:")
    for sentence in sentences:
        print(f"Input : {sentence}")
        print(f"Output: {tagger.tag_sentence_string(sentence)}\n")


=== Sunda POS Tagger ===

Tag kalimat dari file:
Input : Abang-abang lambé = beureum biwirna wungkul, caritaan nu ngan saukur keur ngangeunahkeun haté nu séjén.
Output: Abang/UNK abang/UNK lambé/UNK beureum/KS biwirna/KB wungkul/KT caritaan/UNK nu/KG ngan/KT saukur/UNK keur/UNK ngangeunahkeun/UNK haté/UNK nu/KG séjén/UNK

Input : Abis bulan, abis uang = béak bulán, duit gajih ogé béak deuih.
Output: Abis/UNK bulan/KB abis/UNK uang/UNK béak/UNK bulán/UNK duit/UNK gajih/UNK ogé/UNK béak/UNK deuih/UNK

Input : Abong biwir teu diwengku, abong létah teu tulangan = jalma anu ngomongna sakarep-karep, ngomong teu reujeung, wiwaha.
Output: Abong/KT biwir/KB teu/PL diwengku/UNK abong/KT létah/UNK teu/PL tulangan/UNK jalma/UNK anu/UNK ngomongna/UNK sakarep/UNK karep/UNK ngomong/UNK teu/PL reujeung/UNK wiwaha/KB

Input : Adep-hidép = kumawula (ka salaki)
Output: Adep/UNK hidép/UNK kumawula/UNK ka/UNK salaki/KB

Input : Adil paramarta = kacida adilna.
Output: Adil/UNK paramarta/KT kacida/KT adilna/