In [None]:
# Notebook 01: Khmer Text Segmentation Exploration
# Objective: Explore raw Khmer corpus, verify segmentation quality, and understand tokenization patterns.

import os
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import font_manager

sys.path.append('..')  # must run before the `src.*` imports below so the parent directory is searched

from src.preprocessing.text_loader import load_corpus
from src.preprocessing.unicode_normalizer import normalize_text
from src.segmentation.segmenter_interface import KhmerSegmenter

In [None]:
# Load the raw corpus and print a per-document size summary.
# `corpus` is iterated with .items() and indexed by document id in later cells,
# so it is presumably a dict of {document id: text} -- confirm against load_corpus.
raw_dir = "../data/raw"
corpus = load_corpus(raw_dir)

print("=== CORPUS OVERVIEW ===")
print("Number of documents: {}".format(len(corpus)))
for doc_name, doc_text in corpus.items():
    char_total = len(doc_text)
    # split() with no arguments breaks on whitespace only, so this is a rough
    # chunk count rather than a real word count.
    chunk_total = len(doc_text.split())
    print(f"{doc_name}: {char_total} characters, {chunk_total} word-like tokens")

In [None]:
# Create the segmenter once; the token-statistics cell further down reuses it.
segmenter = KhmerSegmenter()

# The first 200 characters of three specific documents are used as samples.
# A missing document id raises KeyError here.
sample_doc_ids = ('khmer_articles.txt', 'khmer_news_corpus.txt', 'khmer_corpus.txt')
sample_texts = [corpus[sample_id][:200] for sample_id in sample_doc_ids]

print("=== SEGMENTATION EXAMPLES ===")
for sample_no, excerpt in enumerate(sample_texts, start=1):
    # Normalise first, then segment the normalised excerpt.
    pieces = segmenter.segment(normalize_text(excerpt))

    print(f"\n--- Sample {sample_no} ---")
    print(f"Original (first 100 chars): {excerpt[:100]}...")
    print(f"Segmented (first 20 tokens): {' | '.join(pieces[:20])}")
    print(f"Token count: {len(pieces)}")

In [None]:
# Segment every document and flatten the results into one token list.
# `all_tokens` and `token_lengths` are consumed by the cells that follow.
all_tokens = [
    piece
    for _doc_name, doc_text in corpus.items()
    for piece in segmenter.segment(normalize_text(doc_text))
]

# Length of each token as reported by len().
token_lengths = list(map(len, all_tokens))

total_tokens = len(all_tokens)
vocabulary_size = len(set(all_tokens))
mean_length = np.mean(token_lengths)
median_length = np.median(token_lengths)

print("=== TOKEN STATISTICS ===")
print(f"Total tokens: {total_tokens}")
print(f"Unique tokens: {vocabulary_size}")
print(f"Average token length: {mean_length:.2f} characters")
print(f"Median token length: {median_length:.2f} characters")

In [None]:
# Plot token length distribution
#
# One bar per integer token length from 1 to MAX_PLOTTED_LENGTH; anything longer
# is pooled into the final bar, which is labelled "<MAX_PLOTTED_LENGTH>+".
#
# Why not bins=range(1, 21): a sequence of bins is read as bin *edges*, and only
# the last bin is closed on the right. That merged lengths 19 and 20 into a
# single bar and silently discarded every token longer than 20 characters.
MAX_PLOTTED_LENGTH = 20

# Cap lengths at the overflow value so the long tail stays visible in the last bar.
capped_lengths = np.minimum(token_lengths, MAX_PLOTTED_LENGTH)

# Half-integer edges (0.5, 1.5, ..., 20.5) centre each bar on an integer length.
# Zero-length tokens, if any, fall below the first edge and are not plotted.
bin_edges = np.arange(0.5, MAX_PLOTTED_LENGTH + 1.5)

fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(capped_lengths, bins=bin_edges, edgecolor='black')
ax.set_title('Distribution of Khmer Token Lengths')
ax.set_xlabel('Token Length (characters)')
ax.set_ylabel('Frequency')

# Label every integer length and mark the last bar as the overflow bucket.
tick_positions = list(range(1, MAX_PLOTTED_LENGTH + 1))
tick_labels = [str(length) for length in tick_positions[:-1]] + [f'{MAX_PLOTTED_LENGTH}+']
ax.set_xticks(tick_positions)
ax.set_xticklabels(tick_labels)

plt.show()

In [None]:
# Count how often each token occurs and keep the 20 most frequent
# (token, count) pairs; `most_common` is reused by the bar-chart cell below.
token_counts = Counter(all_tokens)
most_common = token_counts.most_common(20)

print("=== MOST COMMON TOKENS ===")
for token_and_count in most_common:
    print("{}: {} occurrences".format(*token_and_count))

In [None]:
# Plot top 20 tokens with Khmer font support
#
# Draws a horizontal bar chart of the (token, count) pairs in `most_common`,
# most frequent token at the top.
#
# Changes from the earlier version of this cell:
#   * An empty `most_common` is reported instead of crashing on the zip unpack.
#   * The Khmer font is applied to the tick labels only; plt.rcParams is no
#     longer modified, so later figures keep their default font.
#   * The font is picked from the candidates Matplotlib actually has registered,
#     with a visible warning when none of them is installed.

# Font families tried in order; the first registered one is used.
KHMER_FONT_CANDIDATES = ('Khmer OS', 'Khmer OS System', 'Khmer OS Siemreap')

if not most_common:
    # zip(*[]) yields nothing, so unpacking into two names would raise ValueError.
    print("No tokens to plot: the frequency table is empty.")
else:
    tokens, counts = zip(*most_common)

    # Family names of every font file Matplotlib's font manager knows about.
    registered_fonts = {entry.name for entry in font_manager.fontManager.ttflist}
    khmer_font = next(
        (family for family in KHMER_FONT_CANDIDATES if family in registered_fonts),
        None,
    )
    if khmer_font is None:
        print(
            "WARNING: none of the fonts "
            f"{', '.join(KHMER_FONT_CANDIDATES)} is installed; "
            "Khmer tick labels will probably not render correctly."
        )
    # Only pass a font override when a Khmer-capable family was found.
    label_style = {'fontname': khmer_font} if khmer_font else {}

    fig, ax = plt.subplots(figsize=(12, 8))
    bar_positions = list(range(len(tokens)))
    ax.barh(bar_positions, counts)
    ax.set_yticks(bar_positions)
    # NOTE(review): even with a Khmer font, Matplotlib's default text layout may
    # not shape Khmer clusters correctly -- check the rendered labels by eye.
    ax.set_yticklabels(tokens, **label_style)

    ax.set_xlabel('Frequency')
    ax.set_title('Top 20 Most Frequent Khmer Tokens')
    ax.invert_yaxis()  # highest count first, reading top to bottom
    fig.tight_layout()  # keep long tick labels from being clipped
    plt.show()


In [None]:
# Key observations, printed as a closing summary.
# NOTE(review): this text is a fixed literal, not computed from the results
# above -- re-check each point after changing the corpus or the segmenter.
observation_lines = [
    "",
    "=== KEY OBSERVATIONS ===",
    "1. Segmentation Quality: khmer-nltk successfully tokenized Khmer text without explicit spacing",
    "2. Token Length: Most tokens are 1-5 characters (typical for Khmer script)",
    "3. High-frequency words: Common function words appear at the top of frequency list",
    "4. Corpus Variation: Different document types show varying token distributions",
    "",
]
print("\n".join(observation_lines))