# Test Preprocessing and Chunking
Verify that preprocessing and chunking work correctly on sample files

In [None]:
import sys
sys.path.append('../src')

from preprocessing import normalize_arabic, clean_document
from chunking import chunk_document
import glob

## Test 1: Arabic Normalization

In [None]:
# Sample inputs for a visual check of normalize_arabic. Each string carries
# a different orthographic feature; nothing is asserted, the output is
# meant to be read by eye.
test_cases = [
    "اَلسَّلامُ عَلَيْكُم",  # Diacritics (tashkeel) on most letters
    "إستفسار أو آراء",      # Alef with hamza below / hamza above / madda
    "مدرسة ومدرسه",         # Same word ending in taa marbuta vs. haa
    "على ومحمّد"            # Alef maqsura plus a shadda
]

# Print every sample directly above its normalized form, followed by a
# blank separator line.
for sample in test_cases:
    result = normalize_arabic(sample)
    print(f"Original:   {sample}", f"Normalized: {result}", "", sep="\n")

## Test 2: Document Cleaning on Real Files

In [None]:
# Test on 3 actual files
# Runs clean_document followed by normalize_arabic on a small sample of the
# corpus and prints lengths plus a before/after preview for manual review.
#
# glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted first to make the sampled files the same on every run.
files = sorted(glob.glob('../data/**/*.txt', recursive=True))[:3]

if not files:
    # Without this, a wrong working directory or missing data directory
    # makes the cell print nothing, which is easy to mistake for success.
    print("WARNING: no .txt files found under ../data - nothing was checked")

for filepath in files:
    print(f"\n{'='*60}")
    print(f"FILE: {filepath}")
    print('='*60)

    with open(filepath, 'r', encoding='utf-8') as f:
        original = f.read()

    # Same order as used here throughout: clean first, then normalize.
    cleaned = clean_document(original)
    normalized = normalize_arabic(cleaned)

    # Length after each stage shows how much text each step removed.
    print(f"\nORIGINAL LENGTH: {len(original)} chars")
    print(f"CLEANED LENGTH: {len(cleaned)} chars")
    print(f"NORMALIZED LENGTH: {len(normalized)} chars")

    print("\n--- BEFORE (first 150 chars) ---")
    print(original[:150])

    print("\n--- AFTER (first 150 chars) ---")
    print(normalized[:150])

## Test 3: Chunking Strategy

In [None]:
# Test chunking on 5 files
# Chunks a small sample of the corpus, prints a preview of every chunk and
# then summary statistics over all chunk lengths.
import os  # local import: only this cell needs it (portable basename)

# Sorted so the sampled files are the same on every run; glob's own
# ordering is arbitrary.
files = sorted(glob.glob('../data/**/*.txt', recursive=True))[:5]

total_chunks = 0
chunk_sizes = []

for filepath in files:
    chunks = chunk_document(filepath, chunk_size=512, overlap=128)
    total_chunks += len(chunks)

    # os.path.basename handles both '/' and the platform separator, unlike
    # splitting on '/' which leaves the full path intact on Windows.
    print(f"\n{os.path.basename(filepath)}: {len(chunks)} chunks")

    # NOTE(review): assumes each chunk is a string (len() and slicing are
    # used on it) - confirm against chunk_document.
    for i, chunk in enumerate(chunks, start=1):
        chunk_sizes.append(len(chunk))
        print(f"  Chunk {i}: {len(chunk)} chars")
        print(f"    Preview: {chunk[:100]}...")

print(f"\n{'='*60}")
print(f"Total chunks created: {total_chunks}")
if chunk_sizes:
    print(f"Average chunk size: {sum(chunk_sizes)/len(chunk_sizes):.0f} chars")
    print(f"Min chunk size: {min(chunk_sizes)} chars")
    print(f"Max chunk size: {max(chunk_sizes)} chars")
else:
    # Averages/min/max are undefined for an empty list; report the cause
    # instead of failing with ZeroDivisionError / ValueError.
    print("No chunks were produced - check that ../data contains .txt files")

## Test 4: Verify Chunk Quality

In [None]:
# Pick one file and examine chunks in detail
# Sorted so the same file is inspected on every run (glob order is
# arbitrary).
education_files = sorted(glob.glob('../data/education/*.txt'))
if not education_files:
    # Fail with an explicit message rather than a bare IndexError from [0].
    raise FileNotFoundError("No .txt files found in ../data/education/")

test_file = education_files[0]
chunks = chunk_document(test_file, chunk_size=512, overlap=128)

print(f"Testing file: {test_file}")
print(f"Number of chunks: {len(chunks)}\n")

# Show first 3 chunks only: zip stops as soon as range(1, 4) is exhausted,
# or earlier when the file produced fewer than three chunks.
for i, chunk in zip(range(1, 4), chunks):
    print(f"\n{'='*60}")
    print(f"CHUNK {i} ({len(chunk)} chars)")
    print('='*60)
    print(chunk)

## ✅ Checkpoint

If every cell above runs without errors and the printed output looks correct on inspection:
- Arabic normalization works correctly
- Document cleaning preserves structure
- Chunking creates reasonable-sized pieces
- Ready to process all documents!