# Data Exploration and Processing
Process all documents into chunks with metadata

In [None]:
import sys
sys.path.append('../src')

from chunking import chunk_document
from preprocessing import normalize_arabic
import glob
import json

In [None]:
# Process all documents: chunk every .txt file under ../data/<category>/ and
# record one metadata dict per chunk, so that metadata[i] describes all_chunks[i].
all_chunks = []
metadata = []

categories = ['health', 'education', 'business', 'transportation', 'justice', 'housing', 'culture', 'info']

for cat in categories:
    # glob yields paths in filesystem-dependent order; sorting keeps the chunk
    # order (and therefore the saved index) reproducible across runs and machines.
    files = sorted(glob.glob(f'../data/{cat}/*.txt'))

    for filepath in files:
        try:
            # Stage this file's results first so a failure part-way through
            # cannot leave all_chunks and metadata with different lengths.
            chunks = list(chunk_document(filepath, chunk_size=512, overlap=128))
            file_meta = [
                {
                    'source_file': filepath,
                    'category': cat,
                    'chunk_id': i,
                    'chunk_length': len(chunk),
                }
                for i, chunk in enumerate(chunks)
            ]
        except Exception as e:
            # Best-effort batch job: report the failing file and continue with the rest.
            print(f"Error processing {filepath}: {e}")
        else:
            all_chunks.extend(chunks)
            metadata.extend(file_meta)

In [None]:
# Corpus summary: distinct source files, total chunk count, then a per-category tally.
print(f"Total documents: {len({m['source_file'] for m in metadata})}")
print(f"Total chunks: {len(all_chunks)}")
print(f"\nChunks per category:")
for cat in categories:
    # Count the metadata entries tagged with this category.
    count = sum(1 for m in metadata if m['category'] == cat)
    print(f"  {cat}: {count}")

In [None]:
# Save the chunk texts and their metadata as two JSON files under ../index/.
import os

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing (no-op when it is already there).
os.makedirs('../index', exist_ok=True)

# ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
with open('../index/corpus_chunks.json', 'w', encoding='utf-8') as f:
    json.dump(all_chunks, f, ensure_ascii=False, indent=2)

with open('../index/corpus_meta.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, ensure_ascii=False, indent=2)

print("\n✅ Saved to index/ directory")

In [None]:
# Sample inspection: show the first chunk and its metadata as a quick sanity check.
print("\n=== Sample Chunk ===")
if all_chunks:
    sample_chunk = all_chunks[0]
    sample_meta = metadata[0]
    # Only add the ellipsis when the preview is actually cut short.
    ellipsis = "..." if len(sample_chunk) > 300 else ""
    print(f"Category: {sample_meta['category']}")
    print(f"Length: {sample_meta['chunk_length']}")
    print(f"\nContent:\n{sample_chunk[:300]}{ellipsis}")
else:
    # Nothing was chunked (e.g. no input files matched); avoid an IndexError on [0].
    print("No chunks available to inspect.")