

### Databricks Documentation RAG Ingestion Pipeline

This notebook implements a high-quality RAG ingestion pipeline with:

- Data reading from Databricks tables
- Semantic chunking for context-aware splitting
- Metadata extraction for enhanced retrieval

In [None]:
from ingestion_pipeline import (
    read_databricks_docs,
    SemanticChunker,
    MetadataExtractor,
    process_document,
    process_all_documents
)

#### Step 1: Read Data from Databricks Table

In [None]:
# Read the documentation data from the source table and preview five rows.
docs_table = "databricks_databricks_documentation_dataset.v01.docs"
docs_df = read_databricks_docs(docs_table)
display(docs_df.limit(5))

#### Step 2: Test Semantic Chunking on a Sample Document

In [None]:
# Semantic chunker settings, gathered in one mapping so they can be inspected
# or tuned in a single place before the chunker is constructed.
chunker_settings = dict(
    model_name='all-MiniLM-L6-v2',
    similarity_threshold=0.5,
    min_chunk_size=200,
    max_chunk_size=1000,
    overlap_sentences=2,
)

# Build the chunker from the settings above.
chunker = SemanticChunker(**chunker_settings)

In [None]:
# Test on a sample document: take the first row of docs_df and chunk its content.
sample_doc = docs_df.first()

# On a PySpark DataFrame, first() yields None when there are no rows. Fail here
# with a clear message rather than with "'NoneType' object is not subscriptable"
# on the next line (and again in every later cell that reads sample_doc).
if sample_doc is None:
    raise ValueError(
        "docs_df is empty: no sample document is available to test chunking"
    )

chunks = chunker.chunk_text(sample_doc['content'])

print(f"Created {len(chunks)} semantic chunks from sample document")
print("\nChunk Statistics:")
# Preview only the first three chunks to keep the cell output short.
for i, chunk in enumerate(chunks[:3]):
    print(f"\nChunk {i+1}:")
    print(f"  - Characters: {chunk.char_count}")
    print(f"  - Sentences: {chunk.sentence_count}")
    print(f"  - Preview: {chunk.text[:200]}...")

#### Step 3: Test Metadata Extraction

In [None]:
# Create the metadata extractor and run it once against the sample document.
extractor = MetadataExtractor()
metadata = extractor.extract_metadata(
    doc_id=sample_doc['id'],
    url=sample_doc['url'],
    content=sample_doc['content'],
)

# Scalar fields describing the document as a whole.
print("Extracted Metadata:")
print(f"  Title: {metadata['title']}")
print(f"  Document Type: {metadata['document_type']}")
print(f"  URL Category: {metadata['url_category']}")
print(f"  URL Path: {metadata['url_path']}")

# Headers: total count, then the first five rendered with '#' repeated per level.
headers = metadata['headers']
print(f"\nHeaders ({len(headers)}):")
for header in headers[:5]:
    marker = '#' * header['level']
    print(f"  {marker} {header['text']}")

# Code blocks: total count, then language and length of the first three.
code_blocks = metadata['code_blocks']
print(f"\nCode Blocks: {len(code_blocks)}")
for position, block in enumerate(code_blocks[:3], start=1):
    print(f"  Block {position}: {block['language']} ({block['length']} chars)")

# Links: total count, then text and target of the first five.
links = metadata['links']
print(f"\nLinks: {len(links)}")
for link in links[:5]:
    print(f"  - {link['text']}: {link['url']}")

# Keywords: one line per group, showing at most five entries of each.
print("\nKeywords:")
for group, entries in metadata['keywords'].items():
    print(f"  {group}: {entries[:5]}")

# Statistics: one line per name/value pair.
print("\nStatistics:")
for stat_name, stat_value in metadata['statistics'].items():
    print(f"  {stat_name}: {stat_value}")



#### Step 4: Process a Single Document with Chunking and Metadata Extraction

In [None]:
# Run the sample document through process_document, passing both the chunker
# and the metadata extractor.
processed_chunks = process_document(
    doc_id=sample_doc['id'],
    url=sample_doc['url'],
    content=sample_doc['content'],
    chunker=chunker,
    extractor=extractor,
)

print(f"Processed document into {len(processed_chunks)} chunks with metadata")

# Guard the preview: indexing [0] on an empty result would raise IndexError and
# stop a Run All, so report the empty case instead.
if len(processed_chunks) > 0:
    print("\nFirst chunk details:")
    chunk = processed_chunks[0]
    for key, value in chunk.items():
        if key != 'text':  # Skip full text for brevity
            print(f"  {key}: {value}")
    print(f"\nText preview: {chunk['text'][:300]}...")
else:
    print("\nNo chunks were produced for the sample document; nothing to preview.")

#### Step 5: Process All Documents (Batch Processing)

In [None]:
# Destination table for the processed chunks (a Delta table, per the original note).
output_table = "databricks_databricks_documentation_dataset.v01.processed_chunks"

# Run the batch pipeline over every document in docs_df.
chunks_df = process_all_documents(docs_df=docs_df, output_table=output_table)

#### Step 6: Analyze Results

In [None]:
# Chunk counts per document type, largest group first.
# NOTE(review): the grouping column is 'doc_type', while the extractor's metadata
# key shown earlier is 'document_type' — confirm the column name in chunks_df.
print("Document Type Distribution:")
(
    chunks_df
    .groupBy('doc_type')
    .count()
    .orderBy('count', ascending=False)
    .show()
)

# Summary statistics for the two chunk-size columns.
print("Chunk Size Statistics:")
(
    chunks_df
    .select('char_count', 'sentence_count')
    .describe()
    .show()
)

# Rich preview of the first ten processed chunks.
print("Sample Processed Chunks:")
display(chunks_df.limit(10))