# Cell 1: Markdown

In [1]:
"""
# Data Preparation for FAISS Vector Database

This notebook demonstrates:
1. Loading Python/Azure documentation
2. Data structure and exploration
3. Preparing documents for embedding

**Run this notebook first** before proceeding to indexing.
"""

'\n# Data Preparation for FAISS Vector Database\n\nThis notebook demonstrates:\n1. Loading Python/Azure documentation\n2. Data structure and exploration\n3. Preparing documents for embedding\n\n**Run this notebook first** before proceeding to indexing.\n'

# Cell 2: Setup

In [3]:
import sys
# Add the parent directory to the module search path before importing from
# `src` on the next line. Presumably the notebook is run from a subdirectory
# of the project root — confirm against the repository layout.
sys.path.append('..')
from src.data_loader import DocumentationLoader
import json

# Cell 3: Create sample documents

In [4]:

# Build the sample corpus through the project's loader. `loader` and `docs`
# stay bound at notebook level because later cells read `docs`.
loader = DocumentationLoader()
docs = loader.create_sample_docs()

# Report how many documents the loader returned.
doc_total = len(docs)
print(f"Total documents created: {doc_total}")

✓ Created 15 sample documents
✓ Saved to: data\processed\sample_docs.json
Total documents created: 15


# Cell 4: Explore document structure

In [5]:

print("\n--- Sample Document Structure ---")
print(json.dumps(docs[0], indent=2))


--- Sample Document Structure ---
{
  "title": "Python Azure Functions - Getting Started",
  "category": "Azure Functions",
  "text": "Azure Functions is a serverless compute service that lets you run event-triggered code without having to explicitly provision or manage infrastructure. You can use Python to write Azure Functions. Azure Functions supports Python versions 3.7, 3.8, 3.9, and 3.10. The Python worker for Azure Functions is based on the Azure Functions Python library.",
  "url": "https://docs.microsoft.com/azure/functions/python",
  "tags": [
    "azure",
    "serverless",
    "python",
    "functions"
  ]
}


# Cell 5: Analyze categories

In [6]:

from collections import Counter

# Collect each document's category label, then tally them.
# `category_counts` is read again by the per-category listing cell.
categories = [entry['category'] for entry in docs]
category_counts = Counter(categories)

print("\n--- Document Categories ---")
# most_common() yields (label, count) pairs ordered from most to least frequent.
for label, total in category_counts.most_common():
    print(f"{label}: {total} documents")


--- Document Categories ---
Python Advanced: 5 documents
Python Basics: 2 documents
Azure AI/ML: 2 documents
Azure Functions: 1 documents
Azure Storage: 1 documents
Azure Databases: 1 documents
Azure Security: 1 documents
Azure DevOps: 1 documents
Azure Messaging: 1 documents


# Cell 6: Analyze document lengths

In [7]:

# Character count of every document's `text` field.
text_lengths = [len(doc['text']) for doc in docs]

# Plain string: the header has no placeholders, so no f-prefix is needed.
print("\n--- Document Statistics ---")
print(f"Total documents: {len(docs)}")

# Guard the aggregates: with no documents the average would divide by zero
# and min()/max() would raise ValueError on an empty sequence.
if text_lengths:
    print(f"Avg text length: {sum(text_lengths) / len(text_lengths):.1f} characters")
    print(f"Min text length: {min(text_lengths)} characters")
    print(f"Max text length: {max(text_lengths)} characters")
else:
    print("No documents loaded; length statistics skipped.")


--- Document Statistics ---
Total documents: 15
Avg text length: 392.3 characters
Min text length: 347 characters
Max text length: 454 characters


# Cell 7: View sample documents by category

In [8]:

print("\n--- Sample Documents by Category ---")
for category in category_counts.keys():
    print(f"\n{category}:")
    category_docs = [d for d in docs if d['category'] == category]
    for doc in category_docs[:2]:  # Show first 2 from each category
        print(f"  - {doc['title']}")


--- Sample Documents by Category ---

Azure Functions:
  - Python Azure Functions - Getting Started

Python Basics:
  - Python Virtual Environments Best Practices
  - Python List Comprehensions

Azure Storage:
  - Azure Blob Storage Python SDK

Azure Databases:
  - Azure Cosmos DB Python SDK

Python Advanced:
  - Python Decorators Explained
  - Python Type Hints and Annotations

Azure Security:
  - Azure Key Vault with Python

Azure AI/ML:
  - Azure Machine Learning Python SDK
  - Azure Cognitive Services Python SDK

Azure DevOps:
  - Azure DevOps Python API

Azure Messaging:
  - Azure Service Bus Python SDK


# Cell 8: Markdown

In [9]:
"""
## Next Steps

Documents are now ready for embedding and indexing.

Proceed to: `02_faiss_indexing.ipynb`
"""

'\n## Next Steps\n\nDocuments are now ready for embedding and indexing.\n\nProceed to: `02_faiss_indexing.ipynb`\n'