# Part A: Data Ingestion & Vector Database
Build a vector database from cloud cost optimization sources using OpenAI embeddings and PGVector.

## 1. Setup & Imports

In [12]:
import os
import json
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter

# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector
from tqdm import tqdm
import time

# Load environment variables (python-dotenv reads them from a .env file when one
# is present); OPENAI_API_KEY is later read from the environment via os.getenv.
load_dotenv()

# Visible confirmation that the setup cell ran to completion.
print("✅ Imports loaded successfully")

✅ Imports loaded successfully


## 2. Configuration

In [None]:
# OpenAI Configuration
EMBEDDING_MODEL = "text-embedding-3-small"

# PGVector Configuration
# The connection string may be overridden with the PGVECTOR_CONNECTION
# environment variable (e.g. from .env) so that real credentials never have to
# be committed with the notebook. The fallback is the original local
# development database.
CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)
COLLECTION_NAME = "cloud_cost_optimization"

# Chunking Configuration
# Sizes are expressed in tokens; the chunking cell converts them to characters
# with the rough "4 characters per token" approximation.
CHUNK_SIZE = 400  # tokens (roughly 1600 characters)
CHUNK_OVERLAP = 50  # tokens (roughly 200 characters)

print(f"📝 Using embedding model: {EMBEDDING_MODEL}")
print(f"💾 Collection name: {COLLECTION_NAME}")

## 3. Define Data Sources

In [None]:
# Load sources from YAML configuration for better management
import yaml

# Read the file explicitly as UTF-8: without an encoding argument open() uses
# the platform default, which can mangle or reject non-ASCII source names.
with open('data/sources.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Each entry is expected to carry at least 'name', 'url' and 'provider',
# the keys that the rest of the notebook reads from it.
SOURCES = config['sources']

print(f"📚 Configured {len(SOURCES)} data sources")
for source in SOURCES:
    print(f"  - {source['name']} ({source['provider']})")

## 4. Scrape and Clean Content

In [None]:
def scrape_content(url, timeout=30):
    """Download a web page and return its text content with boilerplate removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Defaults
            to 30, the value that was previously hard-coded.

    Returns:
        The page text with <script>, <style>, <nav>, <footer> and <header>
        elements removed and empty lines dropped (one stripped line per row),
        or None when fetching or parsing fails. Failures are printed, not
        raised.
    """
    # Browser-like User-Agent; presumably some sources reject the default
    # python-requests one (not verifiable from this file).
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into exceptions so they are reported below
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Remove non-content elements: script, style, nav, footer and header
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Extract the remaining text, one text node per line
        text = soup.get_text(separator='\n', strip=True)

        # Clean up extra whitespace: strip every line and drop empty ones
        stripped_lines = (line.strip() for line in text.split('\n'))
        return '\n'.join(line for line in stripped_lines if line)
    except Exception as e:
        # Deliberately broad: scraping is best-effort, and one failing source
        # must not abort the whole ingestion run.
        print(f"Error scraping {url}: {e}")
        return None

# Fetch every configured source, keeping only the pages that yielded text.
# The full source entry travels along as the document's metadata.
raw_documents = []
for src in tqdm(SOURCES, desc="Scraping sources"):
    page_text = scrape_content(src['url'])
    if page_text:
        record = {'content': page_text, 'metadata': src}
        raw_documents.append(record)
        print(f"Scraped {src['name']}: {len(page_text)} characters")
    # Pause between requests (runs after failed scrapes too)
    time.sleep(1)

print(f"\n📄 Total documents scraped: {len(raw_documents)}")

## 5. Chunk Documents

In [None]:
# Initialize text splitter.
# Length is measured in characters (length_function=len), so the token-based
# settings are converted using the rough 4-characters-per-token approximation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE * 4,  # Approx 4 chars per token
    chunk_overlap=CHUNK_OVERLAP * 4,
    length_function=len,
    separators=['\n\n', '\n', '. ', ' ', '']
)

# Create Document objects and chunk them
all_chunks = []
chunk_id = 1  # 1-based running id across all documents

for doc in raw_documents:
    # Split text into chunks
    chunks = text_splitter.split_text(doc['content'])

    # Create Document objects with metadata
    for chunk in chunks:
        all_chunks.append(
            Document(
                page_content=chunk,
                metadata={
                    'id': chunk_id,
                    'source': doc['metadata']['name'],
                    'url': doc['metadata']['url'],
                    'provider': doc['metadata']['provider']
                }
            )
        )
        chunk_id += 1

# Guard the average: when nothing was scraped there are no chunks, and the
# integer division would otherwise raise ZeroDivisionError.
total_chunk_chars = sum(len(c.page_content) for c in all_chunks)
avg_chunk_chars = total_chunk_chars // len(all_chunks) if all_chunks else 0

print(f"✂️ Created {len(all_chunks)} chunks")
print(f"📊 Average chunk size: {avg_chunk_chars} characters")

## 6. Initialize Embeddings & Vector Store

In [None]:
# Embedding client: the API key is taken from the OPENAI_API_KEY environment
# variable (None is passed through when it is not set).
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ.get('OPENAI_API_KEY'),
    model=EMBEDDING_MODEL,
)

# PGVector store for the configured collection, embedding with the client above
vector_store = PGVector(
    connection=CONNECTION,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

print("✅ Vector store initialized")

## 7. Add Documents to Vector Store

In [None]:
# Upload the chunks in fixed-size slices to avoid rate limits
BATCH_SIZE = 50
total_added = 0

for start in tqdm(range(0, len(all_chunks), BATCH_SIZE), desc="Adding to vector DB"):
    batch = all_chunks[start:start + BATCH_SIZE]

    # The running chunk id stored in the metadata doubles as the store id
    vector_store.add_documents(
        batch,
        ids=[str(chunk.metadata['id']) for chunk in batch],
    )
    total_added += len(batch)

    # Rate limiting for OpenAI API
    time.sleep(1)

print(f"\n✅ Added {total_added} chunks to vector database")

## 8. Generate Statistics

In [None]:
# Calculate statistics
from collections import Counter

# Chunks per provider, in first-seen order; converted to a plain dict so the
# value stays a regular JSON-serializable mapping.
provider_counts = dict(Counter(c.metadata['provider'] for c in all_chunks))

# Guard the average: with no chunks (e.g. every scrape failed) the integer
# division would raise ZeroDivisionError before any statistics were saved.
total_chunk_chars = sum(len(c.page_content) for c in all_chunks)
avg_chunk_chars = total_chunk_chars // len(all_chunks) if all_chunks else 0

stats = {
    'total_sources': len(SOURCES),
    'total_documents_scraped': len(raw_documents),
    'total_chunks': len(all_chunks),
    'embedding_model': EMBEDDING_MODEL,
    'embedding_dimensions': 1536,  # text-embedding-3-small dimension
    'chunk_size_config': CHUNK_SIZE,
    'chunk_overlap': CHUNK_OVERLAP,
    'avg_chunk_size_chars': avg_chunk_chars,
    'provider_breakdown': provider_counts,
    'vector_db': 'pgvector',
    'collection_name': COLLECTION_NAME
}

# Save statistics (create the output directory if it is missing)
os.makedirs('data', exist_ok=True)
with open('data/ingestion_stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2)

# Print summary
print("\n" + "="*50)
print("📊 INGESTION STATISTICS")
print("="*50)
print(f"Total Sources: {stats['total_sources']}")
print(f"Documents Scraped: {stats['total_documents_scraped']}")
print(f"Total Chunks: {stats['total_chunks']}")
print(f"Embedding Model: {stats['embedding_model']}")
print(f"Embedding Dimensions: {stats['embedding_dimensions']}")
print(f"Average Chunk Size: {stats['avg_chunk_size_chars']} characters")
print("\nProvider Breakdown:")
for provider, count in provider_counts.items():
    print(f"  {provider}: {count} chunks")
print("="*50)
print("\n✅ Statistics saved to data/ingestion_stats.json")

## 9. Test Query (Optional)

In [None]:
# Smoke test: run a single similarity search and show the top three matches
test_query = "How to reduce S3 storage costs?"
results = vector_store.similarity_search(test_query, k=3)

print(f"🔍 Test Query: '{test_query}'\n")
for rank, hit in enumerate(results, start=1):
    # Only the first 200 characters of each chunk are shown as a preview
    preview = hit.page_content[:200]
    print(f"Result {rank}:")
    print(f"Source: {hit.metadata['source']}")
    print(f"Content: {preview}...\n")