# Part A: Data Ingestion & Vector Database
Build a vector database from cloud cost optimization sources using OpenAI embeddings and PGVector.

## 1. Setup & Imports

In [4]:
import os
import json
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import requests
from langchain_text_splitters import RecursiveCharacterTextSplitter

# from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_postgres import PGVector
from tqdm import tqdm
import time

# Load environment variables (via python-dotenv) before any cell reads them;
# OPENAI_API_KEY is fetched with os.getenv when the embeddings client is created.
load_dotenv()

print("‚úÖ Imports loaded successfully")

‚úÖ Imports loaded successfully


## 2. Configuration

In [5]:
# OpenAI Configuration
EMBEDDING_MODEL = "text-embedding-3-small"

# PGVector Configuration
# The connection string embeds database credentials, so it can be supplied through
# the PGVECTOR_CONNECTION environment variable (e.g. from the .env file). The
# fallback is the original local-development DSN, so existing setups keep working.
CONNECTION = os.getenv(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)
COLLECTION_NAME = "cloud_cost_optimization"

# Chunking Configuration
# Both values are token budgets; the chunking cell multiplies them by ~4
# characters per token before handing them to the text splitter.
CHUNK_SIZE = 400  # tokens (roughly 1600 characters)
CHUNK_OVERLAP = 50  # tokens (roughly 200 characters)

print(f"üìù Using embedding model: {EMBEDDING_MODEL}")
print(f"üíæ Collection name: {COLLECTION_NAME}")

üìù Using embedding model: text-embedding-3-small
üíæ Collection name: cloud_cost_optimization


## 3. Define Data Sources

In [6]:
# Load sources from YAML configuration for better management
import yaml

SOURCES_PATH = 'data/sources.yaml'
# Fields that the scraping and chunking cells read from every source entry
REQUIRED_SOURCE_KEYS = ('name', 'url', 'provider')

# Explicit UTF-8 so the file parses the same way regardless of the platform locale
with open(SOURCES_PATH, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# safe_load yields None for an empty file; report that clearly instead of
# failing with a TypeError/KeyError on the subscript below
if not isinstance(config, dict) or not config.get('sources'):
    raise ValueError(f"{SOURCES_PATH} must define a non-empty top-level 'sources' list")
SOURCES = config['sources']

# Validate up front so a malformed entry is reported here rather than mid-scrape
for position, source in enumerate(SOURCES, start=1):
    if not isinstance(source, dict):
        raise ValueError(f"{SOURCES_PATH}: source #{position} is not a mapping")
    missing_keys = [key for key in REQUIRED_SOURCE_KEYS if key not in source]
    if missing_keys:
        raise ValueError(
            f"{SOURCES_PATH}: source #{position} is missing {', '.join(missing_keys)}"
        )

print(f"üìö Configured {len(SOURCES)} data sources")
for source in SOURCES:
    print(f"  - {source['name']} ({source['provider']})")

üìö Configured 23 data sources
  - AWS Cost Optimization Pillar (AWS)
  - AWS S3 Storage Classes (AWS)
  - AWS EC2 Pricing (AWS)
  - AWS Cost Optimization Blog (AWS)
  - Azure Cost Management Best Practices (Azure)
  - Azure Advisor Cost Recommendations (Azure)
  - Azure Storage Cost Optimization (Azure)
  - GCP Cost Optimization Best Practices (GCP)
  - GCP Cloud Storage Cost Optimization (GCP)
  - GCP Compute Engine Pricing (GCP)
  - AWS Reserved Instances Guide (AWS)
  - AWS Spot Instances Best Practices (AWS)
  - Azure Reserved VM Instances (Azure)
  - Azure Spot Virtual Machines (Azure)
  - GCP Committed Use Discounts (GCP)
  - GCP Preemptible VM Instances (GCP)
  - AWS Lambda Cost Optimization (AWS)
  - Azure Functions Cost Optimization (Azure)
  - GCP Cloud Functions Pricing (GCP)
  - AWS RDS Cost Optimization (AWS)
  - Azure SQL Database Cost Optimization (Azure)
  - GCP Cloud SQL Cost Optimization (GCP)
  - Cloud Cost Optimization Guide - Cloudability (Third-party)


## 4. Scrape and Clean Content

In [7]:
def scrape_content(url, timeout=30):
    """Download a web page and return its text with markup and blank lines removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as the request timeout.
            Defaults to 30, the value that used to be hard-coded.

    Returns:
        The page text as one string with each non-empty line stripped and
        joined by newlines (possibly empty), or ``None`` when fetching or
        parsing raised any exception. In the failure case the error is
        printed and not re-raised, so callers must check for a falsy result.
    """
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn HTTP 4xx/5xx responses into exceptions handled below
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Drop non-content elements: scripts, styles and page chrome (nav, header, footer)
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Extract the remaining text, one fragment per line
        text = soup.get_text(separator='\n', strip=True)

        # Strip every line and discard the ones that end up empty
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        return '\n'.join(lines)
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip this URL
        print(f"Error scraping {url}: {str(e)}")
        return None

# Scrape all sources
raw_documents = []
failed_sources = []  # names of sources that yielded no content, reported after the loop
for source in tqdm(SOURCES, desc="Scraping sources"):
    content = scrape_content(source['url'])
    if content:
        raw_documents.append({
            'content': content,
            'metadata': source
        })
        # tqdm.write emits the message without breaking up the progress bar,
        # which a bare print() inside the loop does
        tqdm.write(f"Scraped {source['name']}: {len(content)} characters")
    else:
        # scrape_content returned None (error already printed) or an empty page
        failed_sources.append(source['name'])
    time.sleep(1)  # Be polite

print(f"\nüìÑ Total documents scraped: {len(raw_documents)}")
if failed_sources:
    # Make dropped sources visible so gaps in the vector store are not a surprise
    print(f"Skipped {len(failed_sources)} source(s) with no content: {', '.join(failed_sources)}")

Scraping sources:   0%|          | 0/23 [00:00<?, ?it/s]

Scraped AWS Cost Optimization Pillar: 2659 characters


Scraping sources:   4%|‚ñç         | 1/23 [00:07<02:54,  7.92s/it]

Scraped AWS S3 Storage Classes: 27231 characters


Scraping sources:   9%|‚ñä         | 2/23 [00:11<01:51,  5.30s/it]

Scraped AWS EC2 Pricing: 3109 characters


Scraping sources:  13%|‚ñà‚ñé        | 3/23 [00:15<01:34,  4.72s/it]

Scraped AWS Cost Optimization Blog: 4265 characters


Scraping sources:  17%|‚ñà‚ñã        | 4/23 [00:17<01:09,  3.66s/it]

Scraped Azure Cost Management Best Practices: 19508 characters


Scraping sources:  22%|‚ñà‚ñà‚ñè       | 5/23 [00:21<01:06,  3.68s/it]

Scraped Azure Advisor Cost Recommendations: 10547 characters


Scraping sources:  26%|‚ñà‚ñà‚ñå       | 6/23 [00:25<01:05,  3.86s/it]

Scraped Azure Storage Cost Optimization: 22145 characters


Scraping sources:  30%|‚ñà‚ñà‚ñà       | 7/23 [00:29<01:03,  4.00s/it]

Scraped GCP Cost Optimization Best Practices: 16994 characters


Scraping sources:  35%|‚ñà‚ñà‚ñà‚ñç      | 8/23 [00:37<01:20,  5.36s/it]

Scraped GCP Cloud Storage Cost Optimization: 9818 characters


Scraping sources:  39%|‚ñà‚ñà‚ñà‚ñâ      | 9/23 [00:40<01:02,  4.46s/it]

Scraped GCP Compute Engine Pricing: 100716 characters


Scraping sources:  43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 10/23 [00:46<01:06,  5.11s/it]

Scraped AWS Reserved Instances Guide: 6876 characters


Scraping sources:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 11/23 [00:50<00:54,  4.53s/it]

Scraped AWS Spot Instances Best Practices: 14332 characters


Scraping sources:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 12/23 [00:52<00:43,  3.96s/it]

Scraped Azure Reserved VM Instances: 11933 characters


Scraping sources:  57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 13/23 [00:55<00:36,  3.64s/it]

Scraped Azure Spot Virtual Machines: 9817 characters


Scraping sources:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 14/23 [00:59<00:32,  3.64s/it]

Scraped GCP Committed Use Discounts: 32038 characters


Scraping sources:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 15/23 [01:02<00:28,  3.60s/it]

Scraped GCP Preemptible VM Instances: 11282 characters


Scraping sources:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 16/23 [01:08<00:28,  4.13s/it]

Scraped AWS Lambda Cost Optimization: 29618 characters


Scraping sources:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 17/23 [01:13<00:26,  4.48s/it]

Scraped Azure Functions Cost Optimization: 21065 characters


Scraping sources:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 18/23 [01:16<00:20,  4.09s/it]

Scraped GCP Cloud Functions Pricing: 685 characters


Scraping sources:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 19/23 [01:20<00:15,  3.90s/it]

Scraped AWS RDS Cost Optimization: 35746 characters


Scraping sources:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 20/23 [01:22<00:10,  3.35s/it]

Scraped Azure SQL Database Cost Optimization: 11199 characters


Scraping sources:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 21/23 [01:26<00:07,  3.60s/it]

Error scraping https://cloud.google.com/sql/docs/mysql/optimize-costs: 404 Client Error: Not Found for url: https://cloud.google.com/sql/docs/mysql/optimize-costs


Scraping sources:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 22/23 [01:29<00:03,  3.34s/it]

Error scraping https://www.cloudability.com/blog/cloud-cost-optimization/: 404 Client Error: Not Found for url: https://www.apptio.com/products/cloudability/blog/cloud-cost-optimization/


Scraping sources: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23/23 [01:33<00:00,  4.08s/it]


üìÑ Total documents scraped: 21





## 5. Chunk Documents

In [8]:
# Initialize text splitter
# The splitter measures length in characters (length_function=len), while the
# configured sizes are token budgets, so convert with a rough 4 chars/token ratio.
CHARS_PER_TOKEN = 4
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE * CHARS_PER_TOKEN,
    chunk_overlap=CHUNK_OVERLAP * CHARS_PER_TOKEN,
    length_function=len,
    separators=['\n\n', '\n', '. ', ' ', '']
)

# Create Document objects and chunk them
all_chunks = []
chunk_id = 1  # sequential 1-based id across all sources; later used as the vector-store id

for doc in raw_documents:
    source_meta = doc['metadata']

    # Split text into chunks and wrap each one with the metadata of its source
    for chunk in text_splitter.split_text(doc['content']):
        all_chunks.append(
            Document(
                page_content=chunk,
                metadata={
                    'id': chunk_id,
                    'source': source_meta['name'],
                    'url': source_meta['url'],
                    'provider': source_meta['provider']
                }
            )
        )
        chunk_id += 1

# Guard the average: with nothing scraped there are no chunks, and dividing by
# len(all_chunks) would raise ZeroDivisionError
total_chunk_chars = sum(len(c.page_content) for c in all_chunks)
avg_chunk_chars = total_chunk_chars // len(all_chunks) if all_chunks else 0

print(f"‚úÇÔ∏è Created {len(all_chunks)} chunks")
print(f"üìä Average chunk size: {avg_chunk_chars} characters")

‚úÇÔ∏è Created 294 chunks
üìä Average chunk size: 1506 characters


## 6. Initialize Embeddings & Vector Store

In [9]:
# Embedding client: model name from the configuration cell, API key from the environment
embedding_settings = {
    'model': EMBEDDING_MODEL,
    'openai_api_key': os.getenv('OPENAI_API_KEY'),
}
embeddings = OpenAIEmbeddings(**embedding_settings)

# PGVector store bound to the configured connection and collection
vector_store = PGVector(
    connection=CONNECTION,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings,
    use_jsonb=True,
)

print("‚úÖ Vector store initialized")

‚úÖ Vector store initialized


## 7. Add Documents to Vector Store

In [10]:
# Insert the chunks in fixed-size batches, pausing between batches to stay
# under the OpenAI rate limits
BATCH_SIZE = 50
batch_starts = range(0, len(all_chunks), BATCH_SIZE)
total_added = 0

for start in tqdm(batch_starts, desc="Adding to vector DB"):
    batch = all_chunks[start:start + BATCH_SIZE]

    # Each chunk's numeric metadata id, as a string, is its vector-store id
    vector_store.add_documents(
        batch,
        ids=[str(document.metadata['id']) for document in batch],
    )
    total_added += len(batch)

    # Rate limiting for OpenAI API
    time.sleep(1)

print(f"\n‚úÖ Added {total_added} chunks to vector database")

Adding to vector DB: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [00:33<00:00,  5.59s/it]


‚úÖ Added 294 chunks to vector database





## 8. Generate Statistics

In [11]:
# Calculate statistics
# Number of chunks contributed by each provider
provider_counts = {}
for chunk in all_chunks:
    provider = chunk.metadata['provider']
    provider_counts[provider] = provider_counts.get(provider, 0) + 1

# Output vector sizes of the OpenAI embedding models. Looking the value up from
# EMBEDDING_MODEL keeps the report correct if the configured model changes;
# an unlisted model is recorded as None (null in the JSON file).
KNOWN_EMBEDDING_DIMENSIONS = {
    'text-embedding-3-small': 1536,
    'text-embedding-3-large': 3072,
    'text-embedding-ada-002': 1536,
}

# Guard the average: an empty chunk list would otherwise raise ZeroDivisionError
total_chunk_chars = sum(len(c.page_content) for c in all_chunks)
avg_chunk_size_chars = total_chunk_chars // len(all_chunks) if all_chunks else 0

stats = {
    'total_sources': len(SOURCES),
    'total_documents_scraped': len(raw_documents),
    'total_chunks': len(all_chunks),
    'embedding_model': EMBEDDING_MODEL,
    'embedding_dimensions': KNOWN_EMBEDDING_DIMENSIONS.get(EMBEDDING_MODEL),
    'chunk_size_config': CHUNK_SIZE,
    'chunk_overlap': CHUNK_OVERLAP,
    'avg_chunk_size_chars': avg_chunk_size_chars,
    'provider_breakdown': provider_counts,
    'vector_db': 'pgvector',
    'collection_name': COLLECTION_NAME
}

# Save statistics
os.makedirs('data', exist_ok=True)
with open('data/ingestion_stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2)

# Print summary
print("\n" + "="*50)
print("üìä INGESTION STATISTICS")
print("="*50)
print(f"Total Sources: {stats['total_sources']}")
print(f"Documents Scraped: {stats['total_documents_scraped']}")
print(f"Total Chunks: {stats['total_chunks']}")
print(f"Embedding Model: {stats['embedding_model']}")
print(f"Embedding Dimensions: {stats['embedding_dimensions']}")
print(f"Average Chunk Size: {stats['avg_chunk_size_chars']} characters")
print("\nProvider Breakdown:")
for provider, count in provider_counts.items():
    print(f"  {provider}: {count} chunks")
print("="*50)
print("\n‚úÖ Statistics saved to data/ingestion_stats.json")


üìä INGESTION STATISTICS
Total Sources: 23
Documents Scraped: 21
Total Chunks: 294
Embedding Model: text-embedding-3-small
Embedding Dimensions: 1536
Average Chunk Size: 1506 characters

Provider Breakdown:
  AWS: 93 chunks
  Azure: 78 chunks
  GCP: 123 chunks

‚úÖ Statistics saved to data/ingestion_stats.json


## 9. Test Query (Optional)

In [12]:
# # Quick test to verify vector store works
# test_query = "How to reduce S3 storage costs?"
# results = vector_store.similarity_search(test_query, k=3)

# print(f"üîç Test Query: '{test_query}'\n")
# for i, doc in enumerate(results, 1):
#     print(f"Result {i}:")
#     print(f"Source: {doc.metadata['source']}")
#     print(f"Content: {doc.page_content[:200]}...\n")

In [13]:
# # Cell: Clear Existing Data from Vector Database

# from langchain_postgres import PGVector
# from langchain_openai import OpenAIEmbeddings
# import os

# # Initialize connection
# embeddings = OpenAIEmbeddings(
#     model=EMBEDDING_MODEL,
#     openai_api_key=os.getenv('OPENAI_API_KEY')
# )

# # Connect to vector store
# vector_store = PGVector(
#     embeddings=embeddings,
#     collection_name=COLLECTION_NAME,
#     connection=CONNECTION,
#     use_jsonb=True,
# )

# # Delete the collection and its stored embeddings (the underlying tables are kept)
# try:
#     vector_store.delete_collection()
#     print("‚úÖ Successfully deleted existing collection and all data")
#     print(f"   Collection '{COLLECTION_NAME}' has been removed")
# except Exception as e:
#     print(f"‚ö†Ô∏è Note: {str(e)}")
#     print("   (This is OK if collection didn't exist)")

# print("\nüîÑ Ready to re-ingest with new sources!")