# In [None]:  (Jupyter cell prompt left over from the notebook export)
import os
import re
import json
import logging
import hashlib
from typing import Dict, List, Optional, Tuple, Union, Set
from dataclasses import dataclass, field
from pathlib import Path
import asyncio
from collections import defaultdict
import numpy as np

# For PDF processing
import PyPDF2
import fitz  # PyMuPDF

# For web scraping
import requests
from bs4 import BeautifulSoup

# For text processing and embeddings
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize
# sent_tokenize needs the Punkt sentence models. Recent NLTK releases load
# them from the 'punkt_tab' resource while older ones use the pickled 'punkt'
# resource, so fetch both; a resource that is already present is not
# downloaded again.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Initialize logging.
# NOTE: basicConfig configures the root logger as an import-time side effect,
# so importing this module requests INFO level and the message format below
# (basicConfig does nothing if the root logger already has handlers).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger used by the processing code below.
logger = logging.getLogger(__name__)

@dataclass
class KnowledgeChunk:
    """A chunk of knowledge extracted from a document.

    Attributes:
        content: The chunk text.
        source: Name of the document the chunk came from.
        page_num: Page number within the source, if known.
        section: Section heading the chunk belongs to, if known.
        url: URL the chunk was fetched from, if any.
        embedding: Vector representation of ``content``, if computed.
            Excluded from ``==`` comparisons.
        metadata: Free-form extra information about the chunk.
        chunk_id: MD5 hex digest of ``content``, assigned in
            ``__post_init__``. Chunks with identical text therefore share
            the same ID regardless of their source.
    """
    content: str
    source: str
    page_num: Optional[int] = None
    section: Optional[str] = None
    url: Optional[str] = None
    # compare=False: the generated __eq__ would otherwise compare two ndarrays
    # with ==, which yields an array and raises "truth value of an array is
    # ambiguous" as soon as two chunks with embeddings are compared.
    embedding: Optional[np.ndarray] = field(default=None, compare=False)
    metadata: Dict = field(default_factory=dict)
    chunk_id: str = field(init=False)

    def __post_init__(self):
        # Generate a deterministic ID based on content
        self.chunk_id = hashlib.md5(self.content.encode()).hexdigest()

    def to_dict(self) -> Dict:
        """Convert to a JSON-serialisable dictionary for storage.

        Optional fields that are ``None`` are omitted; the embedding is
        stored as a plain list.
        """
        result = {
            "chunk_id": self.chunk_id,
            "content": self.content,
            "source": self.source,
            "metadata": self.metadata
        }

        optional_fields = {
            "page_num": self.page_num,
            "section": self.section,
            "url": self.url,
        }
        for key, value in optional_fields.items():
            if value is not None:
                result[key] = value

        if self.embedding is not None:
            result["embedding"] = self.embedding.tolist()

        return result

    @classmethod
    def from_dict(cls, data: Dict) -> 'KnowledgeChunk':
        """Create a chunk from a dictionary produced by ``to_dict``.

        Args:
            data: Mapping with at least ``content`` and ``source``.

        Returns:
            The reconstructed chunk. A stored ``chunk_id`` is kept as-is;
            when it is missing, the content-derived ID is used.

        Raises:
            KeyError: If ``content`` or ``source`` is missing.
        """
        embedding = data.get("embedding")
        if embedding is not None:
            embedding = np.array(embedding)

        chunk = cls(
            content=data["content"],
            source=data["source"],
            page_num=data.get("page_num"),
            section=data.get("section"),
            url=data.get("url"),
            embedding=embedding,
            # Copy so that later edits to the chunk's metadata do not leak
            # back into the caller's dictionary (and tolerate a stored null).
            metadata=dict(data.get("metadata") or {})
        )
        chunk.chunk_id = data.get("chunk_id", chunk.chunk_id)
        return chunk


class KnowledgeBase:
    """Manages a collection of knowledge chunks with semantic search capabilities.

    Chunks are stored by ``chunk_id`` in ``chunks`` and are additionally
    indexed by source name in ``source_to_chunks``.
    """

    def __init__(self, embedding_model: str = 'all-MiniLM-L6-v2', cache_dir: Optional[str] = None):
        """Initialize the knowledge base.

        Args:
            embedding_model: The SentenceTransformer model to use for embeddings
            cache_dir: Directory to cache extracted text and embeddings
                (created if it does not exist)
        """
        self.chunks: Dict[str, KnowledgeChunk] = {}
        self.source_to_chunks: Dict[str, List[str]] = defaultdict(list)
        self.embedding_model = SentenceTransformer(embedding_model)
        self.cache_dir = cache_dir

        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)

    def add_chunk(self, chunk: KnowledgeChunk) -> None:
        """Add a knowledge chunk to the base.

        The chunk is embedded first if it has no embedding yet. A chunk whose
        ID is already stored replaces the stored one.

        Args:
            chunk: The chunk to store.
        """
        if chunk.embedding is None:
            chunk.embedding = self.embedding_model.encode(chunk.content)

        self.chunks[chunk.chunk_id] = chunk
        self._index_by_source(chunk)

    def _index_by_source(self, chunk: KnowledgeChunk) -> None:
        """Record the chunk under its source, at most once per source."""
        source_ids = self.source_to_chunks[chunk.source]
        # Adding the same chunk again (e.g. re-processing a document) must
        # not make get_chunks_by_source return it twice.
        if chunk.chunk_id not in source_ids:
            source_ids.append(chunk.chunk_id)

    def search(self, query: str, top_k: int = 5) -> List[Tuple[KnowledgeChunk, float]]:
        """Search for chunks relevant to the query.

        Args:
            query: Free-text query to embed and compare against the chunks.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            ``(chunk, similarity)`` pairs sorted by descending similarity.
            Chunks without an embedding are skipped.
        """
        if top_k <= 0 or not self.chunks:
            return []

        query_embedding = self.embedding_model.encode(query)

        results = [
            (chunk, cosine_similarity(query_embedding, chunk.embedding))
            for chunk in self.chunks.values()
            if chunk.embedding is not None
        ]
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results[:top_k]

    def get_chunks_by_source(self, source: str) -> List[KnowledgeChunk]:
        """Get all chunks from a specific source (empty list if unknown)."""
        # .get() avoids creating an entry in the defaultdict for unknown sources
        chunk_ids = self.source_to_chunks.get(source, [])
        return [self.chunks[chunk_id] for chunk_id in chunk_ids]

    def save_to_disk(self, filepath: str) -> None:
        """Save all chunks, including embeddings, to a JSON file.

        Args:
            filepath: Destination path; an existing file is overwritten.
        """
        data = {
            "chunks": [chunk.to_dict() for chunk in self.chunks.values()]
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f)

    def load_from_disk(self, filepath: str) -> None:
        """Replace the current contents with the chunks stored in a JSON file.

        Chunks are loaded exactly as stored; missing embeddings are not
        computed here.

        Args:
            filepath: Path of a file written by ``save_to_disk``.
        """
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.chunks = {}
        self.source_to_chunks = defaultdict(list)

        for chunk_data in data["chunks"]:
            chunk = KnowledgeChunk.from_dict(chunk_data)
            self.chunks[chunk.chunk_id] = chunk
            self._index_by_source(chunk)


def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The cosine of the angle between the vectors as a built-in ``float``,
        or 0.0 when either vector has zero length (the similarity is
        undefined there and would otherwise come out as NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    # float() turns the numpy scalar into a plain Python float so the score
    # can be serialised (np.float32 is rejected by json).
    return float(np.dot(a, b) / denominator)


class DocumentProcessor:
    """Processes documents (PDF, web pages) into knowledge chunks.

    Every chunk produced is added to the knowledge base passed to the
    constructor. When that knowledge base has a ``cache_dir``, the chunks of
    each processed document are also written there as JSON and reused on the
    next run.

    NOTE: the chunking settings are not part of the cache key, so cached
    documents keep the chunking they were first processed with.
    """

    # Seconds to wait for a web server before giving up on a page
    REQUEST_TIMEOUT = 30

    def __init__(self, knowledge_base: KnowledgeBase, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize the document processor.

        Args:
            knowledge_base: KnowledgeBase to store processed chunks
            chunk_size: Maximum character length for each chunk
            chunk_overlap: Character overlap between consecutive chunks
        """
        self.knowledge_base = knowledge_base
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def process_pdf(self, pdf_path: str, extract_sections: bool = True) -> List[str]:
        """Process a PDF document into the knowledge base.

        Args:
            pdf_path: Path to the PDF file
            extract_sections: Whether to try to extract section titles

        Returns:
            List of chunk IDs added to the knowledge base
        """
        logger.info(f"Processing PDF: {pdf_path}")

        # Check if we have a cached version (keyed on the file's bytes)
        cache_file = None
        if self.knowledge_base.cache_dir:
            with open(pdf_path, 'rb') as pdf_file:
                pdf_hash = hashlib.md5(pdf_file.read()).hexdigest()
            cache_file = os.path.join(self.knowledge_base.cache_dir, f"{pdf_hash}.json")

            cached_ids = self._load_cached_chunks(cache_file, f"PDF processing for {pdf_path}")
            if cached_ids is not None:
                return cached_ids

        filename = os.path.basename(pdf_path)
        # Section assumed until the first header is recognised
        current_section = "Introduction"
        chunk_ids = []

        # Extract text using PyMuPDF (fitz); the context manager closes the
        # document even if extraction fails part-way through.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()

                # Try to identify section headers if requested
                if extract_sections:
                    current_section = self._detect_section(text, current_section)

                # Create chunks from the page text
                for chunk_text in self._chunk_text(text):
                    chunk = KnowledgeChunk(
                        content=chunk_text,
                        source=filename,
                        page_num=page_num,
                        section=current_section
                    )

                    self.knowledge_base.add_chunk(chunk)
                    chunk_ids.append(chunk.chunk_id)

        # Cache the results if caching is enabled
        if cache_file is not None:
            self._store_cached_chunks(cache_file, chunk_ids)

        return chunk_ids

    def process_webpage(self, url: str, selector: Optional[str] = None) -> List[str]:
        """Process a webpage into the knowledge base.

        Args:
            url: URL of the webpage
            selector: Optional CSS selector to extract specific content

        Returns:
            List of chunk IDs added to the knowledge base; empty if the page
            could not be fetched or contained no text
        """
        logger.info(f"Processing webpage: {url}")

        # Check if we have a cached version
        cache_file = None
        if self.knowledge_base.cache_dir:
            # The selector decides which text is extracted, so it is part of
            # the key; without one the key is the URL alone.
            cache_key = f"{url}\n{selector}" if selector else url
            url_hash = hashlib.md5(cache_key.encode()).hexdigest()
            cache_file = os.path.join(self.knowledge_base.cache_dir, f"{url_hash}.json")

            cached_ids = self._load_cached_chunks(cache_file, f"webpage processing for {url}")
            if cached_ids is not None:
                return cached_ids

        # Fetch the webpage; the timeout keeps an unresponsive server from
        # hanging the call forever (timeouts are RequestExceptions too).
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"Failed to fetch webpage {url}: {e}")
            return []

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract the title first: <title> lives inside <head>, which is
        # removed together with the other unwanted elements below.
        title_element = soup.find('title')
        title = title_element.get_text(strip=True) if title_element else ""
        if not title:
            title = url

        # Remove unwanted elements
        for element in soup(['script', 'style', 'nav', 'footer', 'head']):
            element.decompose()

        # separator=' ' keeps the text of adjacent elements apart; without it
        # the stripped fragments are concatenated directly ("Helloworld").
        if selector:
            # Extract content based on selector if provided
            content_elements = soup.select(selector)
            content = ' '.join(
                element.get_text(separator=' ', strip=True) for element in content_elements
            )
        else:
            # Extract main content
            main_content = soup.find('main') or soup.find('article') or soup.find('body') or soup
            content = main_content.get_text(separator=' ', strip=True)

        # Create chunks
        chunk_ids = []
        for chunk_text in self._chunk_text(content):
            chunk = KnowledgeChunk(
                content=chunk_text,
                source=title,
                url=url
            )

            self.knowledge_base.add_chunk(chunk)
            chunk_ids.append(chunk.chunk_id)

        if not chunk_ids:
            # Nothing is cached here so that a later run fetches the page again
            logger.warning(f"No text content extracted from {url}")
            return chunk_ids

        # Cache the results if caching is enabled
        if cache_file is not None:
            self._store_cached_chunks(cache_file, chunk_ids)

        return chunk_ids

    def _load_cached_chunks(self, cache_file: str, description: str) -> Optional[List[str]]:
        """Add the chunks stored in a cache file to the knowledge base.

        Args:
            cache_file: Path of the JSON cache file.
            description: What is being loaded, used in the log message.

        Returns:
            The IDs of the chunks added, or None when the file does not exist
            or cannot be used (the caller then processes the document afresh).
        """
        if not os.path.exists(cache_file):
            return None

        logger.info(f"Loading cached {description}")
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
            # Build every chunk before adding any, so a damaged file cannot
            # leave a partial set of chunks in the knowledge base.
            chunks = [KnowledgeChunk.from_dict(chunk_data) for chunk_data in cached_data]
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as e:
            logger.warning(f"Ignoring unusable cache file {cache_file}: {e}")
            return None

        chunk_ids = []
        for chunk in chunks:
            self.knowledge_base.add_chunk(chunk)
            chunk_ids.append(chunk.chunk_id)
        return chunk_ids

    def _store_cached_chunks(self, cache_file: str, chunk_ids: List[str]) -> None:
        """Write the given chunks (looked up by ID) to a JSON cache file."""
        chunks_data = [self.knowledge_base.chunks[chunk_id].to_dict() for chunk_id in chunk_ids]
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(chunks_data, f)

    @staticmethod
    def _detect_section(text: str, current_section: str) -> str:
        """Return the last section header found in the text, else the current one.

        Simple heuristic for section headers: a short line that is all caps
        or title case and either contains no digits or starts with a
        number followed by a dot ("3. Results").
        """
        for line in text.split('\n'):
            line = line.strip()
            if not 3 < len(line) < 100:
                continue
            if not (line.isupper() or line.istitle()):
                continue
            if not any(c.isdigit() for c in line) or re.match(r'^\d+\.\s+', line):
                current_section = line
        return current_section

    def _chunk_text(self, text: str) -> List[str]:
        """Split text into overlapping chunks of at most ``chunk_size`` characters.

        Text is split on sentence boundaries; consecutive chunks share up to
        ``chunk_overlap`` characters of trailing sentences. Blank text yields
        no chunks, and text that already fits is returned unchanged as a
        single chunk.
        """
        if not text.strip():
            return []
        if len(text) <= self.chunk_size:
            return [text]

        max_len = max(1, self.chunk_size)

        # Split text into sentences; a "sentence" longer than a whole chunk
        # (e.g. a table without punctuation) is cut into chunk-sized pieces
        # so that no chunk can exceed the limit.
        pieces = []
        for sentence in sent_tokenize(text):
            for start in range(0, len(sentence), max_len):
                pieces.append(sentence[start:start + max_len])

        chunks = []
        current_chunk = []
        current_length = 0  # length of ' '.join(current_chunk)

        for piece in pieces:
            added = len(piece) + (1 if current_chunk else 0)

            # If adding this piece would exceed chunk size, save current chunk and start a new one
            if current_chunk and current_length + added > max_len:
                chunks.append(' '.join(current_chunk))

                # Carry over trailing sentences as overlap, but never so many
                # that the new chunk could not also hold the incoming piece.
                budget = min(self.chunk_overlap, max_len - len(piece) - 1)
                current_chunk, current_length = self._overlap_tail(current_chunk, budget)
                added = len(piece) + (1 if current_chunk else 0)

            current_chunk.append(piece)
            current_length += added

        # Add the last chunk if there's anything left
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    @staticmethod
    def _overlap_tail(pieces: List[str], budget: int) -> Tuple[List[str], int]:
        """Return the longest run of trailing pieces whose joined length fits the budget.

        Returns:
            The selected pieces in their original order and the length of
            their space-joined text.
        """
        tail = []
        tail_length = 0
        for piece in reversed(pieces):
            extra = len(piece) + (1 if tail else 0)
            if tail_length + extra > budget:
                break
            tail.append(piece)
            tail_length += extra
        tail.reverse()
        return tail, tail_length


class ManualIntegration:
    """Integrates manuals and documentation into agent knowledge.

    Each chunk loaded through ``load_manuals`` is tagged with the manual type
    it was listed under (``metadata["manual_type"]``), so that consumers can
    tell which topic a piece of knowledge belongs to.
    """

    def __init__(self, knowledge_base: Optional[KnowledgeBase] = None, cache_dir: str = "./knowledge_cache"):
        """Initialize the manual integration system.

        Args:
            knowledge_base: Optional existing knowledge base to use
            cache_dir: Directory to cache extracted text and embeddings;
                only used when a new knowledge base is created here
        """
        if knowledge_base is None:
            knowledge_base = KnowledgeBase(cache_dir=cache_dir)
        self.knowledge_base = knowledge_base
        self.document_processor = DocumentProcessor(self.knowledge_base)

    async def load_manuals(self, manual_sources: Dict[str, Union[str, List[str]]]) -> None:
        """Load manuals from various sources.

        Sources starting with ``http://`` or ``https://`` are treated as web
        pages, everything else as a file path.

        NOTE(review): the document processors are called synchronously inside
        the coroutines, so the sources are processed one after another and
        the event loop is blocked while each one runs.

        Args:
            manual_sources: Dictionary mapping manual types to file paths or URLs
                Example: {"linux_commands": "/path/to/manual.pdf", "web_docs": ["https://example.com/docs"]}
        """
        tasks = []

        for manual_type, sources in manual_sources.items():
            # A single source may be given as a bare string
            if isinstance(sources, str):
                sources = [sources]

            for source in sources:
                if source.startswith(('http://', 'https://')):
                    tasks.append(self._process_web_source(manual_type, source))
                else:
                    tasks.append(self._process_file_source(manual_type, source))

        await asyncio.gather(*tasks)
        logger.info(f"Loaded {len(self.knowledge_base.chunks)} knowledge chunks from manuals")

    async def _process_web_source(self, manual_type: str, url: str) -> None:
        """Process a web page and tag its chunks with the manual type."""
        chunk_ids = self.document_processor.process_webpage(url)
        self._tag_chunks(chunk_ids, manual_type)

    async def _process_file_source(self, manual_type: str, file_path: str) -> None:
        """Process a file (PDF only) and tag its chunks with the manual type."""
        if file_path.lower().endswith('.pdf'):
            chunk_ids = self.document_processor.process_pdf(file_path)
            self._tag_chunks(chunk_ids, manual_type)
        else:
            logger.warning(f"Unsupported file type: {file_path}")

    def _tag_chunks(self, chunk_ids: List[str], manual_type: str) -> None:
        """Record the manual type in the metadata of the given chunks.

        A chunk reached through several manual types keeps the last one.
        """
        for chunk_id in chunk_ids:
            chunk = self.knowledge_base.chunks.get(chunk_id)
            if chunk is not None:
                chunk.metadata["manual_type"] = manual_type

    def query_knowledge(self, query: str, top_k: int = 5) -> List[Dict]:
        """Query the knowledge base for relevant information.

        Args:
            query: The query string
            top_k: Number of top results to return

        Returns:
            List of relevant knowledge chunks with similarity scores, best
            match first. Each entry has the keys ``content``, ``source``,
            ``page``, ``section``, ``similarity``, ``url`` and
            ``manual_type`` (None for chunks that were never tagged).
        """
        results = self.knowledge_base.search(query, top_k=top_k)
        return [
            {
                "content": chunk.content,
                "source": chunk.source,
                "page": chunk.page_num,
                "section": chunk.section,
                # Plain float so the result can be serialised as JSON
                "similarity": float(score),
                "url": chunk.url,
                "manual_type": chunk.metadata.get("manual_type")
            }
            for chunk, score in results
        ]

    def save_knowledge_base(self, filepath: str) -> None:
        """Save the knowledge base to disk."""
        self.knowledge_base.save_to_disk(filepath)

    def load_knowledge_base(self, filepath: str) -> None:
        """Load the knowledge base from disk, replacing its current contents."""
        self.knowledge_base.load_from_disk(filepath)


class AgentKnowledgeProvider:
    """Provides knowledge to agents based on their role and current task."""

    def __init__(self, manual_integration: ManualIntegration):
        """Initialize the agent knowledge provider.

        Args:
            manual_integration: The manual integration system
        """
        self.manual_integration = manual_integration
        # role -> list of knowledge topics configured for that role
        self.role_knowledge_mapping = {}

    def configure_role_knowledge(self, role: str, knowledge_topics: List[str]) -> None:
        """Configure which knowledge topics are relevant for each agent role.

        Args:
            role: Agent role (e.g., "programmer", "project_manager")
            knowledge_topics: List of knowledge topics relevant for this role
        """
        self.role_knowledge_mapping[role] = knowledge_topics

    def get_knowledge_for_task(self, role: str, task_description: str, max_chunks: int = 5) -> Dict:
        """Get relevant knowledge for a specific task and agent role.

        When topics have been configured for the role, results that carry a
        ``manual_type`` outside those topics are left out. Results without a
        ``manual_type`` are always kept, as are all results for roles that
        have no topics configured.

        Args:
            role: Agent role (e.g., "programmer", "project_manager")
            task_description: Description of the current task
            max_chunks: Maximum number of knowledge chunks to return

        Returns:
            Dictionary with relevant knowledge organized by source; each entry
            has the keys ``content``, ``page``, ``section``, ``similarity``
            and ``url``
        """
        if max_chunks <= 0:
            return {}

        allowed_topics = self.role_knowledge_mapping.get(role)

        top_k = max_chunks
        if allowed_topics:
            # Rank every stored chunk so that filtering by topic cannot leave
            # fewer than max_chunks results while matching chunks exist.
            top_k = max(max_chunks, len(self.manual_integration.knowledge_base.chunks))

        results = self.manual_integration.query_knowledge(task_description, top_k=top_k)

        # Organize by source for better context
        organized_knowledge = defaultdict(list)
        kept = 0
        for result in results:
            topic = result.get("manual_type")
            if allowed_topics and topic is not None and topic not in allowed_topics:
                continue

            organized_knowledge[result["source"]].append({
                "content": result["content"],
                "page": result["page"],
                "section": result["section"],
                "similarity": result["similarity"],
                "url": result.get("url")
            })

            kept += 1
            if kept >= max_chunks:
                break

        return dict(organized_knowledge)

    def create_knowledge_prompt(self, role: str, task_description: str, max_chunks: int = 5) -> str:
        """Create a prompt with relevant knowledge for the agent.

        Args:
            role: Agent role
            task_description: Description of the current task
            max_chunks: Maximum number of knowledge chunks to include

        Returns:
            Formatted prompt with relevant knowledge, grouped by source; only
            the heading when nothing relevant was found
        """
        knowledge = self.get_knowledge_for_task(role, task_description, max_chunks=max_chunks)

        prompt_parts = ["### Relevant Knowledge\n"]

        for source, chunks in knowledge.items():
            prompt_parts.append(f"\n## From {source}:\n")

            for chunk in chunks:
                # Location details are optional: web chunks have no page,
                # and some chunks have no section.
                section = f" - Section: {chunk['section']}" if chunk.get('section') else ""
                page = f" - Page: {chunk['page']}" if chunk.get('page') is not None else ""

                prompt_parts.append(f"{section}{page}\n{chunk['content']}\n")

        return "\n".join(prompt_parts)


# Example usage
async def main():
    """Demonstrate the full flow: load manuals, persist them, build a prompt.

    The manual paths below are placeholders and must be replaced with real
    files before this example can run.
    """
    # Set up the integration layer with an on-disk cache
    integration = ManualIntegration(cache_dir="./knowledge_cache")

    # Manual type -> one source or a list of sources (file paths or URLs)
    sources = {
        "linux_commands": "/path/to/linux_manual.pdf",
        "programming_guides": [
            "/path/to/python_guide.pdf",
            "https://docs.python.org/3/",
        ],
    }
    await integration.load_manuals(sources)

    # Persist everything that was loaded so later runs can reuse it
    integration.save_knowledge_base("./knowledge_base.json")

    # Hand the loaded knowledge to the per-role provider
    provider = AgentKnowledgeProvider(integration)

    # Topics each role is interested in
    role_topics = (
        ("programmer", ["linux_commands", "programming_guides"]),
        ("project_manager", ["project_management", "team_coordination"]),
    )
    for role_name, topics in role_topics:
        provider.configure_role_knowledge(role_name, topics)

    # Build and show the knowledge prompt for a sample programmer task
    task = "Need to optimize the performance of a Python script that processes large log files"
    print(provider.create_knowledge_prompt("programmer", task))

# Run the example only when this file is executed as a script, not on import.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is
# already running (e.g. inside a Jupyter kernel); there, `await main()` would
# be needed instead.
if __name__ == "__main__":
    asyncio.run(main())