In [18]:
!pip install --upgrade --quiet transformers torch torchvision PyPDF2 bs4

In [19]:
import os
from kaggle_secrets import UserSecretsClient
# Read the Hugging Face token from the Kaggle secret store and export it,
# together with the account name, as environment variables
# (presumably picked up by the Hugging Face libraries when fetching models).
user_secrets = UserSecretsClient()

os.environ.update({
    "HF_TOKEN": user_secrets.get_secret("HF_TOKEN"),
    "HF_USERNAME": "wgqme",
})

In [23]:
"""
Local LLM Deployment for Multi-Agent System
- Implements lightweight agents using Phi-4-mini (the default) or similar locally deployed LLMs
- Supports both Project Manager and Programmer agent roles
- Integrates with knowledge bases (PDF manuals or web pages)
"""

import asyncio
import json
import logging
import os
import re
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Union

import PyPDF2
import requests
import torch
from bs4 import BeautifulSoup
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Configure the root logger (INFO level, timestamped records) and create the
# logger used throughout this module.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Agent types and roles
class AgentRole(str, Enum):
    """Roles an agent can be assigned.

    The ``str`` mix-in makes every member compare equal to its plain string
    value (e.g. ``AgentRole.PROGRAMMER == "programmer"``).
    """
    PROJECT_MANAGER = "project_manager"
    PROGRAMMER = "programmer"
    SPECIALIST = "specialist"

class ExpertiseArea(str, Enum):
    """Areas of technical expertise.

    Used as the element type of ``Task.required_expertise``. The ``str``
    mix-in makes every member compare equal to its plain string value.
    """
    PYTHON = "python"
    JAVASCRIPT = "javascript"
    DEVOPS = "devops"
    DATABASE = "database"
    FRONTEND = "frontend"
    BACKEND = "backend"
    
class MessageType(str, Enum):
    """Kinds of message that can be exchanged.

    Used as the type of ``Message.message_type``. The ``str`` mix-in makes
    every member compare equal to its plain string value.
    """
    TASK_ASSIGNMENT = "task_assignment"
    STATUS_UPDATE = "status_update"
    TECHNICAL_QUERY = "technical_query"
    REVIEW_REQUEST = "review_request"
    LEARNING_UPDATE = "learning_update"

@dataclass
class Task:
    """A unit of work described by an id, a title and a free-text description."""
    id: str
    title: str
    description: str
    # Expertise areas the task calls for.
    required_expertise: List[ExpertiseArea]
    # NOTE(review): whether a larger or smaller number means "more urgent"
    # is not established in this file -- confirm with the scheduling code.
    priority: int = 1
    # Free-form status string; a new task starts as "pending".
    status: str = "pending"
    # Presumably an agent identifier; None until the task is assigned -- confirm.
    assigned_to: Optional[str] = None
    
@dataclass
class Message:
    """A message passed from one agent to another.

    ``sender`` and ``recipient`` are plain strings, ``message_type`` says what
    kind of message this is and ``content`` carries the payload.
    """
    sender: str
    recipient: str
    message_type: MessageType
    content: Dict[str, Any]
    # Monotonic clock reading taken at construction time. This is the same
    # clock asyncio's default event loop reports through loop.time(), but
    # reading it directly does not need an event loop, so messages can also be
    # created in worker threads or before any loop is running.
    timestamp: float = field(default_factory=time.monotonic)

@dataclass
class Thought:
    """Internal thought representation for agent's reasoning process.

    The thought text lives in ``content``; ``confidence`` is the score
    attached to it and ``related_command`` optionally names a command the
    thought refers to.
    """
    thought_type: str  # Observation, Analysis, Decision, Learning
    content: str
    confidence: float
    related_command: Optional[str] = None
    # Monotonic clock reading taken at construction time. This is the same
    # clock asyncio's default event loop reports through loop.time(), but
    # reading it directly does not need an event loop, so thoughts can also be
    # created in worker threads or before any loop is running.
    timestamp: float = field(default_factory=time.monotonic)

    def to_dict(self) -> Dict[str, Any]:
        """Return the thought's fields as a plain dictionary."""
        return {
            "thought_type": self.thought_type,
            "content": self.content,
            "confidence": self.confidence,
            "related_command": self.related_command,
            "timestamp": self.timestamp
        }

class LocalLLMManager:
    """Manages loading and inference for local LLMs.

    The model is loaded lazily: either explicitly through ``load_model`` or on
    the first ``generate`` call. Loading and inference are blocking calls, so
    both are pushed to the event loop's default executor.
    """

    # First decimal number (optionally followed by "%") in a confidence line,
    # e.g. "0.9" in "0.9 (fairly sure)" or "90" + "%" in "90%".
    _CONFIDENCE_PATTERN = re.compile(r"([-+]?(?:\d+(?:\.\d*)?|\.\d+))\s*(%)?")

    def __init__(self, model_name: str = "microsoft/Phi-4-mini-instruct",
                 device: Optional[str] = None):
        """
        Initialize the LLM manager

        Args:
            model_name: HuggingFace model identifier
            device: Device to run the model on (None for auto-detection)
        """
        self.model_name = model_name
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Initializing LocalLLMManager with {model_name} on {self.device}")

        # Nothing is loaded yet; these are filled in by _load_model_sync()
        self.tokenizer = None
        self.model = None
        self.pipe = None

    async def load_model(self):
        """Load the model and tokenizer without blocking the event loop"""
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(None, self._load_model_sync)
        logger.info(f"Model {self.model_name} loaded successfully")

    def _load_model_sync(self):
        """Synchronous model loading function"""
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            device_map=self.device
        )
        # No explicit device argument here: the model has already been placed
        # through device_map above.
        text_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
        )
        # Publish the pipeline last: generate() treats a non-None pipe as
        # "ready", so it must never observe a half-initialized manager.
        self.tokenizer = tokenizer
        self.model = model
        self.pipe = text_pipeline

    async def generate(self, prompt: str, max_tokens: int = 512,
                      temperature: float = 0.7) -> str:
        """
        Generate text using the loaded model

        Args:
            prompt: Input prompt for the model
            max_tokens: Maximum number of tokens to generate
            temperature: Temperature for sampling; a value <= 0 switches to
                greedy (non-sampling) decoding

        Returns:
            Generated text response (without the prompt), stripped of
            surrounding whitespace
        """
        if self.pipe is None:
            await self.load_model()

        # return_full_text=False makes the pipeline return only the newly
        # generated text. Cutting the prompt off by character count is fragile
        # because the decoded text need not reproduce the prompt verbatim.
        generation_kwargs: Dict[str, Any] = {
            "max_new_tokens": max_tokens,
            "return_full_text": False,
        }
        if temperature > 0:
            generation_kwargs.update(do_sample=True, temperature=temperature, top_p=0.95)
        else:
            # Sampling requires a strictly positive temperature.
            generation_kwargs["do_sample"] = False

        # Run in a separate thread to avoid blocking
        loop = asyncio.get_running_loop()
        outputs = await loop.run_in_executor(
            None,
            lambda: self.pipe(prompt, **generation_kwargs)
        )
        return outputs[0]["generated_text"].strip()

    @classmethod
    def _parse_thought_response(cls, response: str) -> Tuple[str, float]:
        """Extract ``(thought, confidence)`` from a model response.

        Lines are stripped before matching because the model tends to mirror
        the prompt's indentation. Labels are matched case-insensitively and
        the first ``Thought:`` / ``Confidence:`` line wins, so trailing text
        the model keeps generating cannot overwrite the answer. The
        confidence is the first number on its line (a trailing ``%`` divides
        it by 100) clamped to ``[0.0, 1.0]``; it is 0.0 when no number is
        found.
        """
        thought_content = ""
        confidence = 0.0
        confidence_seen = False

        for raw_line in response.splitlines():
            line = raw_line.strip()
            label = line.lower()
            if not thought_content and label.startswith("thought:"):
                thought_content = line[len("thought:"):].strip()
            elif not confidence_seen and label.startswith("confidence:"):
                confidence_seen = True
                match = cls._CONFIDENCE_PATTERN.search(line[len("confidence:"):])
                if match:
                    value = float(match.group(1))
                    if match.group(2):
                        value /= 100.0
                    confidence = min(1.0, max(0.0, value))

        return thought_content, confidence

    async def generate_thought(self,
                             agent_role: "AgentRole",
                             thought_type: str,
                             context: str,
                             confidence_threshold: float = 0.7) -> Optional["Thought"]:
        """
        Generate an internal thought with confidence estimation

        Args:
            agent_role: Role of the agent generating the thought
            thought_type: Type of thought to generate
            context: Context information for thought generation
            confidence_threshold: Minimum confidence threshold

        Returns:
            A Thought object or None if confidence is too low or no thought
            text could be parsed from the response
        """
        # Create a prompt for thought generation with confidence estimation
        prompt = f"""
        You are a {agent_role.value} agent generating internal thoughts.
        
        Context:
        {context}
        
        Generate a {thought_type} thought about this situation.
        Also estimate your confidence in this thought from 0.0 to 1.0.
        
        Format your response as:
        Thought: [your thought here]
        Confidence: [confidence score between 0.0 and 1.0]
        """

        response = await self.generate(prompt, max_tokens=200, temperature=0.7)
        thought_content, confidence = self._parse_thought_response(response)

        # Create and return thought if confidence is above threshold
        if confidence >= confidence_threshold and thought_content:
            return Thought(
                thought_type=thought_type,
                content=thought_content,
                confidence=confidence
            )
        return None

class KnowledgeBase:
    """Manages loading and retrieval of knowledge from manuals.

    Loaded documents are kept as plain text in two dictionaries keyed by a
    caller-chosen name: ``pdf_contents`` and ``web_contents``.
    """

    def __init__(self):
        self.pdf_contents = {}
        self.web_contents = {}

    async def load_pdf(self, name: str, file_path: str):
        """Load content from a PDF file and store it under ``name``"""
        loop = asyncio.get_running_loop()
        content = await loop.run_in_executor(None, self._read_pdf, file_path)
        self.pdf_contents[name] = content
        logger.info(f"Loaded PDF knowledge base: {name}")

    def _read_pdf(self, file_path: str) -> str:
        """Read content from a PDF file synchronously.

        Returns the text of every page, each followed by a newline.
        """
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Defensive: a page for which no text comes back counts as empty
            # instead of breaking the concatenation.
            page_texts = [(page.extract_text() or "") + "\n" for page in pdf_reader.pages]
        return "".join(page_texts)

    async def load_webpage(self, name: str, url: str):
        """Load content from a webpage and store it under ``name``"""
        loop = asyncio.get_running_loop()
        content = await loop.run_in_executor(None, self._read_webpage, url)
        self.web_contents[name] = content
        logger.info(f"Loaded web knowledge base: {name}")

    def _read_webpage(self, url: str) -> str:
        """Read content from a webpage synchronously.

        Raises:
            requests.RequestException: if the request fails, times out or the
                server answers with an HTTP error status.
        """
        # A timeout keeps a stalled server from blocking the worker thread
        # forever; raise_for_status() keeps error pages (404, 500, ...) from
        # being stored as knowledge.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        # Get text
        text = soup.get_text()

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Remove blank lines
        text = '\n'.join(chunk for chunk in chunks if chunk)

        return text

    @staticmethod
    def _split_chunks(content: str, max_chars: int) -> List[str]:
        """Split ``content`` into searchable chunks.

        Blank-line separated paragraphs are kept as they are when they fit in
        ``max_chars``. Longer paragraphs are cut into runs of whole lines of
        at most ``max_chars`` characters; a single line longer than that stays
        one chunk. This matters because text produced by ``_read_webpage``
        contains no blank lines at all, so paragraph splitting alone would
        yield the entire page as a single chunk.
        """
        chunks: List[str] = []
        for paragraph in content.split('\n\n'):
            if len(paragraph) <= max_chars:
                chunks.append(paragraph)
                continue

            current: List[str] = []
            current_size = 0
            for line in paragraph.split('\n'):
                # +1 accounts for the newline that joins the lines
                if current and current_size + len(line) + 1 > max_chars:
                    chunks.append('\n'.join(current))
                    current, current_size = [], 0
                current.append(line)
                current_size += len(line) + 1
            if current:
                chunks.append('\n'.join(current))
        return chunks

    async def query_knowledge(self, query: str, top_k: int = 3,
                              max_chunk_chars: int = 1000) -> List[str]:
        """
        Simple keyword-based knowledge retrieval

        Each chunk is scored by the number of query terms that occur in it
        (case-insensitive substring match); chunks with equal scores keep
        their load order (PDFs first, then web pages).

        Args:
            query: Query string to search for
            top_k: Number of top results to return; values <= 0 return nothing
            max_chunk_chars: Paragraphs longer than this are split into
                line-aligned chunks before scoring

        Returns:
            List of relevant knowledge chunks, each prefixed with its source
        """
        if top_k <= 0:
            return []

        # Very simple keyword-based search
        query_terms = query.lower().split()
        if not query_terms:
            return []

        results = []
        sources = (("PDF", self.pdf_contents), ("Web", self.web_contents))
        for label, store in sources:
            for name, content in store.items():
                for chunk in self._split_chunks(content, max_chunk_chars):
                    haystack = chunk.lower()  # lower-case once per chunk, not per term
                    score = sum(1 for term in query_terms if term in haystack)
                    if score > 0:
                        results.append((score, f"[{label}: {name}] {chunk}"))

        # Sort by relevance score (stable, so ties keep insertion order)
        results.sort(key=lambda item: item[0], reverse=True)
        return [text for _, text in results[:top_k]]

class ThoughtMemory:
    """Manages an agent's internal thoughts and memory.

    ``thoughts`` is kept in insertion order at all times, so "the last N
    thoughts" really are the N most recently added ones.
    """

    def __init__(self, max_thoughts: int = 1000):
        self.thoughts = []
        self.max_thoughts = max_thoughts
        self.thought_patterns = {}  # Patterns discovered in thoughts

    def add_thought(self, thought: "Thought"):
        """Add a new thought to memory, evicting the weakest one when full.

        When the memory exceeds ``max_thoughts`` the thought with the lowest
        confidence is dropped (the oldest one among equals). The new thought
        itself may be the one that is dropped. The remaining thoughts keep
        their insertion order.
        """
        self.thoughts.append(thought)

        capacity = max(0, self.max_thoughts)
        while len(self.thoughts) > capacity:
            # Delete by index instead of re-sorting the list: sorting in place
            # would destroy the chronological order other methods rely on.
            weakest = min(
                range(len(self.thoughts)),
                key=lambda i: (self.thoughts[i].confidence, self.thoughts[i].timestamp),
            )
            del self.thoughts[weakest]

    def get_relevant_thoughts(self, context: str, max_results: int = 5) -> List["Thought"]:
        """Get thoughts relevant to the given context.

        Relevance is the number of whitespace-separated, lower-cased terms a
        thought shares with ``context``, weighted by the thought's confidence.
        Thoughts without any shared term are left out.
        """
        context_terms = set(context.lower().split())
        scored_thoughts = []

        for thought in self.thoughts:
            overlap = len(context_terms.intersection(thought.content.lower().split()))
            if overlap > 0:
                scored_thoughts.append((overlap * thought.confidence, thought))

        scored_thoughts.sort(key=lambda item: item[0], reverse=True)
        return [thought for _, thought in scored_thoughts[:max_results]]

    def get_recent_thoughts(self, thought_type: Optional[str] = None,
                           limit: int = 5) -> List["Thought"]:
        """Get most recent thoughts (newest first), optionally filtered by type"""
        filtered = self.thoughts
        if thought_type:
            filtered = [t for t in filtered if t.thought_type == thought_type]

        return sorted(filtered, key=lambda t: t.timestamp, reverse=True)[:limit]

    def analyze_patterns(self):
        """Identify and store patterns in the thought history.

        For every thought type, each term longer than three characters is
        weighted by the confidence of the thoughts it appears in; terms whose
        total weight exceeds 1.0 are stored in ``thought_patterns``. The
        mapping is rebuilt from scratch so that types no longer present in
        memory do not keep stale patterns.
        """
        # This is a simplified implementation
        weights_by_type: Dict[str, Dict[str, float]] = {}
        for thought in self.thoughts:
            term_freq = weights_by_type.setdefault(thought.thought_type, {})
            for term in thought.content.lower().split():
                if len(term) > 3:  # Ignore short terms
                    term_freq[term] = term_freq.get(term, 0) + thought.confidence

        self.thought_patterns = {
            thought_type: {
                term: freq for term, freq in term_freq.items()
                if freq > 1.0  # Minimum frequency threshold
            }
            for thought_type, term_freq in weights_by_type.items()
        }

    def get_thought_summary(self, window: int = 50) -> Dict[str, Any]:
        """Get a summary of current thoughts and patterns.

        Args:
            window: How many of the most recently added thoughts to consider

        Returns:
            Per thought type: the number of thoughts, how many have a
            confidence above 0.8, the average confidence and up to five
            stored pattern terms.
        """
        # A plain [-window:] slice would return everything for window == 0
        recent = self.thoughts[-window:] if window > 0 else []

        by_type: Dict[str, list] = {}
        for thought in recent:
            by_type.setdefault(thought.thought_type, []).append(thought)

        summaries = {}
        for thought_type, thoughts in by_type.items():
            high_confidence = [t for t in thoughts if t.confidence > 0.8]
            summaries[thought_type] = {
                "count": len(thoughts),
                "high_confidence_count": len(high_confidence),
                "avg_confidence": sum(t.confidence for t in thoughts) / max(1, len(thoughts)),
                "patterns": list(self.thought_patterns.get(thought_type, {}).keys())[:5]
            }

        return summaries

manager = LocalLLMManager()
manager.load_model()
response = await manager.generate("""
        You are a programmer agent generating internal thoughts.
        
        Context:
        You are supposed to find out how many hugepages have been assigned in total on a Linux system.
        The command line you generate should give out a single number of the result.
        
        Generate a command line thought about this situation.
        Also estimate your confidence in this thought from 0.0 to 1.0.
        
        Format your response as:
        Thought: [your thought here]
        Confidence: [confidence score between 0.0 and 1.0]
        """)
print(response)

  manager.load_model()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda


My thought is to use the `cat /proc/meminfo` command to check the total number of hugepages assigned.
        The line in the output that contains "Hugepagesize" will give the size of each hugepage, and the line that contains "Hugepagesize x Hugepages:" will give the total number of hugepages.
        I am confident in this thought because it directly accesses the necessary information from the `/proc/meminfo` file, which is a reliable source for system memory information on Linux systems.
        Confidence: 0.9

Language: Python
Code:
import subprocess
import re

def get_hugepages_info():
    # Run the 'cat /proc/meminfo' command and capture the output
    output = subprocess.check_output(['cat', '/proc/meminfo']).decode('utf-8')

    # Use regular expressions to find the relevant lines in the output
    hugepagesize_line = re.search(r'Hugepagesize.*\n', output)
    hugepagesize = int(hugepagesize_line.group().split()[1]) * 1024
    hugepages_line = re.search(r'Hugepagesize.*Hugepage