# In [None]:
import paramiko
import asyncio
import time
from typing import AsyncGenerator, List, Dict, Optional, Tuple
from dataclasses import dataclass
import torch

@dataclass
class SSHState:
    """Represents the current state of an SSH session.

    ``command_history`` defaults to a fresh empty list for every instance, and
    ``last_interaction_time`` defaults to the construction time unless the
    caller supplies a non-zero timestamp of their own.
    """
    buffer: str = ""                              # Current output buffer
    last_interaction_time: float = 0.0            # Timestamp of last interaction (0 means "stamp at creation")
    idle_time: float = 0.0                        # Time with no new output
    command_history: Optional[List[Dict]] = None  # Previous commands and outputs
    pending_decision: bool = False                # Whether LLM is currently making a decision

    def __post_init__(self):
        """Fill in the per-instance defaults that cannot be written as literals."""
        # None stands in for "no history yet": dataclasses reject a literal []
        # default, so the list is created here, once per instance.
        if self.command_history is None:
            self.command_history = []
        # Only stamp the creation time when the caller did not pass a timestamp;
        # previously an explicit value was silently overwritten.
        if not self.last_interaction_time:
            self.last_interaction_time = time.time()


class SSHConnection_Paramiko:
    def __init__(self, llm_manager: 'LocalLLMManager'):
        """Open the SSH connection and store the LLM manager used for decisions.

        The connection parameters (``ssh_jump_dest``, ``ssh_username``,
        ``private_key`` and ``channel``) are not arguments: they are looked up
        as module-level names and must exist before this class is instantiated.

        Args:
            llm_manager: Object whose ``generate`` coroutine is awaited when a
                decision about the session is needed.
        """
        ssh_client = paramiko.SSHClient()
        self.client = ssh_client

        # NOTE(review): AutoAddPolicy accepts host keys that are not already
        # known - confirm this is acceptable for the target environment.
        ssh_client.set_missing_host_key_policy(paramiko.AutoAddPolicy())

        connection_settings = {
            "hostname": ssh_jump_dest,
            "username": ssh_username,
            "pkey": private_key,
            "sock": channel,
        }
        ssh_client.connect(**connection_settings)

        self.llm = llm_manager
        # Number of previous commands to include in the LLM context
        self.max_history = 3
        
    async def stream_output(self, shell) -> AsyncGenerator[Tuple[str, bool], None]:
        """
        Stream output from shell with metadata
        Returns: (data, has_data) tuple
        """
        while True:
            has_data = False
            data = ""
            
            if shell.recv_ready():
                data = shell.recv(4096).decode('utf-8', errors='replace')
                has_data = bool(data)
                
            yield (data, has_data)
            await asyncio.sleep(0.05)  # Shorter sleep for responsiveness
    
    def _create_llm_prompt(self, state: SSHState, current_command: str) -> str:
        """Create a prompt for the LLM with the current SSH state"""
        # Format recent command history
        history_text = ""
        for cmd in state.command_history[-self.max_history:]:
            history_text += f"Command: {cmd['command']}\nOutput: {cmd['output']}\n\n"
        
        # Build the full prompt
        prompt = f"""You are an intelligent SSH session agent. You're observing the output of a Linux terminal session.

Recent command history:
{history_text}

Current command: {current_command}
Current output buffer:
{state.buffer}

Time since last new output: {state.idle_time:.1f} seconds

Your task is to analyze this SSH session and decide what to do next. Consider:
1. Is there a prompt waiting for input? (e.g. y/n questions, password prompts, etc.)
2. Has the command completed execution?
3. Is the command running with continuous output that doesn't need interruption?
4. Has output stalled, suggesting the command is waiting for input?

Provide your reasoning and decision in the following format:
THOUGHT: Your analysis of the situation.
ACTION: [CONTINUE, RESPOND, or COMPLETE]
RESPONSE: Your suggested input (if ACTION is RESPOND)

CONTINUE means wait for more output.
RESPOND means send a specific input.
COMPLETE means the command has finished and no further interaction is needed.
"""
        return prompt
    
    def _parse_llm_response(self, response: str) -> Dict:
        """Parse the LLM's response into structured data"""
        result = {
            "thought": "",
            "action": "CONTINUE",
            "response": ""
        }
        
        # Extract the thought
        if "THOUGHT:" in response:
            thought_parts = response.split("THOUGHT:", 1)[1].split("ACTION:", 1)
            result["thought"] = thought_parts[0].strip()
        
        # Extract the action
        if "ACTION:" in response:
            action_parts = response.split("ACTION:", 1)[1].split("RESPONSE:", 1)
            action = action_parts[0].strip()
            if action in ["CONTINUE", "RESPOND", "COMPLETE"]:
                result["action"] = action
        
        # Extract the response (if any)
        if "RESPONSE:" in response and result["action"] == "RESPOND":
            result["response"] = response.split("RESPONSE:", 1)[1].strip()
        
        return result
    
    async def execute_interactive_with_llm(self, command: str) -> AsyncGenerator[Dict, None]:
        """
        Execute a command with LLM-driven interaction.

        Opens an interactive shell on ``self.client``, sends ``command`` and
        polls the shell through ``self.stream_output``. When the output goes
        idle, grows large, or appears to contain a prompt, the LLM is asked
        whether to keep waiting (CONTINUE), send input (RESPOND) or stop
        (COMPLETE). After a decision the LLM is only consulted again once new
        output has arrived or another idle interval has elapsed, so an
        unchanged buffer does not trigger a decision on every poll.

        The shell channel is closed when the generator finishes, is closed by
        the consumer, or fails.

        Args:
            command: Command line to send to the shell (a newline is appended).

        Yields:
            Status dicts keyed by ``"type"``: ``"output"`` (``data``,
            ``buffer``), ``"thinking"`` (``buffer``) and ``"decision"``
            (``thought``, ``action``, ``buffer`` and, for RESPOND, ``response``).
        """
        idle_threshold = 2.0  # seconds without output before consulting the LLM
        prompt_markers = ('[y/n]', 'password:', 'continue?', '$ ', '# ')

        shell = self.client.invoke_shell()
        # NOTE(review): the state is created per call, so command_history is
        # always empty when the prompt is built - confirm whether history is
        # meant to persist across commands.
        state = SSHState()

        try:
            # Send the initial command
            shell.send(command + '\n')
            last_output_time = time.time()

            # Throttle for repeated decisions; 0.0 makes the very first
            # decision depend on the trigger conditions alone.
            last_decision_time = 0.0
            output_since_decision = False

            # Process the output stream
            async for data, has_data in self.stream_output(shell):
                # Update state if we received data
                if has_data:
                    state.buffer += data
                    last_output_time = time.time()
                    output_since_decision = True

                    # Yield the new data
                    yield {
                        "type": "output",
                        "data": data,
                        "buffer": state.buffer
                    }

                # Update idle time
                current_time = time.time()
                state.idle_time = current_time - last_output_time

                # Trigger on silence, on substantial output, or on an obvious prompt
                buffer_len = len(state.buffer)
                triggered = state.idle_time > idle_threshold or buffer_len > 2000
                if not triggered and buffer_len > 100:
                    lowered = state.buffer.lower()
                    triggered = any(marker in lowered for marker in prompt_markers)

                # Without this gate the same unchanged buffer would re-trigger
                # on the next 50 ms poll, hammering the LLM and possibly
                # answering one prompt several times.
                fresh = (
                    output_since_decision or
                    (current_time - last_decision_time) > idle_threshold
                )

                if state.pending_decision or not (triggered and fresh):
                    continue

                state.pending_decision = True

                # Create a prompt and send to LLM
                prompt = self._create_llm_prompt(state, command)

                # Notify that we're thinking
                yield {
                    "type": "thinking",
                    "buffer": state.buffer
                }

                # Generate response from LLM
                llm_response = await self.llm.generate(prompt, max_tokens=256, temperature=0.3)
                parsed = self._parse_llm_response(llm_response)

                # Update pending flag and the re-decision throttle
                state.pending_decision = False
                last_decision_time = time.time()
                output_since_decision = False

                # Handle the decision
                if parsed["action"] == "CONTINUE":
                    # Trim a large buffer, keeping the last 1000 chars for context
                    if len(state.buffer) > 5000:
                        state.buffer = state.buffer[-1000:]

                    yield {
                        "type": "decision",
                        "thought": parsed["thought"],
                        "action": "CONTINUE",
                        "buffer": state.buffer
                    }

                elif parsed["action"] == "RESPOND":
                    # Send the response
                    response_text = parsed["response"]
                    shell.send(response_text + '\n')

                    # Update the buffer to show the response
                    state.buffer += f"\n[AGENT INPUT: {response_text}]\n"

                    yield {
                        "type": "decision",
                        "thought": parsed["thought"],
                        "action": "RESPOND",
                        "response": response_text,
                        "buffer": state.buffer
                    }

                    # Reset idle time after responding
                    last_output_time = time.time()

                elif parsed["action"] == "COMPLETE":
                    # Add to command history
                    state.command_history.append({
                        "command": command,
                        "output": state.buffer
                    })

                    yield {
                        "type": "decision",
                        "thought": parsed["thought"],
                        "action": "COMPLETE",
                        "buffer": state.buffer
                    }

                    # Exit the loop
                    break
        finally:
            # Release the channel however the generator ends.
            shell.close()

# In [None]:
class LocalLLMManager:
    """Manages loading and inference for local LLMs with optimizations for SSH interaction"""
    
    def __init__(self, model_name: str = "microsoft/phi-3-mini", device: str = None):
        """
        Set up the manager without loading any model yet.

        Args:
            model_name: HuggingFace model identifier
            device: Device to run the model on; when falsy, "cuda" is chosen
                if ``torch.cuda.is_available()`` and "cpu" otherwise
        """
        # Resolve the target device, auto-detecting only when none was given
        if device:
            resolved_device = device
        elif torch.cuda.is_available():
            resolved_device = "cuda"
        else:
            resolved_device = "cpu"

        self.model_name = model_name
        self.device = resolved_device
        logger.info(f"Initializing LocalLLMManager with {model_name} on {self.device}")

        # Tokenizer, model and pipeline stay unset until a load is performed
        self.tokenizer = self.model = self.pipe = None

        # Cache for tokenized prompts to speed up repeated calls
        self.cache = {}
        self.max_cache_size = 10
        
    async def load_model(self):
        """Load the model and tokenizer without blocking the event loop.

        The synchronous loader runs in the event loop's default executor; a
        log line is emitted once it has returned.
        """
        # Hand the blocking load to the default executor and wait for it
        await asyncio.get_event_loop().run_in_executor(None, self._load_model_sync)
        logger.info(f"Model {self.model_name} loaded successfully")
    
    def _load_model_sync(self):
        """Blocking load of the tokenizer, model and text-generation pipeline.

        Stores the results on ``self.tokenizer``, ``self.model`` and
        ``self.pipe``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # NOTE(review): the comparisons below match the exact string "cuda",
        # so a device such as "cuda:0" gets float32 and device=-1 - confirm
        # whether indexed devices are expected here.
        model_dtype = torch.float16 if self.device == "cuda" else torch.float32
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=model_dtype,
            device_map=self.device
        )

        pipeline_device = 0 if self.device == "cuda" else -1
        self.pipe = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=pipeline_device
        )
    
    async def generate(self, prompt: str, max_tokens: int = 512, 
                      temperature: float = 0.7) -> str:
        """
        Generate text using the loaded model
        
        Args:
            prompt: Input prompt for the model
            max_tokens: Maximum number of tokens to generate
            temperature: Temperature for sampling
            
        Returns:
            Generated text response
        """
        if not self.model or not self.tokenizer:
            await self.load_model()
            
        # Run in a separate thread to avoid blocking
        loop = asyncio.get_event_loop()
        response = await loop.run_in_executor(
            None, 
            lambda: self.pipe(
                prompt,
                max_new_tokens=max_tokens,
                temperature=temperature,
                do_sample=True,
                top_p=0.95,
            )[0]["generated_text"]
        )
        
        # Extract only the newly generated text
        return response[len(prompt):].strip()
    
    async def generate_for_ssh(self, state_buffer: str, command: str, 
                              history: List[Dict], idle_time: float) -> Dict:
        """
        Specialized method for SSH decision making
        
        Args:
            state_buffer: Current output buffer
            command: Currently executing command
            history: List of previous commands and outputs
            idle_time: Time with no new output
            
        Returns:
            Dictionary with decision data
        """
        # Build a specialized prompt for SSH interaction
        prompt = self._build_ssh_prompt(state_buffer, command, history, idle_time)
        
        # Generate with lower temperature for more consistent decisions
        response = await self.generate(prompt, max_tokens=256, temperature=0.3)
        
        # Parse the response into structured data
        return self._parse_ssh_response(response)
    
    def _build_ssh_prompt(self, buffer: str, command: str, 
                         history: List[Dict], idle_time: float) -> str:
        """Build a specialized prompt for SSH decision making"""
        # Format recent command history (last 2 commands)
        history_text = ""
        for cmd in history[-2:]:
            history_text += f"Command: {cmd['command']}\nOutput: {cmd['output'][:500]}...\n\n"
        
        # Truncate buffer if too large
        if len(buffer) > 3000:
            display_buffer = f"{buffer[:1000]}...[middle content omitted]...{buffer[-1000:]}"
        else:
            display_buffer = buffer
        
        # Build the full prompt
        prompt = f"""You are an intelligent SSH session agent. You're observing the output of a Linux terminal session.

Recent command history:
{history_text}

Current command: {command}
Current output buffer:
{display_buffer}

Time since last new output: {idle_time:.1f} seconds

Your task is to analyze this SSH session and decide what to do next. Consider:
1. Is there a prompt waiting for input? (e.g. y/n questions, password prompts, etc.)
2. Has the command completed execution? Look for shell prompts ($ or #) at the end.
3. Is the command running with continuous output that doesn't need interruption?
4. Has output stalled, suggesting the command is waiting for input?

Provide your reasoning and decision in the following format:
THOUGHT: Your analysis of the situation.
ACTION: [CONTINUE, RESPOND, or COMPLETE]
RESPONSE: Your suggested input (if ACTION is RESPOND)

CONTINUE means wait for more output.
RESPOND means send a specific input.
COMPLETE means the command has finished and no further interaction is needed.
"""
        return prompt
    
    def _parse_ssh_response(self, response: str) -> Dict:
        """Parse the LLM's response into structured data"""
        result = {
            "thought": "",
            "action": "CONTINUE",
            "response": ""
        }
        
        # Extract the thought
        if "THOUGHT:" in response:
            thought_parts = response.split("THOUGHT:", 1)[1].split("ACTION:", 1)
            result["thought"] = thought_parts[0].strip()
        
        # Extract the action
        if "ACTION:" in response:
            action_parts = response.split("ACTION:", 1)[1].split("RESPONSE:", 1)
            action = action_parts[0].strip()
            if action in ["CONTINUE", "RESPOND", "COMPLETE"]:
                result["action"] = action
        
        # Extract the response (if any)
        if "RESPONSE:" in response and result["action"] == "RESPOND":
            result["response"] = response.split("RESPONSE:", 1)[1].strip()
        
        return result

# In [None]:
async def run_interactive_command(command: str, llm_manager: LocalLLMManager):
    """Run a command with LLM-driven interaction and display the results.

    Creates an ``SSHConnection_Paramiko``, streams the updates produced by
    ``execute_interactive_with_llm`` to stdout and stops at the first
    COMPLETE decision. The update generator and the SSH client are closed on
    every exit path, including errors.

    Args:
        command: Command line to execute in the remote shell.
        llm_manager: LLM manager handed to the SSH connection for decisions.
    """
    ssh = SSHConnection_Paramiko(llm_manager)

    print(f"Executing: {command}")

    updates = ssh.execute_interactive_with_llm(command)
    try:
        async for update in updates:
            if update["type"] == "output":
                # Print new output as it arrives
                print(update["data"], end='', flush=True)

            elif update["type"] == "thinking":
                # Show that the agent is thinking
                print("\n[Agent is analyzing the output...]", end='', flush=True)

            elif update["type"] == "decision":
                if update["action"] == "CONTINUE":
                    # Just show that we're continuing
                    print("\n[Agent decided to wait for more output]")

                elif update["action"] == "RESPOND":
                    # Show the agent's response
                    print(f"\n[Agent is responding with: {update['response']}]")

                elif update["action"] == "COMPLETE":
                    # Show completion
                    print("\n[Agent determined the command has completed]")
                    print(f"\nThought process: {update['thought']}")
                    break
    finally:
        # Breaking out of ``async for`` leaves the generator suspended; close
        # it explicitly, then release the connection this function opened.
        try:
            await updates.aclose()
        finally:
            ssh.client.close()