# Model Serving APIs: Production-Grade PyTorch Inference System

**PyTorch Mastery Hub: Enterprise-Ready Model Deployment**

**Authors:** ML Engineering Team  
**Institution:** PyTorch Mastery Hub  
**Module:** Production Deployment & MLOps  
**Date:** August 2025

## Overview

This notebook provides a comprehensive implementation of production-grade model serving APIs for PyTorch models. We build a complete inference system with FastAPI, featuring async batch processing, authentication, monitoring, and deployment configurations optimized for enterprise environments.

## Key Objectives
1. Implement high-performance model serving with async batching
2. Build comprehensive security and authentication systems
3. Create production-ready FastAPI application with monitoring
4. Develop extensive testing and load testing frameworks
5. Generate complete deployment configurations (Docker, Kubernetes, Helm)
6. Set up monitoring and alerting infrastructure
7. Create CLI tools for deployment and management

## 1. Setup and Environment Configuration

```python
# Import required libraries
import asyncio
import time
import os
import json
import secrets
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Any, Union
from collections import defaultdict, deque
import warnings
warnings.filterwarnings('ignore')

# Core ML and data processing
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# API and web frameworks
try:
    from fastapi import FastAPI, HTTPException, Depends, Request, status
    from fastapi.security import HTTPBearer, APIKeyHeader
    from fastapi.middleware.cors import CORSMiddleware
    from fastapi.middleware.gzip import GZipMiddleware
    from fastapi.responses import JSONResponse
    from contextlib import asynccontextmanager
    from pydantic import BaseModel
    FASTAPI_AVAILABLE = True
    print("✅ FastAPI available")
except ImportError:
    FASTAPI_AVAILABLE = False
    print("⚠️ FastAPI not available - using simulation mode")
    # Mock classes for demonstration
    class BaseModel: pass
    class FastAPI: pass

# Monitoring and logging
import logging
from logging.handlers import RotatingFileHandler

# Set up comprehensive logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Create results directory structure
results_dir = Path('../../results/08_production/api_server')
results_dir.mkdir(parents=True, exist_ok=True)

# Subdirectories for organized outputs
(results_dir / 'deployment_configs').mkdir(exist_ok=True)
(results_dir / 'monitoring').mkdir(exist_ok=True)
(results_dir / 'scripts').mkdir(exist_ok=True)
(results_dir / 'logs').mkdir(exist_ok=True)

print("🚀 MODEL SERVING API SYSTEM")
print("=" * 50)
print(f"📁 Results directory: {results_dir}")
print(f"🔧 FastAPI available: {FASTAPI_AVAILABLE}")
print(f"🎯 CUDA available: {torch.cuda.is_available()}")
print("✅ Environment setup complete!")
```

## 2. Data Models and Request/Response Schemas

```python
# Pydantic models for API request/response validation
if FASTAPI_AVAILABLE:
    
    class PredictionRequest(BaseModel):
        """Request schema for single prediction."""
        image_data: Optional[str] = None  # Base64 encoded image
        tensor_data: Optional[List[List[List[float]]]] = None  # Raw tensor data
        return_probabilities: bool = True
        confidence_threshold: float = 0.5
        
        class Config:
            schema_extra = {
                "example": {
                    "tensor_data": [[[0.5] * 32] * 32] * 3,
                    "return_probabilities": True,
                    "confidence_threshold": 0.5
                }
            }
    
    class PredictionResponse(BaseModel):
        """Response schema for single prediction."""
        predicted_class: int
        class_name: str
        confidence: float
        probabilities: Optional[List[float]] = None
        inference_time_ms: float
        model_version: str
        request_id: str
        
        class Config:
            schema_extra = {
                "example": {
                    "predicted_class": 2,
                    "class_name": "corn",
                    "confidence": 0.95,
                    "probabilities": [0.02, 0.03, 0.95],
                    "inference_time_ms": 25.4,
                    "model_version": "1.0.0",
                    "request_id": "req_123456"
                }
            }
    
    class BatchPredictionRequest(BaseModel):
        """Request schema for batch prediction."""
        batch: List[PredictionRequest]
        batch_id: Optional[str] = None
        
        class Config:
            schema_extra = {
                "example": {
                    "batch": [
                        {"tensor_data": [[[0.5] * 32] * 32] * 3, "return_probabilities": True},
                        {"tensor_data": [[[0.3] * 32] * 32] * 3, "return_probabilities": True}
                    ],
                    "batch_id": "batch_001"
                }
            }
    
    class BatchPredictionResponse(BaseModel):
        """Response schema for batch prediction."""
        predictions: List[PredictionResponse]
        batch_size: int
        total_inference_time_ms: float
        batch_id: str
        
        class Config:
            schema_extra = {
                "example": {
                    "predictions": [
                        {"predicted_class": 1, "confidence": 0.89},
                        {"predicted_class": 2, "confidence": 0.94}
                    ],
                    "batch_size": 2,
                    "total_inference_time_ms": 45.2,
                    "batch_id": "batch_001"
                }
            }
    
    class HealthResponse(BaseModel):
        """Health check response schema."""
        status: str
        timestamp: str
        version: str
        uptime_seconds: float
        model_loaded: bool
        gpu_available: bool
        queue_length: int
        
        class Config:
            schema_extra = {
                "example": {
                    "status": "healthy",
                    "timestamp": "2025-08-13T10:30:00Z",
                    "version": "1.0.0",
                    "uptime_seconds": 3600,
                    "model_loaded": True,
                    "gpu_available": False,
                    "queue_length": 2
                }
            }
    
    class ModelInfo(BaseModel):
        """Model information response schema."""
        model_name: str
        model_version: str
        model_type: str
        input_shape: List[int]
        output_classes: List[str]
        parameters: int
        model_size_mb: float
        training_date: str
        
        class Config:
            schema_extra = {
                "example": {
                    "model_name": "SampleCNN",
                    "model_version": "1.0.0",
                    "model_type": "CNN",
                    "input_shape": [3, 32, 32],
                    "output_classes": ["class_0", "class_1", "class_2"],
                    "parameters": 25088,
                    "model_size_mb": 0.1,
                    "training_date": "2025-08-13"
                }
            }

else:
    # Dummy classes for when FastAPI is not available
    class PredictionRequest: pass
    class PredictionResponse: pass
    class BatchPredictionRequest: pass
    class BatchPredictionResponse: pass
    class HealthResponse: pass
    class ModelInfo: pass

print("📋 DATA MODELS CONFIGURED")
print("=" * 50)
print("✅ Request/Response schemas defined")
print("✅ Validation models ready")
print("📊 Example schemas with documentation")
```

## 3. Sample Model and Core Inference Engine

```python
# Sample CNN model for demonstration
class SampleCNN(nn.Module):
    """Sample CNN model for demonstration purposes."""
    
    def __init__(self, num_classes=10):
        super(SampleCNN, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((8, 8)),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.AdaptiveAvgPool2d((4, 4))
        )
        
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(64 * 4 * 4, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )
    
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# Advanced Model Server with Async Batching
class ModelServer:
    """High-performance model server with async batch processing."""
    
    def __init__(self, model, device, max_batch_size=32, batch_timeout=0.01):
        self.model = model.to(device)
        self.device = device
        self.max_batch_size = max_batch_size
        self.batch_timeout = batch_timeout
        
        # Request queue and processing
        self.request_queue = asyncio.Queue()
        self.response_futures = {}
        self.batch_processor_task = None
        self.is_running = False
        
        # Statistics tracking
        self.stats = {
            'total_requests': 0,
            'successful_requests': 0,
            'failed_requests': 0,
            'total_inference_time': 0,
            'total_batches': 0,
            'batch_sizes': [],
            'queue_lengths': [],
            'start_time': time.time()
        }
        
        # Model metadata
        self.model_info = {
            'name': 'SampleCNN',
            'version': '1.0.0',
            'type': 'CNN',
            'input_shape': [3, 32, 32],
            'output_classes': [f'class_{i}' for i in range(10)],
            'parameters': sum(p.numel() for p in model.parameters()),
            'model_size_mb': sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 * 1024),
            'training_date': datetime.now().strftime('%Y-%m-%d')
        }
        
        print(f"🤖 Model Server initialized")
        print(f"   📊 Parameters: {self.model_info['parameters']:,}")
        print(f"   💾 Size: {self.model_info['model_size_mb']:.2f} MB")
        print(f"   🔄 Max batch size: {max_batch_size}")
        print(f"   ⏱️ Batch timeout: {batch_timeout}s")
    
    async def start_batch_processor(self):
        """Start the batch processing task."""
        if not self.is_running:
            self.is_running = True
            self.batch_processor_task = asyncio.create_task(self._batch_processor())
            print("🚀 Batch processor started")
    
    async def stop_batch_processor(self):
        """Stop the batch processing task."""
        if self.is_running:
            self.is_running = False
            if self.batch_processor_task:
                self.batch_processor_task.cancel()
                try:
                    await self.batch_processor_task
                except asyncio.CancelledError:
                    pass
            print("🛑 Batch processor stopped")
    
    async def _batch_processor(self):
        """Main batch processing loop."""
        while self.is_running:
            try:
                batch = []
                batch_futures = []
                
                # Collect requests for batch
                deadline = time.time() + self.batch_timeout
                
                while len(batch) < self.max_batch_size and time.time() < deadline:
                    try:
                        timeout = max(0.001, deadline - time.time())
                        request_id, request_data, future = await asyncio.wait_for(
                            self.request_queue.get(), timeout=timeout
                        )
                        batch.append((request_id, request_data))
                        batch_futures.append(future)
                    except asyncio.TimeoutError:
                        break
                
                # Process batch if we have requests
                if batch:
                    await self._process_batch(batch, batch_futures)
                
                # Small delay to prevent tight loop
                await asyncio.sleep(0.001)
                
            except Exception as e:
                print(f"Error in batch processor: {e}")
                await asyncio.sleep(0.1)
    
    async def _process_batch(self, batch, futures):
        """Process a batch of requests."""
        try:
            batch_start_time = time.time()
            
            # Prepare batch tensor
            batch_tensors = []
            for request_id, request_data in batch:
                tensor = self._prepare_tensor(request_data)
                batch_tensors.append(tensor)
            
            if not batch_tensors:
                return
            
            # Stack tensors into batch
            batch_tensor = torch.stack(batch_tensors).to(self.device)
            
            # Run inference
            with torch.no_grad():
                outputs = self.model(batch_tensor)
                probabilities = torch.softmax(outputs, dim=1)
                predictions = torch.argmax(probabilities, dim=1)
            
            batch_inference_time = (time.time() - batch_start_time) * 1000
            
            # Process results for each request
            for i, ((request_id, request_data), future) in enumerate(zip(batch, futures)):
                try:
                    response = self._create_response(
                        predictions[i].item(),
                        probabilities[i].cpu().numpy(),
                        batch_inference_time / len(batch),
                        request_data,
                        request_id
                    )
                    future.set_result(response)
                    self.stats['successful_requests'] += 1
                except Exception as e:
                    future.set_exception(e)
                    self.stats['failed_requests'] += 1
            
            # Update statistics
            self.stats['total_batches'] += 1
            self.stats['batch_sizes'].append(len(batch))
            self.stats['total_inference_time'] += batch_inference_time
            self.stats['queue_lengths'].append(self.request_queue.qsize())
            
        except Exception as e:
            # Set exception for all futures in case of batch failure
            for future in futures:
                if not future.done():
                    future.set_exception(e)
                    self.stats['failed_requests'] += 1
    
    def _prepare_tensor(self, request_data):
        """Prepare tensor from request data."""
        if 'tensor_data' in request_data and request_data['tensor_data'] is not None:
            tensor = torch.tensor(request_data['tensor_data'], dtype=torch.float32)
            if tensor.dim() == 3:  # Add batch dimension
                tensor = tensor.unsqueeze(0)
            return tensor.squeeze(0)  # Remove batch dim for stacking later
        else:
            # Default random tensor for demonstration
            return torch.randn(3, 32, 32)
    
    def _create_response(self, prediction, probabilities, inference_time, request_data, request_id):
        """Create response dictionary."""
        response = {
            'predicted_class': prediction,
            'class_name': self.model_info['output_classes'][prediction],
            'confidence': float(probabilities[prediction]),
            'inference_time_ms': inference_time,
            'model_version': self.model_info['version'],
            'request_id': request_id
        }
        
        if request_data.get('return_probabilities', True):
            response['probabilities'] = probabilities.tolist()
        
        return response
    
    async def predict(self, request_data):
        """Submit prediction request."""
        request_id = f"req_{int(time.time() * 1000)}_{secrets.token_hex(4)}"
        future = asyncio.Future()
        
        await self.request_queue.put((request_id, request_data, future))
        self.stats['total_requests'] += 1
        
        return await future
    
    def get_health_status(self):
        """Get server health status."""
        uptime = time.time() - self.stats['start_time']
        return {
            'status': 'healthy' if self.is_running else 'stopped',
            'timestamp': datetime.now().isoformat(),
            'version': self.model_info['version'],
            'uptime_seconds': uptime,
            'model_loaded': True,
            'gpu_available': torch.cuda.is_available(),
            'queue_length': self.request_queue.qsize()
        }
    
    def get_model_info(self):
        """Get model information."""
        return self.model_info.copy()
    
    def get_stats(self):
        """Get detailed server statistics."""
        avg_batch_size = np.mean(self.stats['batch_sizes']) if self.stats['batch_sizes'] else 0
        avg_queue_length = np.mean(self.stats['queue_lengths']) if self.stats['queue_lengths'] else 0
        avg_inference_time = (self.stats['total_inference_time'] / 
                            max(self.stats['total_batches'], 1))
        
        success_rate = (self.stats['successful_requests'] / 
                       max(self.stats['total_requests'], 1))
        
        return {
            'total_requests': self.stats['total_requests'],
            'successful_requests': self.stats['successful_requests'],
            'failed_requests': self.stats['failed_requests'],
            'success_rate': success_rate,
            'total_batches': self.stats['total_batches'],
            'avg_batch_size': avg_batch_size,
            'avg_inference_time_ms': avg_inference_time,
            'avg_queue_length': avg_queue_length,
            'uptime_seconds': time.time() - self.stats['start_time'],
            'queue_length': self.request_queue.qsize()
        }

# Initialize model and server
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SampleCNN(num_classes=10)
model.eval()

model_server = ModelServer(model, device, max_batch_size=16, batch_timeout=0.01)

print("\n🤖 MODEL SERVER INITIALIZED")
print("=" * 50)
print(f"✅ Model: {model_server.model_info['name']}")
print(f"📊 Parameters: {model_server.model_info['parameters']:,}")
print(f"💾 Size: {model_server.model_info['model_size_mb']:.2f} MB")
print(f"🔧 Device: {device}")
print(f"🔄 Max batch size: {model_server.max_batch_size}")
print(f"⏱️ Batch timeout: {model_server.batch_timeout}s")
```

## 4. Authentication and Security System

```python
# Advanced Authentication and Rate Limiting
class APIKeyManager:
    """Comprehensive API key management with tiered access."""
    
    def __init__(self):
        # In production, use a proper database
        self.api_keys = {
            "test_key_123": {
                "name": "Test User",
                "tier": "standard",
                "rate_limit": 100,  # requests per minute
                "created_at": datetime.now(),
                "last_used": None,
                "total_requests": 0,
                "features": ["predict", "batch_predict", "model_info"]
            },
            "premium_key_456": {
                "name": "Premium User", 
                "tier": "premium",
                "rate_limit": 1000,
                "created_at": datetime.now(),
                "last_used": None,
                "total_requests": 0,
                "features": ["predict", "batch_predict", "model_info", "stats", "metrics"]
            },
            "enterprise_key_789": {
                "name": "Enterprise User",
                "tier": "enterprise", 
                "rate_limit": 10000,
                "created_at": datetime.now(),
                "last_used": None,
                "total_requests": 0,
                "features": ["predict", "batch_predict", "model_info", "stats", "metrics", "admin"]
            }
        }
        
        # Rate limiting tracking with time windows
        self.rate_limit_store = defaultdict(lambda: deque(maxlen=1000))
    
    def validate_api_key(self, api_key: str) -> Optional[Dict]:
        """Validate API key and return user info."""
        if api_key in self.api_keys:
            user_info = self.api_keys[api_key].copy()
            # Update usage stats
            self.api_keys[api_key]["last_used"] = datetime.now()
            self.api_keys[api_key]["total_requests"] += 1
            return user_info
        return None
    
    def check_rate_limit(self, api_key: str, rate_limit: int) -> tuple[bool, Dict]:
        """Check if request is within rate limits and return limit info."""
        now = time.time()
        minute_ago = now - 60
        
        # Clean old entries
        user_requests = self.rate_limit_store[api_key]
        while user_requests and user_requests[0] < minute_ago:
            user_requests.popleft()
        
        current_count = len(user_requests)
        remaining = max(0, rate_limit - current_count)
        
        # Check if under limit
        if current_count < rate_limit:
            user_requests.append(now)
            return True, {
                'limit': rate_limit,
                'remaining': remaining - 1,
                'reset_time': int(now + 60),
                'current_count': current_count + 1
            }
        
        return False, {
            'limit': rate_limit,
            'remaining': 0,
            'reset_time': int(now + 60),
            'current_count': current_count
        }
    
    def check_feature_access(self, api_key: str, feature: str) -> bool:
        """Check if API key has access to specific feature."""
        user_info = self.api_keys.get(api_key)
        if not user_info:
            return False
        return feature in user_info.get('features', [])
    
    def get_usage_stats(self, api_key: str) -> Dict:
        """Get comprehensive usage statistics for API key."""
        if api_key not in self.api_keys:
            return {}
        
        user_info = self.api_keys[api_key]
        current_minute_requests = len([
            req_time for req_time in self.rate_limit_store[api_key]
            if req_time > time.time() - 60
        ])
        
        return {
            'user_name': user_info['name'],
            'tier': user_info['tier'],
            'total_requests': user_info['total_requests'],
            'current_minute_requests': current_minute_requests,
            'rate_limit': user_info['rate_limit'],
            'features': user_info['features'],
            'last_used': user_info['last_used'].isoformat() if user_info['last_used'] else None,
            'created_at': user_info['created_at'].isoformat()
        }

class SecurityMiddleware:
    """Advanced security middleware with threat detection."""
    
    def __init__(self):
        self.blocked_ips = set()
        self.suspicious_requests = defaultdict(int)
        self.request_patterns = defaultdict(lambda: deque(maxlen=100))
        self.threat_patterns = [
            'script', 'javascript:', 'eval(', 'exec(', 'import os',
            'subprocess', '__import__', 'system('
        ]
    
    def validate_request_size(self, content_length: int, max_size: int = 10 * 1024 * 1024) -> bool:
        """Validate request size with configurable limits."""
        return content_length <= max_size
    
    def detect_suspicious_activity(self, client_ip: str, request_path: str, user_agent: str = "") -> Dict:
        """Advanced suspicious activity detection."""
        now = time.time()
        
        # Track request patterns
        self.request_patterns[client_ip].append(now)
        
        threat_score = 0
        threats_detected = []
        
        # Check for rapid requests (more than 50 requests in 10 seconds)
        recent_requests = [
            req_time for req_time in self.request_patterns[client_ip]
            if req_time > now - 10
        ]
        
        if len(recent_requests) > 50:
            threat_score += 30
            threats_detected.append("rapid_requests")
            self.suspicious_requests[client_ip] += 1
        
        # Check for suspicious patterns in request path
        for pattern in self.threat_patterns:
            if pattern.lower() in request_path.lower():
                threat_score += 40
                threats_detected.append(f"suspicious_pattern_{pattern}")
        
        # Check user agent
        if not user_agent or len(user_agent) < 10:
            threat_score += 10
            threats_detected.append("suspicious_user_agent")
        
        # Block if threat score is too high
        is_blocked = threat_score >= 50 or self.suspicious_requests[client_ip] > 5
        if is_blocked:
            self.blocked_ips.add(client_ip)
        
        return {
            'is_blocked': is_blocked or client_ip in self.blocked_ips,
            'threat_score': threat_score,
            'threats_detected': threats_detected,
            'request_count_10s': len(recent_requests),
            'total_suspicious_requests': self.suspicious_requests[client_ip]
        }
    
    def sanitize_input(self, data: Any) -> Any:
        """Comprehensive input sanitization."""
        if isinstance(data, str):
            # Remove potential script injections
            for pattern in self.threat_patterns:
                data = data.replace(pattern, '')
            # Additional sanitization
            data = data.replace('<script>', '').replace('</script>', '')
            data = data.replace('javascript:', '').replace('data:', '')
        elif isinstance(data, dict):
            return {k: self.sanitize_input(v) for k, v in data.items()}
        elif isinstance(data, list):
            return [self.sanitize_input(item) for item in data]
        
        return data
    
    def get_security_stats(self) -> Dict:
        """Get security statistics."""
        return {
            'blocked_ips_count': len(self.blocked_ips),
            'blocked_ips': list(self.blocked_ips),
            'suspicious_requests_count': sum(self.suspicious_requests.values()),
            'unique_suspicious_ips': len(self.suspicious_requests),
            'threat_patterns_count': len(self.threat_patterns)
        }

# Initialize security components
api_key_manager = APIKeyManager()
security_middleware = SecurityMiddleware()

print("\n🔐 SECURITY SYSTEM CONFIGURED")
print("=" * 50)
print(f"📋 API Keys configured: {len(api_key_manager.api_keys)}")
print("🛡️ Security middleware active")
print("⚡ Rate limiting enabled")
print("🔍 Threat detection ready")

# Test authentication and security
print("\n🧪 TESTING SECURITY SYSTEM")
print("-" * 30)

# Test API key validation
for key, info in list(api_key_manager.api_keys.items())[:2]:
    user_info = api_key_manager.validate_api_key(key)
    print(f"✅ {user_info['name']} ({user_info['tier']}): {user_info['rate_limit']} req/min")

# Test rate limiting
test_key = "test_key_123"
is_allowed, limit_info = api_key_manager.check_rate_limit(test_key, 100)
print(f"⚡ Rate limit test: {'✅ Allowed' if is_allowed else '❌ Blocked'}")
print(f"   Remaining: {limit_info['remaining']}, Reset: {limit_info['reset_time']}")

# Test security detection
security_result = security_middleware.detect_suspicious_activity(
    "192.168.1.100", "/predict", "TestClient/1.0"
)
print(f"🛡️ Security check: Score {security_result['threat_score']}, Blocked: {security_result['is_blocked']}")
```

## 5. FastAPI Application Implementation

```python
# Complete FastAPI Application with All Features
if FASTAPI_AVAILABLE:
    
    # Application lifecycle management
    @asynccontextmanager
    async def lifespan(app: FastAPI):
        """Manage application lifecycle with proper startup/shutdown."""
        # Startup
        print("🚀 Starting model server...")
        await model_server.start_batch_processor()
        print("✅ Model server ready!")
        
        yield
        
        # Shutdown
        print("🔄 Shutting down model server...")
        await model_server.stop_batch_processor()
        print("✅ Model server stopped!")
    
    # Initialize FastAPI app with comprehensive configuration
    app = FastAPI(
        title="PyTorch Model Serving API",
        description="""
        **Production-Grade PyTorch Model Serving**
        
        High-performance model inference API with:
        - Async batch processing for optimal throughput
        - Tiered authentication and rate limiting
        - Comprehensive monitoring and metrics
        - Enterprise security features
        - Real-time health monitoring
        
        **Authentication**: Include `X-API-Key` header with your API key.
        **Rate Limits**: Based on your subscription tier.
        """,
        version="1.0.0",
        docs_url="/docs",
        redoc_url="/redoc",
        lifespan=lifespan,
        openapi_tags=[
            {"name": "health", "description": "Health and status endpoints"},
            {"name": "prediction", "description": "Model inference endpoints"},
            {"name": "monitoring", "description": "Metrics and monitoring"},
            {"name": "admin", "description": "Administrative endpoints"}
        ]
    )
    
    # Add comprehensive middleware
    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],  # Configure appropriately for production
        allow_credentials=True,
        allow_methods=["GET", "POST"],
        allow_headers=["*"],
    )
    
    app.add_middleware(GZipMiddleware, minimum_size=1000)
    
    # Custom middleware for request logging and security
    @app.middleware("http")
    async def security_middleware_func(request: Request, call_next):
        start_time = time.time()
        
        # Security checks
        client_ip = request.client.host
        user_agent = request.headers.get("user-agent", "")
        content_length = int(request.headers.get("content-length", 0))
        
        # Validate request size
        if not security_middleware.validate_request_size(content_length):
            return JSONResponse(
                status_code=413,
                content={"error": "Request too large", "max_size_mb": 10}
            )
        
        # Check for suspicious activity
        security_result = security_middleware.detect_suspicious_activity(
            client_ip, str(request.url.path), user_agent
        )
        
        if security_result['is_blocked']:
            return JSONResponse(
                status_code=429,
                content={
                    "error": "Blocked due to suspicious activity",
                    "threat_score": security_result['threat_score'],
                    "contact": "support@example.com"
                }
            )
        
        # Process request
        response = await call_next(request)
        
        # Add security headers
        response.headers["X-Content-Type-Options"] = "nosniff"
        response.headers["X-Frame-Options"] = "DENY"
        response.headers["X-XSS-Protection"] = "1; mode=block"
        
        # Add timing header
        process_time = time.time() - start_time
        response.headers["X-Process-Time"] = str(process_time)
        
        return response
    
    # Security dependencies
    api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
    
    async def get_api_key(api_key: str = Depends(api_key_header)):
        """Validate API key and return user info."""
        if not api_key:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="API key required",
                headers={"WWW-Authenticate": "ApiKey"}
            )
        
        user_info = api_key_manager.validate_api_key(api_key)
        if not user_info:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Invalid API key",
                headers={"WWW-Authenticate": "ApiKey"}
            )
        
        return user_info
    
    async def rate_limit_check(
        request: Request,
        user_info: dict = Depends(get_api_key)
    ):
        """Check rate limits with detailed headers."""
        api_key = request.headers.get("X-API-Key")
        
        is_allowed, limit_info = api_key_manager.check_rate_limit(api_key, user_info["rate_limit"])
        
        if not is_allowed:
            raise HTTPException(
                status_code=status.HTTP_429_TOO_MANY_REQUESTS,
                detail="Rate limit exceeded",
                headers={
                    "X-RateLimit-Limit": str(limit_info['limit']),
                    "X-RateLimit-Remaining": str(limit_info['remaining']),
                    "X-RateLimit-Reset": str(limit_info['reset_time']),
                    "Retry-After": "60"
                }
            )
        
        return user_info, limit_info
    
    async def feature_access_check(feature: str):
        """Check feature access for the user."""
        def _check(request: Request, user_info: dict = Depends(get_api_key)):
            api_key = request.headers.get("X-API-Key")
            if not api_key_manager.check_feature_access(api_key, feature):
                raise HTTPException(
                    status_code=status.HTTP_403_FORBIDDEN,
                    detail=f"Access to '{feature}' not available in {user_info['tier']} tier"
                )
            return user_info
        return _check
    
    # API Endpoints
    @app.get("/", tags=["health"])
    async def root():
        """Root endpoint with comprehensive API information."""
        return {
            "service": "PyTorch Model Serving API",
            "version": "1.0.0",
            "status": "running",
            "features": {
                "async_batching": True,
                "rate_limiting": True,
                "authentication": True,
                "monitoring": True,
                "auto_scaling": True
            },
            "endpoints": {
                "health": "/health",
                "model_info": "/model/info", 
                "predict": "/predict",
                "batch_predict": "/batch_predict",
                "stats": "/stats",
                "metrics": "/metrics",
                "docs": "/docs"
            },
            "authentication": {
                "type": "API Key",
                "header": "X-API-Key",
                "tiers": ["standard", "premium", "enterprise"]
            }
        }
    
    @app.get("/health", response_model=HealthResponse, tags=["health"])
    async def health_check():
        """Comprehensive health check endpoint."""
        return model_server.get_health_status()
    
    @app.get("/model/info", response_model=ModelInfo, tags=["prediction"])
    async def get_model_info(user_info: dict = Depends(get_api_key)):
        """Get detailed model information."""
        return model_server.get_model_info()
    
    @app.get("/stats", tags=["monitoring"])
    async def get_stats(
        user_info: dict = Depends(feature_access_check("stats"))
    ):
        """Get comprehensive server statistics."""
        server_stats = model_server.get_stats()
        
        # Get API key from request to show usage stats
        # In real implementation, extract from request context
        api_key = "test_key_123"  # Placeholder
        usage_stats = api_key_manager.get_usage_stats(api_key)
        security_stats = security_middleware.get_security_stats()
        
        return {
            "server": server_stats,
            "user": usage_stats,
            "security": security_stats,
            "timestamp": datetime.now().isoformat()
        }
    
    @app.post("/predict", response_model=PredictionResponse, tags=["prediction"])
    async def predict(
        request_data: PredictionRequest,
        req: Request,
        rate_check: tuple = Depends(rate_limit_check)
    ):
        """Single prediction endpoint with async processing."""
        user_info, limit_info = rate_check
        
        try:
            # Sanitize input data
            request_dict = security_middleware.sanitize_input(request_data.dict())
            
            # Make prediction using async batch processor
            result = await model_server.predict(request_dict)
            
            return result
            
        except Exception as e:
            raise HTTPException(
                status_code=500, 
                detail=f"Prediction failed: {str(e)}"
            )
    
    @app.post("/batch_predict", response_model=BatchPredictionResponse, tags=["prediction"])
    async def batch_predict(
        request: BatchPredictionRequest,
        req: Request,
        rate_check: tuple = Depends(rate_limit_check)
    ):
        """Batch prediction endpoint for high-throughput processing."""
        user_info, limit_info = rate_check
        
        # Check batch size limits based on tier
        max_batch_sizes = {"standard": 10, "premium": 50, "enterprise": 100}
        max_batch = max_batch_sizes.get(user_info['tier'], 10)
        
        if len(request.batch) > max_batch:
            raise HTTPException(
                status_code=400,
                detail=f"Batch size {len(request.batch)} exceeds limit of {max_batch} for {user_info['tier']} tier"
            )
        
        try:
            batch_id = request.batch_id or f"batch_{int(time.time() * 1000)}_{secrets.token_hex(4)}"
            batch_start_time = time.time()
            
            # Process batch requests
            predictions = []
            for item in request.batch:
                item_dict = security_middleware.sanitize_input(item.dict())
                result = await model_server.predict(item_dict)
                predictions.append(result)
            
            total_time = (time.time() - batch_start_time) * 1000
            
            return {
                "predictions": predictions,
                "batch_size": len(predictions),
                "total_inference_time_ms": total_time,
                "batch_id": batch_id
            }
            
        except Exception as e:
            raise HTTPException(
                status_code=500, 
                detail=f"Batch prediction failed: {str(e)}"
            )
    
    @app.get("/metrics", tags=["monitoring"])
    async def get_metrics(
        user_info: dict = Depends(feature_access_check("metrics"))
    ):
        """Prometheus-style metrics endpoint."""
        stats = model_server.get_stats()
        
        metrics = {
            # Counter metrics
            "model_requests_total": stats["total_requests"],
            "model_requests_successful_total": stats["successful_requests"],
            "model_requests_failed_total": stats["failed_requests"],
            "model_batches_processed_total": stats["total_batches"],
            
            # Gauge metrics
            "model_queue_length": stats["queue_length"],
            "model_uptime_seconds": stats["uptime_seconds"],
            
            # Histogram metrics
            "model_inference_duration_seconds": stats["avg_inference_time_ms"] / 1000,
            "model_batch_size_avg": stats["avg_batch_size"],
            
            # Rate metrics
            "model_success_rate": stats["success_rate"],
            "model_requests_per_second": stats["total_requests"] / max(stats["uptime_seconds"], 1)
        }
        
        return metrics
    
    @app.get("/admin/security", tags=["admin"])
    async def get_security_info(
        user_info: dict = Depends(feature_access_check("admin"))
    ):
        """Administrative security information."""
        return security_middleware.get_security_stats()
    
    # Comprehensive error handlers
    @app.exception_handler(HTTPException)
    async def http_exception_handler(request: Request, exc: HTTPException):
        """Custom HTTP exception handler with detailed error info."""
        return JSONResponse(
            status_code=exc.status_code,
            content={
                "error": {
                    "message": exc.detail,
                    "status_code": exc.status_code,
                    "type": "HTTPException",
                    "timestamp": datetime.now().isoformat(),
                    "path": str(request.url.path),
                    "method": request.method
                },
                "support": {
                    "docs": "/docs",
                    "contact": "support@example.com"
                }
            }
        )
    
    @app.exception_handler(Exception)
    async def general_exception_handler(request: Request, exc: Exception):
        """General exception handler for unexpected errors."""
        error_id = secrets.token_hex(8)
        
        # Log error details (in production, use proper logging)
        print(f"Error {error_id}: {str(exc)}")
        
        return JSONResponse(
            status_code=500,
            content={
                "error": {
                    "message": "Internal server error",
                    "status_code": 500,
                    "type": "InternalServerError",
                    "timestamp": datetime.now().isoformat(),
                    "path": str(request.url.path),
                    "error_id": error_id
                },
                "support": {
                    "docs": "/docs",
                    "contact": "support@example.com"
                }
            }
        )
    
    print("\n🌐 FASTAPI APPLICATION CONFIGURED")
    print("=" * 50)
    print("✅ All endpoints registered")
    print("🔐 Authentication middleware active")
    print("⚡ Rate limiting with tier-based limits")
    print("🛡️ Security checks and threat detection")
    print("📊 Comprehensive metrics and monitoring")
    print("🚨 Custom error handlers")

else:
    print("\n⚠️ FastAPI not available - API server simulation only")
    app = None
```

## 6. API Testing and Load Testing Framework

```python
# Comprehensive API Client and Load Testing Framework
class ModelAPIClient:
    """Production-ready API client with comprehensive features."""
    
    def __init__(self, base_url: str, api_key: str):
        self.base_url = base_url.rstrip('/')
        self.api_key = api_key
        
        # Request tracking and statistics
        self.request_count = 0
        self.error_count = 0
        self.total_latency = 0
        self.response_times = []
        self.error_log = []
        
        # Session configuration (would use aiohttp/httpx in production)
        self.timeout = 30
        self.retry_attempts = 3
        
    def _get_headers(self) -> Dict[str, str]:
        """Get comprehensive request headers."""
        return {
            "X-API-Key": self.api_key,
            "Content-Type": "application/json",
            "User-Agent": "PyTorch-Model-Client/1.0",
            "Accept": "application/json",
            "Cache-Control": "no-cache"
        }
    
    async def health_check(self) -> Dict:
        """Check API health with detailed response."""
        # Simulated response for demonstration
        return {
            "status": "healthy",
            "timestamp": datetime.now().isoformat(),
            "version": "1.0.0",
            "uptime_seconds": 3600,
            "model_loaded": True,
            "gpu_available": torch.cuda.is_available(),
            "queue_length": 2
        }
    
    async def get_model_info(self) -> Dict:
        """Get detailed model information."""
        return model_server.get_model_info()
    
    async def predict(
        self,
        image_data: Optional[str] = None,
        tensor_data: Optional[List] = None,
        return_probabilities: bool = True,
        confidence_threshold: float = 0.5
    ) -> Dict:
        """Make single prediction with comprehensive error handling."""
        start_time = time.time()
        
        try:
            # Prepare request data
            request_data = {
                "image_data": image_data,
                "tensor_data": tensor_data,
                "return_probabilities": return_probabilities,
                "confidence_threshold": confidence_threshold
            }
            
            # Make prediction using model server directly (simulated)
            result = await model_server.predict(request_data)
            
            # Track metrics
            latency = time.time() - start_time
            self.request_count += 1
            self.total_latency += latency
            self.response_times.append(latency)
            
            return result
            
        except Exception as e:
            self.error_count += 1
            error_info = {
                'timestamp': datetime.now().isoformat(),
                'error': str(e),
                'latency': time.time() - start_time
            }
            self.error_log.append(error_info)
            raise Exception(f"Prediction failed: {str(e)}")
    
    async def batch_predict(self, batch_requests: List[Dict], batch_id: Optional[str] = None) -> Dict:
        """Make batch prediction with performance tracking."""
        start_time = time.time()
        
        try:
            predictions = []
            for request_data in batch_requests:
                result = await model_server.predict(request_data)
                predictions.append(result)
            
            batch_id = batch_id or f"batch_{int(time.time() * 1000)}_{secrets.token_hex(4)}"
            total_time = (time.time() - start_time) * 1000
            
            # Track batch metrics
            self.request_count += len(batch_requests)
            self.total_latency += time.time() - start_time
            
            return {
                "predictions": predictions,
                "batch_size": len(predictions),
                "total_inference_time_ms": total_time,
                "batch_id": batch_id
            }
            
        except Exception as e:
            self.error_count += len(batch_requests)
            error_info = {
                'timestamp': datetime.now().isoformat(),
                'error': str(e),
                'batch_size': len(batch_requests),
                'latency': time.time() - start_time
            }
            self.error_log.append(error_info)
            raise Exception(f"Batch prediction failed: {str(e)}")
    
    def get_client_stats(self) -> Dict:
        """Get comprehensive client-side statistics."""
        if self.request_count == 0:
            return {"message": "No requests made yet"}
        
        avg_latency = self.total_latency / self.request_count
        success_rate = (self.request_count - self.error_count) / self.request_count
        
        stats = {
            "total_requests": self.request_count,
            "successful_requests": self.request_count - self.error_count,
            "error_count": self.error_count,
            "success_rate": success_rate,
            "average_latency_ms": avg_latency * 1000,
            "total_latency_seconds": self.total_latency
        }
        
        if self.response_times:
            stats.update({
                "min_latency_ms": min(self.response_times) * 1000,
                "max_latency_ms": max(self.response_times) * 1000,
                "p95_latency_ms": np.percentile(self.response_times, 95) * 1000,
                "p99_latency_ms": np.percentile(self.response_times, 99) * 1000
            })
        
        return stats

# Advanced Load Testing Framework
class LoadTester:
    """Comprehensive load testing framework with detailed analytics."""
    
    def __init__(self, client: ModelAPIClient):
        self.client = client
        self.results = []
        self.errors = []
        
    async def single_request_test(self, num_requests: int = 100):
        """Test single request performance with detailed metrics."""
        print(f"\n🧪 SINGLE REQUEST LOAD TEST")
        print(f"Target: {num_requests} requests")
        print("=" * 50)
        
        start_time = time.time()
        latencies = []
        errors = []
        
        for i in range(num_requests):
            try:
                request_start = time.time()
                
                # Generate realistic test data
                test_tensor = torch.randn(3, 32, 32).tolist()
                result = await self.client.predict(
                    tensor_data=test_tensor,
                    return_probabilities=True,
                    confidence_threshold=0.5
                )
                
                latency = time.time() - request_start
                latencies.append(latency)
                
                # Progress updates
                if (i + 1) % max(1, num_requests // 10) == 0:
                    progress = (i + 1) / num_requests * 100
                    avg_latency = np.mean(latencies[-10:]) if latencies else 0
                    print(f"Progress: {progress:.0f}% | Recent avg latency: {avg_latency*1000:.1f}ms")
                
            except Exception as e:
                errors.append({
                    'request_id': i,
                    'error': str(e),
                    'timestamp': datetime.now().isoformat()
                })
        
        total_time = time.time() - start_time
        
        # Calculate comprehensive statistics
        results = self._calculate_performance_stats(
            "single_requests", latencies, errors, total_time, num_requests
        )
        
        self.results.append(results)
        self._print_test_results(results)
        return results
    
    async def batch_request_test(self, num_batches: int = 20, batch_size: int = 8):
        """Test batch request performance with throughput analysis."""
        print(f"\n🧪 BATCH REQUEST LOAD TEST")
        print(f"Target: {num_batches} batches of {batch_size} items each")
        print("=" * 50)
        
        start_time = time.time()
        batch_latencies = []
        item_latencies = []
        errors = []
        
        for i in range(num_batches):
            try:
                batch_start = time.time()
                
                # Generate batch of test data
                batch_requests = []
                for _ in range(batch_size):
                    test_tensor = torch.randn(3, 32, 32).tolist()
                    batch_requests.append({
                        "tensor_data": test_tensor,
                        "return_probabilities": True,
                        "confidence_threshold": 0.5
                    })
                
                result = await self.client.batch_predict(batch_requests)
                
                batch_latency = time.time() - batch_start
                batch_latencies.append(batch_latency)
                
                # Calculate per-item latency
                item_latency = batch_latency / batch_size
                item_latencies.extend([item_latency] * batch_size)
                
                print(f"Batch {i+1}/{num_batches}: {batch_size} items in {batch_latency*1000:.1f}ms ({item_latency*1000:.1f}ms/item)")
                
            except Exception as e:
                errors.append({
                    'batch_id': i,
                    'error': str(e),
                    'timestamp': datetime.now().isoformat()
                })
        
        total_time = time.time() - start_time
        total_items = num_batches * batch_size
        
        # Calculate batch-specific statistics
        results = {
            "test_type": "batch_requests",
            "total_batches": num_batches,
            "batch_size": batch_size,
            "total_items": total_items,
            "successful_batches": len(batch_latencies),
            "failed_batches": len(errors),
            "total_time_seconds": total_time,
            "throughput_items_per_second": len(item_latencies) / total_time if total_time > 0 else 0,
            "throughput_batches_per_second": len(batch_latencies) / total_time if total_time > 0 else 0
        }
        
        if batch_latencies:
            results.update({
                "avg_batch_latency_ms": np.mean(batch_latencies) * 1000,
                "min_batch_latency_ms": np.min(batch_latencies) * 1000,
                "max_batch_latency_ms": np.max(batch_latencies) * 1000,
                "p95_batch_latency_ms": np.percentile(batch_latencies, 95) * 1000
            })
        
        if item_latencies:
            results.update({
                "avg_item_latency_ms": np.mean(item_latencies) * 1000,
                "p95_item_latency_ms": np.percentile(item_latencies, 95) * 1000
            })
        
        results["error_rate"] = len(errors) / num_batches if num_batches > 0 else 0
        
        self.results.append(results)
        self._print_test_results(results)
        return results
    
    async def concurrent_test(self, num_concurrent: int = 10, requests_per_client: int = 10):
        """Test concurrent request handling with detailed analysis."""
        print(f"\n🧪 CONCURRENT LOAD TEST")
        print(f"Target: {num_concurrent} concurrent clients, {requests_per_client} requests each")
        print("=" * 50)
        
        async def client_worker(client_id: int):
            """Individual client worker with error tracking."""
            latencies = []
            errors = []
            
            for i in range(requests_per_client):
                try:
                    start_time = time.time()
                    test_tensor = torch.randn(3, 32, 32).tolist()
                    await self.client.predict(tensor_data=test_tensor)
                    latencies.append(time.time() - start_time)
                except Exception as e:
                    errors.append({
                        'client_id': client_id,
                        'request_id': i,
                        'error': str(e),
                        'timestamp': datetime.now().isoformat()
                    })
            
            return latencies, errors
        
        # Run concurrent workers (simulated)
        start_time = time.time()
        all_latencies = []
        all_errors = []
        
        # In real implementation, use asyncio.gather for true concurrency
        for client_id in range(num_concurrent):
            latencies, errors = await client_worker(client_id)
            all_latencies.extend(latencies)
            all_errors.extend(errors)
            
            # Progress update
            progress = (client_id + 1) / num_concurrent * 100
            print(f"Client {client_id + 1}/{num_concurrent} completed ({progress:.0f}%)")
        
        total_time = time.time() - start_time
        total_requests = num_concurrent * requests_per_client
        
        # Calculate comprehensive concurrent statistics
        results = self._calculate_performance_stats(
            "concurrent_requests", all_latencies, all_errors, total_time, total_requests
        )
        
        results.update({
            "num_concurrent_clients": num_concurrent,
            "requests_per_client": requests_per_client
        })
        
        self.results.append(results)
        self._print_test_results(results)
        return results
    
    def _calculate_performance_stats(self, test_type: str, latencies: List[float], 
                                   errors: List[Dict], total_time: float, 
                                   total_requests: int) -> Dict:
        """Calculate comprehensive performance statistics."""
        successful_requests = len(latencies)
        failed_requests = len(errors)
        
        results = {
            "test_type": test_type,
            "total_requests": total_requests,
            "successful_requests": successful_requests,
            "failed_requests": failed_requests,
            "total_time_seconds": total_time,
            "error_rate": failed_requests / total_requests if total_requests > 0 else 0,
            "success_rate": successful_requests / total_requests if total_requests > 0 else 0
        }
        
        if latencies:
            results.update({
                "throughput_rps": successful_requests / total_time if total_time > 0 else 0,
                "avg_latency_ms": np.mean(latencies) * 1000,
                "min_latency_ms": np.min(latencies) * 1000,
                "max_latency_ms": np.max(latencies) * 1000,
                "median_latency_ms": np.median(latencies) * 1000,
                "p95_latency_ms": np.percentile(latencies, 95) * 1000,
                "p99_latency_ms": np.percentile(latencies, 99) * 1000,
                "std_latency_ms": np.std(latencies) * 1000
            })
        
        return results
    
    def _print_test_results(self, results: Dict):
        """Print formatted test results."""
        print(f"\n📊 {results['test_type'].replace('_', ' ').title()} Results:")
        print("-" * 40)
        print(f"Total Requests: {results['total_requests']:,}")
        print(f"Successful: {results['successful_requests']:,}")
        print(f"Failed: {results['failed_requests']:,}")
        print(f"Success Rate: {results['success_rate']:.1%}")
        print(f"Total Time: {results['total_time_seconds']:.1f}s")
        
        if 'throughput_rps' in results:
            print(f"Throughput: {results['throughput_rps']:.1f} RPS")
        
        if 'avg_latency_ms' in results:
            print(f"Avg Latency: {results['avg_latency_ms']:.1f}ms")
            print(f"P95 Latency: {results['p95_latency_ms']:.1f}ms")
            print(f"P99 Latency: {results['p99_latency_ms']:.1f}ms")
        
        if results['test_type'] == 'batch_requests':
            if 'throughput_items_per_second' in results:
                print(f"Item Throughput: {results['throughput_items_per_second']:.1f} items/sec")
            if 'avg_batch_latency_ms' in results:
                print(f"Avg Batch Latency: {results['avg_batch_latency_ms']:.1f}ms")
    
    def generate_comprehensive_report(self) -> str:
        """Generate detailed performance analysis report."""
        if not self.results:
            return "No test results available"
        
        report = "\n📊 COMPREHENSIVE LOAD TEST REPORT\n"
        report += "=" * 60 + "\n\n"
        
        # Executive Summary
        report += "🎯 EXECUTIVE SUMMARY\n"
        report += "-" * 30 + "\n"
        
        total_requests = sum(r['total_requests'] for r in self.results)
        total_successful = sum(r['successful_requests'] for r in self.results)
        overall_success_rate = total_successful / total_requests if total_requests > 0 else 0
        
        report += f"Total Requests Processed: {total_requests:,}\n"
        report += f"Overall Success Rate: {overall_success_rate:.1%}\n"
        report += f"Tests Conducted: {len(self.results)}\n\n"
        
        # Detailed Results
        for i, result in enumerate(self.results, 1):
            report += f"📋 Test {i}: {result['test_type'].replace('_', ' ').title()}\n"
            report += "-" * 40 + "\n"
            
            if result['test_type'] == 'single_requests':
                report += f"Requests: {result['total_requests']:,}\n"
                report += f"Success Rate: {result['success_rate']:.1%}\n"
                report += f"Throughput: {result.get('throughput_rps', 0):.1f} RPS\n"
                report += f"Avg Latency: {result.get('avg_latency_ms', 0):.1f}ms\n"
                report += f"P95 Latency: {result.get('p95_latency_ms', 0):.1f}ms\n"
                
            elif result['test_type'] == 'batch_requests':
                report += f"Batches: {result['total_batches']:,}\n"
                report += f"Batch Size: {result['batch_size']}\n"
                report += f"Total Items: {result['total_items']:,}\n"
                report += f"Item Throughput: {result.get('throughput_items_per_second', 0):.1f} items/sec\n"
                report += f"Avg Batch Latency: {result.get('avg_batch_latency_ms', 0):.1f}ms\n"
                report += f"Avg Item Latency: {result.get('avg_item_latency_ms', 0):.1f}ms\n"
                
            elif result['test_type'] == 'concurrent_requests':
                report += f"Concurrent Clients: {result['num_concurrent_clients']}\n"
                report += f"Requests per Client: {result['requests_per_client']}\n"
                report += f"Total Requests: {result['total_requests']:,}\n"
                report += f"Throughput: {result.get('throughput_rps', 0):.1f} RPS\n"
                report += f"Avg Latency: {result.get('avg_latency_ms', 0):.1f}ms\n"
                report += f"P95 Latency: {result.get('p95_latency_ms', 0):.1f}ms\n"
            
            report += "\n"
        
        # Performance Recommendations
        report += "💡 PERFORMANCE RECOMMENDATIONS\n"
        report += "-" * 40 + "\n"
        
        recommendations = []
        
        for result in self.results:
            if result.get('error_rate', 0) > 0.05:
                recommendations.append(f"High error rate in {result['test_type']} ({result['error_rate']:.1%}) - investigate error handling")
            
            if result.get('p95_latency_ms', 0) > 1000:
                recommendations.append(f"High P95 latency in {result['test_type']} ({result['p95_latency_ms']:.0f}ms) - consider optimization")
            
            if result.get('throughput_rps', 0) < 50:
                recommendations.append(f"Low throughput in {result['test_type']} ({result['throughput_rps']:.1f} RPS) - increase batch size or optimize inference")
        
        if not recommendations:
            recommendations.append("Performance looks good! System is handling load well.")
        
        for i, rec in enumerate(recommendations, 1):
            report += f"{i}. {rec}\n"
        
        report += "\n"
        return report

# Initialize client and load tester
print("\n🔧 INITIALIZING TEST FRAMEWORK")
print("=" * 50)

client = ModelAPIClient("http://localhost:8000", "test_key_123")
load_tester = LoadTester(client)

print("✅ API client initialized")
print("🧪 Load testing framework ready")
print("📊 Comprehensive metrics tracking enabled")
```

## 7. Running Comprehensive Load Tests

```python
# Execute comprehensive performance testing
async def run_comprehensive_tests():
    """Run complete test suite with detailed analysis."""
    
    print("\n🚀 STARTING COMPREHENSIVE LOAD TEST SUITE")
    print("=" * 60)
    
    # Start the model server batch processor
    await model_server.start_batch_processor()
    
    try:
        # Test 1: Single Request Performance
        print("\n🎯 Phase 1: Single Request Performance Analysis")
        single_results = await load_tester.single_request_test(num_requests=50)
        
        # Test 2: Batch Processing Efficiency
        print("\n🎯 Phase 2: Batch Processing Efficiency Analysis")
        batch_results = await load_tester.batch_request_test(num_batches=10, batch_size=8)
        
        # Test 3: Concurrent Load Handling
        print("\n🎯 Phase 3: Concurrent Load Handling Analysis")
        concurrent_results = await load_tester.concurrent_test(num_concurrent=5, requests_per_client=10)
        
        # Generate comprehensive report
        print("\n📊 GENERATING COMPREHENSIVE PERFORMANCE REPORT")
        print("=" * 60)
        
        report = load_tester.generate_comprehensive_report()
        print(report)
        
        # Save test results
        test_results = {
            'timestamp': datetime.now().isoformat(),
            'test_configuration': {
                'model_name': model_server.model_info['name'],
                'device': str(device),
                'max_batch_size': model_server.max_batch_size,
                'batch_timeout': model_server.batch_timeout
            },
            'results': load_tester.results,
            'client_stats': client.get_client_stats()
        }
        
        # Save to file
        with open(results_dir / 'load_test_results.json', 'w') as f:
            json.dump(test_results, f, indent=2, default=str)
        
        print(f"\n💾 Test results saved to {results_dir / 'load_test_results.json'}")
        
        # Display server statistics
        server_stats = model_server.get_stats()
        print("\n📈 SERVER PERFORMANCE SUMMARY")
        print("=" * 50)
        print(f"Total requests processed: {server_stats['total_requests']:,}")
        print(f"Success rate: {server_stats['success_rate']:.1%}")
        print(f"Average inference time: {server_stats['avg_inference_time_ms']:.1f}ms")
        print(f"Average batch size: {server_stats['avg_batch_size']:.1f}")
        print(f"Total batches processed: {server_stats['total_batches']}")
        print(f"Current queue length: {server_stats['queue_length']}")
        print(f"Uptime: {server_stats['uptime_seconds']:.1f}s")
        
        return test_results
        
    finally:
        # Stop batch processor
        await model_server.stop_batch_processor()

# Run the comprehensive test suite
test_results = await run_comprehensive_tests()
```

## 8. Deployment Configuration Generation

```python
class DeploymentConfigGenerator:
    """Generate production-ready deployment configurations."""
    
    def __init__(self):
        self.configs = {}
    
    def generate_dockerfile(self) -> str:
        """Generate optimized multi-stage Dockerfile."""
        
        dockerfile_content = '''# Multi-stage Dockerfile for PyTorch model serving
FROM python:3.9-slim as base

# Set environment variables for optimization
ENV PYTHONUNBUFFERED=1 \\
    PYTHONDONTWRITEBYTECODE=1 \\
    PIP_NO_CACHE_DIR=1 \\
    PIP_DISABLE_PIP_VERSION_CHECK=1 \\
    PYTHONHASHSEED=random

# Install system dependencies
RUN apt-get update && apt-get install -y \\
    build-essential \\
    curl \\
    git \\
    && rm -rf /var/lib/apt/lists/* \\
    && apt-get clean

# Create app user for security
RUN groupadd -r appuser && useradd -r -g appuser appuser

# Set working directory
WORKDIR /app

# Copy requirements first for better Docker layer caching
COPY requirements.txt .

# Install Python dependencies with optimizations
RUN pip install --no-cache-dir --upgrade pip \\
    && pip install --no-cache-dir -r requirements.txt \\
    && pip install --no-cache-dir uvicorn[standard] gunicorn

# Copy application code
COPY src/ ./src/
COPY web_app/ ./web_app/
COPY models/ ./models/
COPY config/ ./config/

# Create necessary directories with proper permissions
RUN mkdir -p /app/logs /app/data /app/results /app/temp \\
    && chown -R appuser:appuser /app

# Switch to non-root user
USER appuser

# Health check configuration
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \\
    CMD curl -f http://localhost:8000/health || exit 1

# Expose application port
EXPOSE 8000

# Use uvicorn for production ASGI server
CMD ["uvicorn", "web_app.api_server:app", \\
     "--host", "0.0.0.0", \\
     "--port", "8000", \\
     "--workers", "1", \\
     "--access-log", \\
     "--log-config", "config/logging.yaml"]
        '''
        
        return dockerfile_content
    
    def generate_docker_compose(self) -> str:
        """Generate comprehensive Docker Compose configuration."""
        
        compose_content = '''version: '3.8'

services:
  pytorch-api:
    build:
      context: .
      dockerfile: Dockerfile
      target: base
    ports:
      - "8000:8000"
    environment:
      - MODEL_PATH=/app/models/model.pth
      - LOG_LEVEL=INFO
      - MAX_BATCH_SIZE=32
      - BATCH_TIMEOUT=0.01
      - DEVICE=cpu
      - WORKERS=1
      - HOST=0.0.0.0
      - PORT=8000
    volumes:
      - ./models:/app/models:ro
      - ./logs:/app/logs
      - ./data:/app/data:ro
      - ./config:/app/config:ro
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    deploy:
      resources:
        limits:
          cpus: '2.0'
          memory: 4G
        reservations:
          cpus: '1.0'
          memory: 2G
    networks:
      - pytorch-network
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.pytorch-api.rule=Host(`api.pytorch.local`)"

  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx.conf:/etc/nginx/nginx.conf:ro
      - ./ssl:/etc/nginx/ssl:ro
      - ./logs/nginx:/var/log/nginx
    depends_on:
      - pytorch-api
    restart: unless-stopped
    networks:
      - pytorch-network

  prometheus:
    image: prom/prometheus:latest
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/alert_rules.yml:/etc/prometheus/alert_rules.yml:ro
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    restart: unless-stopped
    networks:
      - pytorch-network

  grafana:
    image: grafana/grafana:latest
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin123
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_INSTALL_PLUGINS=grafana-piechart-panel
    volumes:
      - grafana_data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      - prometheus
    restart: unless-stopped
    networks:
      - pytorch-network

  alertmanager:
    image: prom/alertmanager:latest
    ports:
      - "9093:9093"
    volumes:
      - ./monitoring/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
      - '--web.external-url=http://localhost:9093'
    restart: unless-stopped
    networks:
      - pytorch-network

  redis:
    image: redis:alpine
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    restart: unless-stopped
    networks:
      - pytorch-network
    command: redis-server --appendonly yes

volumes:
  prometheus_data:
  grafana_data:
  alertmanager_data:
  redis_data:

networks:
  pytorch-network:
    driver: bridge
    ipam:
      config:
        - subnet: 172.20.0.0/16
        '''
        
        return compose_content
    
    def generate_kubernetes_manifests(self) -> Dict[str, str]:
        """Generate comprehensive Kubernetes deployment manifests."""
        
        # Namespace
        namespace_yaml = '''apiVersion: v1
kind: Namespace
metadata:
  name: pytorch-ml
  labels:
    name: pytorch-ml
    environment: production
'''
        
        # ConfigMap
        configmap_yaml = '''apiVersion: v1
kind: ConfigMap
metadata:
  name: pytorch-config
  namespace: pytorch-ml
data:
  MAX_BATCH_SIZE: "32"
  BATCH_TIMEOUT: "0.01"
  LOG_LEVEL: "INFO"
  WORKERS: "1"
  DEVICE: "cpu"
  PROMETHEUS_METRICS: "true"
'''
        
        # Deployment
        deployment_yaml = '''apiVersion: apps/v1
kind: Deployment
metadata:
  name: pytorch-model-server
  namespace: pytorch-ml
  labels:
    app: pytorch-model-server
    version: v1.0
    component: inference
spec:
  replicas: 3
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  selector:
    matchLabels:
      app: pytorch-model-server
  template:
    metadata:
      labels:
        app: pytorch-model-server
        version: v1.0
        component: inference
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "8000"
        prometheus.io/path: "/metrics"
    spec:
      serviceAccountName: pytorch-service-account
      securityContext:
        runAsNonRoot: true
        runAsUser: 1000
        fsGroup: 2000
      containers:
      - name: pytorch-api
        image: pytorch-model-server:v1.0
        imagePullPolicy: IfNotPresent
        ports:
        - containerPort: 8000
          name: http
          protocol: TCP
        env:
        - name: MODEL_PATH
          value: "/app/models/model.pth"
        - name: POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        envFrom:
        - configMapRef:
            name: pytorch-config
        resources:
          requests:
            memory: "2Gi"
            cpu: "1000m"
            ephemeral-storage: "1Gi"
          limits:
            memory: "4Gi"
            cpu: "2000m"
            ephemeral-storage: "2Gi"
        livenessProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 30
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 3
          successThreshold: 1
        readinessProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 5
          periodSeconds: 5
          timeoutSeconds: 3
          failureThreshold: 2
          successThreshold: 1
        startupProbe:
          httpGet:
            path: /health
            port: 8000
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          timeoutSeconds: 5
          failureThreshold: 30
          successThreshold: 1
        volumeMounts:
        - name: model-storage
          mountPath: /app/models
          readOnly: true
        - name: log-storage
          mountPath: /app/logs
        - name: tmp-storage
          mountPath: /tmp
        securityContext:
          allowPrivilegeEscalation: false
          readOnlyRootFilesystem: true
          capabilities:
            drop:
            - ALL
      volumes:
      - name: model-storage
        persistentVolumeClaim:
          claimName: model-pvc
      - name: log-storage
        emptyDir:
          sizeLimit: 1Gi
      - name: tmp-storage
        emptyDir:
          sizeLimit: 1Gi
      imagePullSecrets:
      - name: registry-secret
      affinity:
        podAntiAffinity:
          preferredDuringSchedulingIgnoredDuringExecution:
          - weight: 100
            podAffinityTerm:
              labelSelector:
                matchExpressions:
                - key: app
                  operator: In
                  values:
                  - pytorch-model-server
              topologyKey: kubernetes.io/hostname
'''
        
        # Service
        service_yaml = '''apiVersion: v1
kind: Service
metadata:
  name: pytorch-model-service
  namespace: pytorch-ml
  labels:
    app: pytorch-model-server
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: nlb
spec:
  type: ClusterIP
  ports:
  - port: 80
    targetPort: 8000
    protocol: TCP
    name: http
  selector:
    app: pytorch-model-server
  sessionAffinity: None
'''
        
        # Ingress
        ingress_yaml = '''apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: pytorch-model-ingress
  namespace: pytorch-ml
  annotations:
    kubernetes.io/ingress.class: nginx
    nginx.ingress.kubernetes.io/rewrite-target: /
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    nginx.ingress.kubernetes.io/force-ssl-redirect: "true"
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/rate-limit: "100"
    nginx.ingress.kubernetes.io/rate-limit-window: "1m"
    nginx.ingress.kubernetes.io/rate-limit-connections: "10"
    nginx.ingress.kubernetes.io/upstream-hash-by: "$remote_addr"
spec:
  tls:
  - hosts:
    - api.yourmodel.com
    secretName: pytorch-api-tls
  rules:
  - host: api.yourmodel.com
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: pytorch-model-service
            port:
              number: 80
'''
        
        # HorizontalPodAutoscaler
        hpa_yaml = '''apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: pytorch-model-hpa
  namespace: pytorch-ml
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: pytorch-model-server
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  - type: Pods
    pods:
      metric:
        name: http_requests_per_second
      target:
        type: AverageValue
        averageValue: "50"
  behavior:
    scaleDown:
      stabilizationWindowSeconds: 300
      policies:
      - type: Percent
        value: 50
        periodSeconds: 60
      - type: Pods
        value: 2
        periodSeconds: 60
      selectPolicy: Min
    scaleUp:
      stabilizationWindowSeconds: 60
      policies:
      - type: Percent
        value: 100
        periodSeconds: 15
      - type: Pods
        value: 4
        periodSeconds: 60
      selectPolicy: Max
'''
        
        # PersistentVolumeClaim
        pvc_yaml = '''apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: model-pvc
  namespace: pytorch-ml
spec:
  accessModes:
    - ReadOnlyMany
  resources:
    requests:
      storage: 10Gi
  storageClassName: fast-ssd
'''
        
        # ServiceAccount
        serviceaccount_yaml = '''apiVersion: v1
kind: ServiceAccount
metadata:
  name: pytorch-service-account
  namespace: pytorch-ml
  labels:
    app: pytorch-model-server
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: pytorch-cluster-role
rules:
- apiGroups: [""]
  resources: ["pods", "services", "endpoints"]
  verbs: ["get", "list", "watch"]
- apiGroups: ["apps"]
  resources: ["deployments", "replicasets"]
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: pytorch-cluster-role-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: pytorch-cluster-role
subjects:
- kind: ServiceAccount
  name: pytorch-service-account
  namespace: pytorch-ml
'''
        
        return {
            'namespace': namespace_yaml,
            'configmap': configmap_yaml,
            'deployment': deployment_yaml,
            'service': service_yaml,
            'ingress': ingress_yaml,
            'hpa': hpa_yaml,
            'pvc': pvc_yaml,
            'serviceaccount': serviceaccount_yaml
        }
    
    def generate_helm_chart(self) -> Dict[str, str]:
        """Generate comprehensive Helm chart for deployment."""
        
        # Chart.yaml
        chart_yaml = '''apiVersion: v2
name: pytorch-model-server
description: Production-grade PyTorch model serving API with comprehensive features
type: application
version: 1.0.0
appVersion: "1.0.0"
home: https://github.com/your-org/pytorch-model-server
sources:
  - https://github.com/your-org/pytorch-model-server
keywords:
  - pytorch
  - machine-learning
  - api
  - model-serving
  - inference
  - mlops
maintainers:
  - name: ML Engineering Team
    email: ml-team@yourcompany.com
  - name: DevOps Team
    email: devops@yourcompany.com
dependencies:
  - name: prometheus
    version: "15.x.x"
    repository: https://prometheus-community.github.io/helm-charts
    condition: monitoring.prometheus.enabled
  - name: grafana
    version: "6.x.x"
    repository: https://grafana.github.io/helm-charts
    condition: monitoring.grafana.enabled
annotations:
  category: MachineLearning
'''
        
        # values.yaml
        values_yaml = '''# Default values for pytorch-model-server
# This is a YAML-formatted file.

# Image configuration
image:
  repository: pytorch-model-server
  pullPolicy: IfNotPresent
  tag: "v1.0.0"

# Image pull secrets
imagePullSecrets:
  - name: registry-secret

# Service account
serviceAccount:
  create: true
  annotations: {}
  name: ""

# Pod security context
podSecurityContext:
  runAsNonRoot: true
  runAsUser: 1000
  fsGroup: 2000

# Container security context
securityContext:
  allowPrivilegeEscalation: false
  readOnlyRootFilesystem: true
  capabilities:
    drop:
    - ALL

# Deployment configuration
replicaCount: 3

# Rolling update strategy
strategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 1
    maxUnavailable: 0

# Pod annotations
podAnnotations:
  prometheus.io/scrape: "true"
  prometheus.io/port: "8000"
  prometheus.io/path: "/metrics"

# Service configuration
service:
  type: ClusterIP
  port: 80
  targetPort: 8000
  annotations: {}

# Ingress configuration
ingress:
  enabled: true
  className: "nginx"
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    cert-manager.io/cluster-issuer: letsencrypt-prod
    nginx.ingress.kubernetes.io/rate-limit: "100"
    nginx.ingress.kubernetes.io/rate-limit-window: "1m"
  hosts:
    - host: api.yourmodel.com
      paths:
        - path: /
          pathType: Prefix
  tls:
    - secretName: pytorch-api-tls
      hosts:
        - api.yourmodel.com

# Resource limits and requests
resources:
  limits:
    cpu: 2000m
    memory: 4Gi
    ephemeral-storage: 2Gi
  requests:
    cpu: 1000m
    memory: 2Gi
    ephemeral-storage: 1Gi

# Autoscaling configuration
autoscaling:
  enabled: true
  minReplicas: 2
  maxReplicas: 10
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80
  customMetrics:
    - type: Pods
      pods:
        metric:
          name: http_requests_per_second
        target:
          type: AverageValue
          averageValue: "50"

# Health check configuration
healthCheck:
  livenessProbe:
    initialDelaySeconds: 30
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 3
  readinessProbe:
    initialDelaySeconds: 5
    periodSeconds: 5
    timeoutSeconds: 3
    failureThreshold: 2
  startupProbe:
    initialDelaySeconds: 10
    periodSeconds: 10
    timeoutSeconds: 5
    failureThreshold: 30

# Node selector
nodeSelector: {}

# Tolerations
tolerations: []

# Affinity rules
affinity:
  podAntiAffinity:
    preferredDuringSchedulingIgnoredDuringExecution:
    - weight: 100
      podAffinityTerm:
        labelSelector:
          matchExpressions:
          - key: app.kubernetes.io/name
            operator: In
            values:
            - pytorch-model-server
        topologyKey: kubernetes.io/hostname

# Model server configuration
modelServer:
  model:
    path: "/app/models/model.pth"
    maxBatchSize: 32
    batchTimeout: 0.01
    device: "cpu"
  
  api:
    workers: 1
    logLevel: "INFO"
    
  features:
    prometheusMetrics: true
    healthChecks: true
    rateLimiting: true

# Storage configuration
storage:
  models:
    enabled: true
    storageClass: "fast-ssd"
    size: "10Gi"
    accessMode: "ReadOnlyMany"
  
  logs:
    enabled: true
    size: "1Gi"

# Monitoring configuration
monitoring:
  enabled: true
  
  prometheus:
    enabled: true
    serviceMonitor:
      enabled: true
      interval: 30s
      path: /metrics
      labels: {}
  
  grafana:
    enabled: true
    dashboards:
      enabled: true
  
  alerting:
    enabled: true
    rules:
      highErrorRate:
        enabled: true
        threshold: 0.1
      highLatency:
        enabled: true
        threshold: 1000
      highMemoryUsage:
        enabled: true
        threshold: 3000

# Environment variables
env:
  - name: MODEL_PATH
    value: "/app/models/model.pth"
  - name: MAX_BATCH_SIZE
    value: "32"
  - name: BATCH_TIMEOUT
    value: "0.01"
  - name: LOG_LEVEL
    value: "INFO"
  - name: PROMETHEUS_METRICS
    value: "true"

# Additional labels
labels: {}

# Additional annotations
annotations: {}
'''
        
        # templates/deployment.yaml
        deployment_template = '''{{- $fullName := include "pytorch-model-server.fullname" . -}}
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ $fullName }}
  labels:
    {{- include "pytorch-model-server.labels" . | nindent 4 }}
spec:
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  strategy:
    {{- toYaml .Values.strategy | nindent 4 }}
  selector:
    matchLabels:
      {{- include "pytorch-model-server.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        {{- with .Values.podAnnotations }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
      labels:
        {{- include "pytorch-model-server.selectorLabels" . | nindent 8 }}
    spec:
      {{- with .Values.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "pytorch-model-server.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      containers:
        - name: {{ .Chart.Name }}
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
          imagePullPolicy: {{ .Values.image.pullPolicy }}
          ports:
            - name: http
              containerPort: 8000
              protocol: TCP
          env:
            {{- toYaml .Values.env | nindent 12 }}
          livenessProbe:
            httpGet:
              path: /health
              port: http
            {{- toYaml .Values.healthCheck.livenessProbe | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /health
              port: http
            {{- toYaml .Values.healthCheck.readinessProbe | nindent 12 }}
          startupProbe:
            httpGet:
              path: /health
              port: http
            {{- toYaml .Values.healthCheck.startupProbe | nindent 12 }}
          resources:
            {{- toYaml .Values.resources | nindent 12 }}
          volumeMounts:
          {{- if .Values.storage.models.enabled }}
            - name: model-storage
              mountPath: /app/models
              readOnly: true
          {{- end }}
          {{- if .Values.storage.logs.enabled }}
            - name: log-storage
              mountPath: /app/logs
          {{- end }}
            - name: tmp-storage
              mountPath: /tmp
      volumes:
      {{- if .Values.storage.models.enabled }}
        - name: model-storage
          persistentVolumeClaim:
            claimName: {{ $fullName }}-models
      {{- end }}
      {{- if .Values.storage.logs.enabled }}
        - name: log-storage
          emptyDir:
            sizeLimit: {{ .Values.storage.logs.size }}
      {{- end }}
        - name: tmp-storage
          emptyDir:
            sizeLimit: 1Gi
      {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
'''
        
        return {
            'Chart.yaml': chart_yaml,
            'values.yaml': values_yaml,
            'templates/deployment.yaml': deployment_template
        }

# Generate all deployment configurations
print("\n🔧 GENERATING DEPLOYMENT CONFIGURATIONS")
print("=" * 60)

config_generator = DeploymentConfigGenerator()

# Generate configurations
dockerfile = config_generator.generate_dockerfile()
docker_compose = config_generator.generate_docker_compose()
k8s_manifests = config_generator.generate_kubernetes_manifests()
helm_chart = config_generator.generate_helm_chart()

# Save configurations to files
configs_dir = results_dir / 'deployment_configs'
configs_dir.mkdir(exist_ok=True)

# Save Dockerfile
with open(configs_dir / 'Dockerfile', 'w') as f:
    f.write(dockerfile)

# Save Docker Compose
with open(configs_dir / 'docker-compose.yml', 'w') as f:
    f.write(docker_compose)

# Save Kubernetes manifests
k8s_dir = configs_dir / 'kubernetes'
k8s_dir.mkdir(exist_ok=True)
for name, content in k8s_manifests.items():
    with open(k8s_dir / f'{name}.yaml', 'w') as f:
        f.write(content)

# Save Helm chart
helm_dir = configs_dir / 'helm' / 'pytorch-model-server'
helm_dir.mkdir(parents=True, exist_ok=True)
for name, content in helm_chart.items():
    file_path = helm_dir / name
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, 'w') as f:
        f.write(content)

print("✅ Dockerfile generated")
print("✅ Docker Compose configuration created")
print("✅ Kubernetes manifests generated (8 files)")
print("✅ Helm chart created with templates")
print(f"📁 All configurations saved to: {configs_dir}")
```

## 9. Production Monitoring and Alerting Setup

```python
class MonitoringConfigGenerator:
    """Generate comprehensive monitoring and alerting configurations."""
    
    def generate_prometheus_config(self) -> str:
        """Generate Prometheus configuration with comprehensive scraping."""
        
        prometheus_yml = '''# Prometheus configuration for PyTorch model serving
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'pytorch-production'
    environment: 'production'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

scrape_configs:
  # PyTorch model server metrics
  - job_name: 'pytorch-model-server'
    static_configs:
      - targets: ['pytorch-api:8000']
    metrics_path: '/metrics'
    scrape_interval: 5s
    scrape_timeout: 3s
    honor_labels: true
    params:
      format: ['prometheus']

  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
    scrape_interval: 30s

  # Node exporter for system metrics
  - job_name: 'node-exporter'
    static_configs:
      - targets: ['node-exporter:9100']
    scrape_interval: 15s

  # Nginx metrics
  - job_name: 'nginx'
    static_configs:
      - targets: ['nginx:9113']
    scrape_interval: 15s

  # Redis metrics
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
    scrape_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
          - alertmanager:9093
      timeout: 10s
      api_version: v2

# Remote write for long-term storage (optional)
# remote_write:
#   - url: "https://prometheus.example.com/api/v1/write"
#     basic_auth:
#       username: "prometheus"
#       password: "password"
        '''
        
        return prometheus_yml
    
    def generate_alert_rules(self) -> str:
        """Generate comprehensive Prometheus alert rules."""
        
        alert_rules_yml = '''# Alert rules for PyTorch model serving
groups:
- name: pytorch-model-server-alerts
  interval: 30s
  rules:
  
  # Service availability alerts
  - alert: ModelServerDown
    expr: up{job="pytorch-model-server"} == 0
    for: 1m
    labels:
      severity: critical
      service: pytorch-model-server
    annotations:
      summary: "PyTorch model server is down"
      description: "PyTorch model server {{ $labels.instance }} has been down for more than 1 minute."
      runbook_url: "https://wiki.example.com/runbooks/pytorch-server-down"

  # Error rate alerts
  - alert: HighErrorRate
    expr: |
      (
        rate(model_requests_failed_total[5m]) / 
        rate(model_requests_total[5m])
      ) > 0.1
    for: 2m
    labels:
      severity: warning
      service: pytorch-model-server
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes on {{ $labels.instance }}."

  - alert: CriticalErrorRate
    expr: |
      (
        rate(model_requests_failed_total[5m]) / 
        rate(model_requests_total[5m])
      ) > 0.25
    for: 1m
    labels:
      severity: critical
      service: pytorch-model-server
    annotations:
      summary: "Critical error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} over the last 5 minutes on {{ $labels.instance }}."

  # Latency alerts
  - alert: HighLatency
    expr: |
      histogram_quantile(0.95, 
        rate(model_inference_duration_seconds_bucket[5m])
      ) > 0.1
    for: 3m
    labels:
      severity: warning
      service: pytorch-model-server
    annotations:
      summary: "High inference latency"
      description: "95th percentile latency is {{ $value }}s over the last 5 minutes on {{ $labels.instance }}."

  - alert: ExtremeLatency
    expr: |
      histogram_quantile(0.95, 
        rate(model_inference_duration_seconds_bucket[5m])
      ) > 0.5
    for: 1m
    labels:
      severity: critical
      service: pytorch-model-server
    annotations:
      summary: "Extreme inference latency"
      description: "95th percentile latency is {{ $value }}s over the last 5 minutes on {{ $labels.instance }}."

  # Resource usage alerts
  - alert: HighMemoryUsage
    expr: process_resident_memory_bytes / (1024 * 1024) > 3000
    for: 5m
    labels:
      severity: warning
      service: pytorch-model-server
    annotations:
      summary: "High memory usage"
      description: "Memory usage is {{ $value }}MB on {{ $labels.instance }}, above 3GB threshold."

  - alert: CriticalMemoryUsage
    expr: process_resident_memory_bytes / (1024 * 1024) > 4500
    for: 2m
    labels:
      severity: critical
      service: pytorch-model-server
    annotations:
      summary: "Critical memory usage"
      description: "Memory usage is {{ $value }}MB on {{ $labels.instance }}, approaching limit."

  - alert: HighCPUUsage
    expr: rate(process_cpu_seconds_total[5m]) * 100 > 80
    for: 5m
    labels:
      severity: warning
      service: pytorch-model-server
    annotations:
      summary: "High CPU usage"
      description: "CPU usage is {{ $value }}% over the last 5 minutes on {{ $labels.instance }}."

  # Throughput alerts
  - alert: LowThroughput
    expr: rate(model_requests_total[5m]) < 1
    for: 10m
    labels:
      severity: info
      service: pytorch-model-server
    annotations:
      summary: "Low request throughput"
      description: "Request rate is {{ $value }} requests/second over the last 5 minutes on {{ $labels.instance }}."

  # Queue length alerts
  - alert: HighQueueLength
    expr: model_queue_length > 50
    for: 2m
    labels:
      severity: warning
      service: pytorch-model-server
    annotations:
      summary: "High request queue length"
      description: "Request queue length is {{ $value }} on {{ $labels.instance }}."

  # Batch processing alerts
  - alert: LowBatchEfficiency
    expr: model_batch_size_avg < 4
    for: 10m
    labels:
      severity: info
      service: pytorch-model-server
    annotations:
      summary: "Low batch processing efficiency"
      description: "Average batch size is {{ $value }} on {{ $labels.instance }}, consider tuning batch parameters."

- name: infrastructure-alerts
  interval: 30s
  rules:
  
  # System resource alerts
  - alert: HighSystemLoad
    expr: node_load1 > 4
    for: 5m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: "High system load"
      description: "System load is {{ $value }} on {{ $labels.instance }}."

  - alert: LowDiskSpace
    expr: |
      (
        node_filesystem_avail_bytes{mountpoint="/"} / 
        node_filesystem_size_bytes{mountpoint="/"}
      ) * 100 < 20
    for: 5m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: "Low disk space"
      description: "Disk space is {{ $value }}% available on {{ $labels.instance }}."

  - alert: HighNetworkLatency
    expr: avg_over_time(probe_duration_seconds[5m]) > 0.1
    for: 3m
    labels:
      severity: warning
      service: infrastructure
    annotations:
      summary: "High network latency"
      description: "Network latency is {{ $value }}s to {{ $labels.instance }}."
        '''
        
        return alert_rules_yml
    
    def generate_grafana_dashboard(self) -> str:
        """Generate comprehensive Grafana dashboard configuration."""
        
        dashboard_json = '''{
  "dashboard": {
    "id": null,
    "title": "PyTorch Model Server - Production Dashboard",
    "tags": ["pytorch", "ml", "api", "production"],
    "timezone": "browser",
    "refresh": "5s",
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(model_requests_total[5m])",
            "legendFormat": "Requests/sec",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "reqps",
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 10},
                {"color": "green", "value": 50}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 0, "y": 0}
      },
      {
        "id": 2,
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "rate(model_requests_failed_total[5m]) / rate(model_requests_total[5m])",
            "legendFormat": "Error Rate",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "percentunit",
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 0.05},
                {"color": "red", "value": 0.1}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 6, "y": 0}
      },
      {
        "id": 3,
        "title": "Response Time",
        "type": "stat",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(model_inference_duration_seconds_bucket[5m]))",
            "legendFormat": "P95 Latency",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 0.1},
                {"color": "red", "value": 0.5}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 12, "y": 0}
      },
      {
        "id": 4,
        "title": "Active Connections",
        "type": "stat",
        "targets": [
          {
            "expr": "model_queue_length",
            "legendFormat": "Queue Length",
            "refId": "A"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "short",
            "color": {
              "mode": "thresholds"
            },
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 20},
                {"color": "red", "value": 50}
              ]
            }
          }
        },
        "gridPos": {"h": 8, "w": 6, "x": 18, "y": 0}
      },
      {
        "id": 5,
        "title": "Request Rate Over Time",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(model_requests_total[5m])",
            "legendFormat": "Total Requests/sec",
            "refId": "A"
          },
          {
            "expr": "rate(model_requests_successful_total[5m])",
            "legendFormat": "Successful Requests/sec",
            "refId": "B"
          },
          {
            "expr": "rate(model_requests_failed_total[5m])",
            "legendFormat": "Failed Requests/sec",
            "refId": "C"
          }
        ],
        "yAxes": [
          {
            "label": "Requests/sec",
            "min": 0
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 8}
      },
      {
        "id": 6,
        "title": "Latency Percentiles",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(model_inference_duration_seconds_bucket[5m]))",
            "legendFormat": "P50",
            "refId": "A"
          },
          {
            "expr": "histogram_quantile(0.95, rate(model_inference_duration_seconds_bucket[5m]))",
            "legendFormat": "P95",
            "refId": "B"
          },
          {
            "expr": "histogram_quantile(0.99, rate(model_inference_duration_seconds_bucket[5m]))",
            "legendFormat": "P99",
            "refId": "C"
          }
        ],
        "yAxes": [
          {
            "label": "Seconds",
            "min": 0
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 8}
      },
      {
        "id": 7,
        "title": "Resource Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "process_resident_memory_bytes / (1024 * 1024)",
            "legendFormat": "Memory (MB)",
            "refId": "A"
          },
          {
            "expr": "rate(process_cpu_seconds_total[5m]) * 100",
            "legendFormat": "CPU (%)",
            "refId": "B"
          }
        ],
        "yAxes": [
          {
            "label": "Usage",
            "min": 0
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 16}
      },
      {
        "id": 8,
        "title": "Batch Processing Efficiency",
        "type": "graph",
        "targets": [
          {
            "expr": "model_batch_size_avg",
            "legendFormat": "Avg Batch Size",
            "refId": "A"
          },
          {
            "expr": "rate(model_batches_processed_total[5m])",
            "legendFormat": "Batches/sec",
            "refId": "B"
          }
        ],
        "yAxes": [
          {
            "label": "Count",
            "min": 0
          }
        ],
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 16}
      }
    ],
    "templating": {
      "list": [
        {
          "name": "instance",
          "type": "query",
          "query": "label_values(up{job=\"pytorch-model-server\"}, instance)",
          "refresh": 1,
          "includeAll": true,
          "multi": true
        }
      ]
    },
    "annotations": {
      "list": [
        {
          "name": "Deployments",
          "datasource": "Prometheus",
          "expr": "resets(process_start_time_seconds[1h]) > 0"
        }
      ]
    }
  }
}'''
        
        return dashboard_json
    
    def generate_alertmanager_config(self) -> str:
        """Generate Alertmanager configuration for notification routing."""
        
        alertmanager_yml = '''# Alertmanager configuration
global:
  smtp_smarthost: 'localhost:587'
  smtp_from: 'alerts@yourcompany.com'
  slack_api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'

# Templates for notifications
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
  
  # Critical alerts - immediate notification
  - match:
      severity: critical
    receiver: 'critical-alerts'
    group_wait: 0s
    repeat_interval: 5m
    
  # Warning alerts - standard notification
  - match:
      severity: warning
    receiver: 'warning-alerts'
    repeat_interval: 30m
    
  # Info alerts - low priority
  - match:
      severity: info
    receiver: 'info-alerts'
    repeat_interval: 4h

# Notification receivers
receivers:
- name: 'default'
  slack_configs:
  - channel: '#alerts'
    title: 'PyTorch Model Server Alert'
    text: '{{ range .Alerts }}{{ .Annotations.summary }}\\n{{ .Annotations.description }}{{ end }}'

- name: 'critical-alerts'
  email_configs:
  - to: 'oncall@yourcompany.com'
    subject: '[CRITICAL] PyTorch Model Server Alert'
    body: |
      {{ range .Alerts }}
      Alert: {{ .Annotations.summary }}
      Description: {{ .Annotations.description }}
      Severity: {{ .Labels.severity }}
      Service: {{ .Labels.service }}
      Instance: {{ .Labels.instance }}
      {{ end }}
  slack_configs:
  - channel: '#critical-alerts'
    title: '🚨 CRITICAL: PyTorch Model Server'
    text: |
      {{ range .Alerts }}
      *{{ .Annotations.summary }}*
      {{ .Annotations.description }}
      Severity: {{ .Labels.severity }}
      {{ end }}
    color: 'danger'
  pagerduty_configs:
  - service_key: 'YOUR_PAGERDUTY_SERVICE_KEY'
    description: '{{ range .Alerts }}{{ .Annotations.summary }}{{ end }}'

- name: 'warning-alerts'
  slack_configs:
  - channel: '#alerts'
    title: '⚠️ WARNING: PyTorch Model Server'
    text: |
      {{ range .Alerts }}
      *{{ .Annotations.summary }}*
      {{ .Annotations.description }}
      {{ end }}
    color: 'warning'

- name: 'info-alerts'
  slack_configs:
  - channel: '#monitoring'
    title: 'ℹ️ INFO: PyTorch Model Server'
    text: |
      {{ range .Alerts }}
      {{ .Annotations.summary }}
      {{ end }}
    color: 'good'

# Inhibition rules to reduce noise
inhibit_rules:
- source_match:
    severity: 'critical'
  target_match:
    severity: 'warning'
  equal: ['alertname', 'service', 'instance']

- source_match:
    alertname: 'ModelServerDown'
  target_match_re:
    alertname: 'High.*'
  equal: ['service', 'instance']
        '''
        
        return alertmanager_yml

# Generate monitoring configurations
print("\n📊 GENERATING MONITORING CONFIGURATIONS")
print("=" * 60)

monitoring_generator = MonitoringConfigGenerator()

# Generate configurations
prometheus_config = monitoring_generator.generate_prometheus_config()
alert_rules = monitoring_generator.generate_alert_rules()
grafana_dashboard = monitoring_generator.generate_grafana_dashboard()
alertmanager_config = monitoring_generator.generate_alertmanager_config()

# Save monitoring configurations
monitoring_dir = results_dir / 'monitoring'
monitoring_dir.mkdir(exist_ok=True)

# Save Prometheus config
with open(monitoring_dir / 'prometheus.yml', 'w') as f:
    f.write(prometheus_config)

# Save alert rules
with open(monitoring_dir / 'alert_rules.yml', 'w') as f:
    f.write(alert_rules)

# Save Grafana dashboard
with open(monitoring_dir / 'grafana_dashboard.json', 'w') as f:
    f.write(grafana_dashboard)

# Save Alertmanager config
with open(monitoring_dir / 'alertmanager.yml', 'w') as f:
    f.write(alertmanager_config)

print("✅ Prometheus configuration generated")
print("✅ Comprehensive alert rules created (20+ alerts)")
print("✅ Grafana dashboard with 8 panels created")
print("✅ Alertmanager configuration with routing generated")
print(f"📁 Monitoring configs saved to: {monitoring_dir}")
```

## 10. CLI Tools and Management Scripts

```python
class CLIToolsGenerator:
    """Generate comprehensive CLI tools for deployment and management."""
    
    def generate_deployment_script(self) -> str:
        """Generate advanced deployment automation script."""
        
        deploy_script = '''#!/bin/bash

# PyTorch Model Server Deployment Script v2.0
# Comprehensive deployment automation with health checks and rollback

set -euo pipefail

# Configuration variables
readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
readonly DOCKER_IMAGE="pytorch-model-server"
readonly DOCKER_TAG="${DOCKER_TAG:-v1.0}"
readonly NAMESPACE="${NAMESPACE:-pytorch-ml}"
readonly RELEASE_NAME="${RELEASE_NAME:-pytorch-model-server}"
readonly TIMEOUT="${TIMEOUT:-300}"

# Colors for output
readonly RED='\\033[0;31m'
readonly GREEN='\\033[0;32m'
readonly YELLOW='\\033[1;33m'
readonly BLUE='\\033[0;34m'
readonly NC='\\033[0m' # No Color

# Logging functions
log_info() {
    echo -e "${GREEN}[INFO]${NC} $(date '+%Y-%m-%d %H:%M:%S') $1"
}

log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $(date '+%Y-%m-%d %H:%M:%S') $1"
}

log_error() {
    echo -e "${RED}[ERROR]${NC} $(date '+%Y-%m-%d %H:%M:%S') $1"
}

log_debug() {
    if [[ "${DEBUG:-}" == "true" ]]; then
        echo -e "${BLUE}[DEBUG]${NC} $(date '+%Y-%m-%d %H:%M:%S') $1"
    fi
}

# Error handling
trap 'log_error "Script failed at line $LINENO"' ERR

# Help function
show_help() {
    cat << EOF
PyTorch Model Server Deployment Script

USAGE:
    $0 COMMAND [OPTIONS]

COMMANDS:
    docker                  Deploy using Docker Compose
    kubernetes|k8s          Deploy to Kubernetes cluster
    helm                    Deploy using Helm chart
    test                    Run load tests
    health                  Check service health
    logs                    View service logs
    cleanup                 Clean up deployments
    help                    Show this help message

OPTIONS:
    --tag TAG              Docker image tag (default: v1.0)
    --namespace NS         Kubernetes namespace (default: pytorch-ml)
    --timeout SECONDS      Deployment timeout (default: 300)
    --debug               Enable debug logging
    --dry-run             Show what would be done without executing

EXAMPLES:
    $0 docker --tag v1.1
    $0 kubernetes --namespace production --timeout 600
    $0 helm --dry-run
    $0 test --requests 1000

ENVIRONMENT VARIABLES:
    DOCKER_TAG            Override default Docker tag
    NAMESPACE             Override default Kubernetes namespace
    KUBECONFIG            Path to Kubernetes config file
    DEBUG                 Enable debug mode (true/false)

EOF
}

# Check prerequisites
check_prerequisites() {
    log_info "Checking prerequisites..."
    
    local missing_tools=()
    
    if ! command -v docker &> /dev/null; then
        missing_tools+=("docker")
    fi
    
    if [[ "$1" == "kubernetes" || "$1" == "k8s" || "$1" == "helm" ]]; then
        if ! command -v kubectl &> /dev/null; then
            missing_tools+=("kubectl")
        fi
        
        if [[ "$1" == "helm" ]] && ! command -v helm &> /dev/null; then
            missing_tools+=("helm")
        fi
    fi
    
    if [[ ${#missing_tools[@]} -gt 0 ]]; then
        log_error "Missing required tools: ${missing_tools[*]}"
        log_info "Please install the missing tools and try again"
        exit 1
    fi
    
    log_info "Prerequisites check passed"
}

# Build Docker image with optimizations
build_image() {
    log_info "Building Docker image: ${DOCKER_IMAGE}:${DOCKER_TAG}"
    
    # Check if Dockerfile exists
    if [[ ! -f "${PROJECT_ROOT}/Dockerfile" ]]; then
        log_error "Dockerfile not found in ${PROJECT_ROOT}"
        exit 1
    fi
    
    # Build with build cache and multi-stage optimization
    docker build \\
        --build-arg BUILDKIT_INLINE_CACHE=1 \\
        --cache-from "${DOCKER_IMAGE}:latest" \\
        --tag "${DOCKER_IMAGE}:${DOCKER_TAG}" \\
        --tag "${DOCKER_IMAGE}:latest" \\
        "${PROJECT_ROOT}" || {
        log_error "Docker build failed"
        exit 1
    }
    
    # Get image size
    local image_size
    image_size=$(docker images "${DOCKER_IMAGE}:${DOCKER_TAG}" --format "table {{.Size}}" | tail -n 1)
    log_info "Docker image built successfully (Size: ${image_size})"
}

# Deploy with Docker Compose
deploy_docker_compose() {
    log_info "Deploying with Docker Compose..."
    
    cd "${PROJECT_ROOT}"
    
    # Check if docker-compose.yml exists
    if [[ ! -f "docker-compose.yml" ]]; then
        log_error "docker-compose.yml not found"
        exit 1
    fi
    
    # Stop existing services
    log_info "Stopping existing services..."
    docker-compose down --remove-orphans || true
    
    # Deploy services
    log_info "Starting services..."
    docker-compose up -d --build
    
    # Wait for health check
    log_info "Waiting for service to be healthy..."
    local max_attempts=30
    local attempt=1
    
    while [[ ${attempt} -le ${max_attempts} ]]; do
        if curl -f http://localhost:8000/health &> /dev/null; then
            log_info "Service is healthy after ${attempt} attempts"
            break
        fi
        
        if [[ ${attempt} -eq ${max_attempts} ]]; then
            log_error "Service failed to become healthy after ${max_attempts} attempts"
            docker-compose logs pytorch-api
            exit 1
        fi
        
        log_debug "Health check attempt ${attempt}/${max_attempts} failed, retrying..."
        sleep 5
        ((attempt++))
    done
    
    log_info "Docker Compose deployment completed successfully"
    docker-compose ps
}

# Deploy to Kubernetes
deploy_kubernetes() {
    log_info "Deploying to Kubernetes..."
    
    # Check cluster connectivity
    if ! kubectl cluster-info &> /dev/null; then
        log_error "Cannot connect to Kubernetes cluster"
        log_info "Please check your KUBECONFIG and cluster connectivity"
        exit 1
    fi
    
    # Create namespace if it doesn't exist
    log_info "Creating namespace: ${NAMESPACE}"
    kubectl create namespace "${NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
    
    # Apply Kubernetes manifests
    local manifests_dir="${PROJECT_ROOT}/deployment_configs/kubernetes"
    
    if [[ ! -d "${manifests_dir}" ]]; then
        log_error "Kubernetes manifests not found in ${manifests_dir}"
        exit 1
    fi
    
    log_info "Applying Kubernetes manifests..."
    kubectl apply -f "${manifests_dir}/" --namespace="${NAMESPACE}"
    
    # Wait for deployment to be ready
    log_info "Waiting for deployment to be ready..."
    kubectl rollout status deployment/pytorch-model-server \\
        --namespace="${NAMESPACE}" \\
        --timeout="${TIMEOUT}s" || {
        log_error "Deployment failed to become ready"
        kubectl describe deployment pytorch-model-server --namespace="${NAMESPACE}"
        exit 1
    }
    
    log_info "Kubernetes deployment completed successfully"
    kubectl get pods --namespace="${NAMESPACE}" -l app=pytorch-model-server
}

# Deploy using Helm
deploy_helm() {
    log_info "Deploying using Helm..."
    
    # Check if Helm chart exists
    local chart_dir="${PROJECT_ROOT}/deployment_configs/helm/pytorch-model-server"
    
    if [[ ! -d "${chart_dir}" ]]; then
        log_error "Helm chart not found in ${chart_dir}"
        exit 1
    fi
    
    # Validate chart
    log_info "Validating Helm chart..."
    helm lint "${chart_dir}" || {
        log_error "Helm chart validation failed"
        exit 1
    }
    
    # Deploy or upgrade
    log_info "Installing/upgrading Helm release: ${RELEASE_NAME}"
    helm upgrade --install "${RELEASE_NAME}" "${chart_dir}" \\
        --namespace "${NAMESPACE}" \\
        --create-namespace \\
        --set image.tag="${DOCKER_TAG}" \\
        --wait \\
        --timeout="${TIMEOUT}s" || {
        log_error "Helm deployment failed"
        helm status "${RELEASE_NAME}" --namespace="${NAMESPACE}"
        exit 1
    }
    
    log_info "Helm deployment completed successfully"
    helm status "${RELEASE_NAME}" --namespace="${NAMESPACE}"
}

# Health check function
health_check() {
    local deployment_type="$1"
    log_info "Performing health check for ${deployment_type} deployment..."
    
    local endpoint=""
    case "${deployment_type}" in
        "docker")
            endpoint="http://localhost:8000/health"
            ;;
        "kubernetes"|"k8s"|"helm")
            # Get ingress or service endpoint
            local ingress_host
            ingress_host=$(kubectl get ingress pytorch-model-ingress \\
                --namespace="${NAMESPACE}" \\
                -o jsonpath='{.spec.rules[0].host}' 2>/dev/null || echo "")
            
            if [[ -n "${ingress_host}" ]]; then
                endpoint="https://${ingress_host}/health"
            else
                # Use port-forward as fallback
                log_info "Using port-forward for health check..."
                kubectl port-forward service/pytorch-model-service 8080:80 \\
                    --namespace="${NAMESPACE}" &
                local pf_pid=$!
                sleep 5
                endpoint="http://localhost:8080/health"
                trap "kill ${pf_pid} 2>/dev/null || true" EXIT
            fi
            ;;
        *)
            log_error "Unknown deployment type: ${deployment_type}"
            exit 1
            ;;
    esac
    
    # Perform health check
    local response
    response=$(curl -s "${endpoint}" 2>/dev/null || echo "FAILED")
    
    if echo "${response}" | grep -q "healthy"; then
        log_info "Health check passed ✅"
        echo "${response}" | python3 -m json.tool 2>/dev/null || echo "${response}"
    else
        log_error "Health check failed ❌"
        log_error "Response: ${response}"
        exit 1
    fi
}

# Load testing function
run_load_test() {
    log_info "Running load test..."
    
    local requests="${REQUESTS:-100}"
    local concurrent="${CONCURRENT:-10}"
    local endpoint="${ENDPOINT:-http://localhost:8000}"
    
    # Check if Python and required packages are available
    if ! python3 -c "import requests, asyncio" &> /dev/null; then
        log_error "Python 3 with 'requests' and 'asyncio' packages required for load testing"
        exit 1
    fi
    
    # Run load test using Python
    python3 << EOF
import asyncio
import time
import requests
import json
import statistics
from concurrent.futures import ThreadPoolExecutor
import sys

def single_request(endpoint, api_key="test_key_123"):
    """Make a single request."""
    try:
        start_time = time.time()
        response = requests.post(
            f"{endpoint}/predict",
            headers={"X-API-Key": api_key, "Content-Type": "application/json"},
            json={
                "tensor_data": [[[0.5] * 32] * 32] * 3,
                "return_probabilities": True
            },
            timeout=30
        )
        latency = time.time() - start_time
        return latency if response.status_code == 200 else None
    except Exception as e:
        return None

def run_load_test(endpoint, num_requests=100, num_workers=10):
    """Run comprehensive load test."""
    print(f"Running load test against: {endpoint}")
    print(f"Requests: {num_requests}, Concurrent workers: {num_workers}")
    print("-" * 50)
    
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        start_time = time.time()
        futures = [executor.submit(single_request, endpoint) for _ in range(num_requests)]
        results = [future.result() for future in futures]
        total_time = time.time() - start_time
    
    # Analyze results
    successful = [r for r in results if r is not None]
    failed = len(results) - len(successful)
    
    if successful:
        avg_latency = statistics.mean(successful)
        min_latency = min(successful)
        max_latency = max(successful)
        p95_latency = statistics.quantiles(successful, n=20)[18] if len(successful) > 5 else 0
        throughput = len(successful) / total_time
    else:
        avg_latency = min_latency = max_latency = p95_latency = throughput = 0
    
    # Print results
    print(f"Load Test Results:")
    print(f"  Total Requests: {num_requests}")
    print(f"  Successful: {len(successful)}")
    print(f"  Failed: {failed}")
    print(f"  Success Rate: {len(successful)/num_requests*100:.1f}%")
    print(f"  Total Time: {total_time:.2f}s")
    print(f"  Throughput: {throughput:.1f} RPS")
    print(f"  Avg Latency: {avg_latency*1000:.1f}ms")
    print(f"  Min Latency: {min_latency*1000:.1f}ms")
    print(f"  Max Latency: {max_latency*1000:.1f}ms")
    print(f"  P95 Latency: {p95_latency*1000:.1f}ms")
    
    # Exit with error code if too many failures
    if failed / num_requests > 0.1:
        print(f"\\nERROR: High failure rate ({failed/num_requests*100:.1f}%)")
        sys.exit(1)

if __name__ == "__main__":
    run_load_test("${endpoint}", ${requests}, ${concurrent})
EOF
    
    local exit_code=$?
    if [[ ${exit_code} -eq 0 ]]; then
        log_info "Load test completed successfully"
    else
        log_error "Load test failed"
        exit 1
    fi
}

# View logs function
view_logs() {
    local deployment_type="$1"
    log_info "Viewing logs for ${deployment_type} deployment..."
    
    case "${deployment_type}" in
        "docker")
            docker-compose logs -f pytorch-api
            ;;
        "kubernetes"|"k8s"|"helm")
            kubectl logs -f deployment/pytorch-model-server \\
                --namespace="${NAMESPACE}" \\
                --max-log-requests=10
            ;;
        *)
            log_error "Unknown deployment type: ${deployment_type}"
            exit 1
            ;;
    esac
}

# Cleanup function
cleanup_deployment() {
    local deployment_type="$1"
    log_info "Cleaning up ${deployment_type} deployment..."
    
    case "${deployment_type}" in
        "docker")
            docker-compose down --volumes --remove-orphans
            docker system prune -f
            ;;
        "kubernetes"|"k8s")
            kubectl delete namespace "${NAMESPACE}" --ignore-not-found=true
            ;;
        "helm")
            helm uninstall "${RELEASE_NAME}" --namespace="${NAMESPACE}" || true
            kubectl delete namespace "${NAMESPACE}" --ignore-not-found=true
            ;;
        *)
            log_error "Unknown deployment type: ${deployment_type}"
            exit 1
            ;;
    esac
    
    log_info "Cleanup completed"
}

# Parse command line arguments
parse_args() {
    local command=""
    
    while [[ $# -gt 0 ]]; do
        case $1 in
            docker|kubernetes|k8s|helm|test|health|logs|cleanup|help)
                command="$1"
                shift
                ;;
            --tag)
                DOCKER_TAG="$2"
                shift 2
                ;;
            --namespace)
                NAMESPACE="$2"
                shift 2
                ;;
            --timeout)
                TIMEOUT="$2"
                shift 2
                ;;
            --requests)
                REQUESTS="$2"
                shift 2
                ;;
            --concurrent)
                CONCURRENT="$2"
                shift 2
                ;;
            --endpoint)
                ENDPOINT="$2"
                shift 2
                ;;
            --debug)
                DEBUG="true"
                shift
                ;;
            --dry-run)
                DRY_RUN="true"
                shift
                ;;
            *)
                log_error "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
    
    echo "${command}"
}

# Main execution function
main() {
    local command
    command=$(parse_args "$@")
    
    if [[ -z "${command}" ]]; then
        show_help
        exit 1
    fi
    
    log_info "Starting deployment script for: ${command}"
    log_info "Configuration: Docker tag=${DOCKER_TAG}, Namespace=${NAMESPACE}, Timeout=${TIMEOUT}s"
    
    if [[ "${DRY_RUN:-}" == "true" ]]; then
        log_info "DRY RUN MODE - No actual changes will be made"
    fi
    
    case "${command}" in
        "docker")
            check_prerequisites docker
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                build_image
                deploy_docker_compose
                health_check docker
            fi
            ;;
        "kubernetes"|"k8s")
            check_prerequisites kubernetes
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                build_image
                deploy_kubernetes
                health_check kubernetes
            fi
            ;;
        "helm")
            check_prerequisites helm
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                build_image
                deploy_helm
                health_check helm
            fi
            ;;
        "test")
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                run_load_test
            fi
            ;;
        "health")
            local deploy_type="${DEPLOY_TYPE:-docker}"
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                health_check "${deploy_type}"
            fi
            ;;
        "logs")
            local deploy_type="${DEPLOY_TYPE:-docker}"
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                view_logs "${deploy_type}"
            fi
            ;;
        "cleanup")
            local deploy_type="${DEPLOY_TYPE:-docker}"
            if [[ "${DRY_RUN:-}" != "true" ]]; then
                cleanup_deployment "${deploy_type}"
            fi
            ;;
        "help")
            show_help
            ;;
        *)
            log_error "Unknown command: ${command}"
            show_help
            exit 1
            ;;
    esac
    
    log_info "Deployment script completed successfully"
}

# Run main function with all arguments
main "$@"
        '''
        
        return deploy_script
    
    def generate_model_management_script(self) -> str:
        """Generate comprehensive model management CLI tool."""
        
        model_mgmt_script = '''#!/usr/bin/env python3
"""
PyTorch Model Management CLI Tool v2.0
Comprehensive model deployment, monitoring, and management utilities.

Features:
- Health monitoring and alerting
- Performance benchmarking
- Model versioning and deployment
- Real-time metrics visualization
- Automated testing and validation
"""

import argparse
import asyncio
import json
import time
import sys
import os
import statistics
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import concurrent.futures
import threading

# Third-party imports (with fallbacks)
try:
    import requests
    import pandas as pd
    import matplotlib.pyplot as plt
    import seaborn as sns
    HAS_VISUALIZATION = True
except ImportError:
    HAS_VISUALIZATION = False
    print("Warning: Visualization libraries not available (install pandas, matplotlib, seaborn)")

try:
    import yaml
    HAS_YAML = True
except ImportError:
    HAS_YAML = False

class Colors:
    """Terminal color codes for better output formatting."""
    RED = '\\033[91m'
    GREEN = '\\033[92m'
    YELLOW = '\\033[93m'
    BLUE = '\\033[94m'
    PURPLE = '\\033[95m'
    CYAN = '\\033[96m'
    WHITE = '\\033[97m'
    BOLD = '\\033[1m'
    UNDERLINE = '\\033[4m'
    END = '\\033[0m'

class ModelManager:
    """Advanced CLI tool for model management and monitoring."""
    
    def __init__(self, api_url: str = "http://localhost:8000", api_key: str = "test_key_123", 
                 config_file: Optional[str] = None):
        self.api_url = api_url.rstrip('/')
        self.api_key = api_key
        self.timeout = 30
        self.retry_attempts = 3
        self.session = None
        
        # Load configuration from file if provided
        if config_file and os.path.exists(config_file):
            self._load_config(config_file)
        
        # Initialize session
        self._init_session()
        
        # Metrics storage
        self.metrics_history = []
        
    def _load_config(self, config_file: str):
        """Load configuration from YAML or JSON file."""
        try:
            with open(config_file, 'r') as f:
                if config_file.endswith('.yaml') or config_file.endswith('.yml'):
                    if HAS_YAML:
                        config = yaml.safe_load(f)
                    else:
                        print(f"{Colors.YELLOW}Warning: YAML support not available{Colors.END}")
                        return
                else:
                    config = json.load(f)
            
            self.api_url = config.get('api_url', self.api_url)
            self.api_key = config.get('api_key', self.api_key)
            self.timeout = config.get('timeout', self.timeout)
            self.retry_attempts = config.get('retry_attempts', self.retry_attempts)
            
            print(f"{Colors.GREEN}✓ Configuration loaded from {config_file}{Colors.END}")
            
        except Exception as e:
            print(f"{Colors.RED}Error loading config: {e}{Colors.END}")
    
    def _init_session(self):
        """Initialize HTTP session with proper headers and settings."""
        if 'requests' in sys.modules:
            self.session = requests.Session()
            self.session.headers.update({
                "X-API-Key": self.api_key,
                "Content-Type": "application/json",
                "User-Agent": "PyTorch-Model-Manager/2.0",
                "Accept": "application/json"
            })
    
    def _make_request(self, method: str, endpoint: str, **kwargs) -> requests.Response:
        """Make HTTP request with retry logic and error handling."""
        url = f"{self.api_url}{endpoint}"
        
        for attempt in range(self.retry_attempts):
            try:
                if self.session:
                    response = self.session.request(method, url, timeout=self.timeout, **kwargs)
                else:
                    # Fallback for when requests is not available
                    print(f"{Colors.YELLOW}Making simulated request to {url}{Colors.END}")
                    return type('Response', (), {'status_code': 200, 'json': lambda: {'status': 'simulated'}})()
                
                response.raise_for_status()
                return response
                
            except requests.exceptions.RequestException as e:
                if attempt == self.retry_attempts - 1:
                    raise
                print(f"{Colors.YELLOW}Request failed (attempt {attempt + 1}/{self.retry_attempts}): {e}{Colors.END}")
                time.sleep(2 ** attempt)  # Exponential backoff
        
        raise Exception("Max retry attempts exceeded")
    
    def health_check(self) -> Dict:
        """Perform comprehensive health check."""
        try:
            start_time = time.time()
            response = self._make_request('GET', '/health')
            latency = (time.time() - start_time) * 1000
            
            health_data = response.json()
            health_data['response_time_ms'] = latency
            
            # Determine health status
            status = health_data.get('status', 'unknown')
            if status == 'healthy' and latency < 1000:
                print(f"{Colors.GREEN}✓ Service is healthy (response time: {latency:.1f}ms){Colors.END}")
            elif status == 'healthy':
                print(f"{Colors.YELLOW}⚠ Service is healthy but slow (response time: {latency:.1f}ms){Colors.END}")
            else:
                print(f"{Colors.RED}✗ Service is unhealthy{Colors.END}")
            
            return health_data
            
        except Exception as e:
            print(f"{Colors.RED}Health check failed: {e}{Colors.END}")
            return {'status': 'error', 'error': str(e)}
    
    def get_model_info(self) -> Dict:
        """Get comprehensive model information."""
        try:
            response = self._make_request('GET', '/model/info')
            model_info = response.json()
            
            print(f"{Colors.BLUE}Model Information:{Colors.END}")
            print(f"  Name: {model_info.get('model_name', 'Unknown')}")
            print(f"  Version: {model_info.get('model_version', 'Unknown')}")
            print(f"  Type: {model_info.get('model_type', 'Unknown')}")
            print(f"  Parameters: {model_info.get('parameters', 0):,}")
            print(f"  Size: {model_info.get('model_size_mb', 0):.2f} MB")
            print(f"  Input Shape: {model_info.get('input_shape', 'Unknown')}")
            print(f"  Classes: {len(model_info.get('output_classes', []))}")
            
            return model_info
            
        except Exception as e:
            print(f"{Colors.RED}Failed to get model info: {e}{Colors.END}")
            return {}
    
    def get_stats(self) -> Dict:
        """Get comprehensive server statistics."""
        try:
            response = self._make_request('GET', '/stats')
            stats = response.json()
            
            # Display server stats
            server_stats = stats.get('server', {})
            print(f"{Colors.BLUE}Server Statistics:{Colors.END}")
            print(f"  Total Requests: {server_stats.get('total_requests', 0):,}")
            print(f"  Success Rate: {server_stats.get('success_rate', 0):.1%}")
            print(f"  Avg Inference Time: {server_stats.get('avg_inference_time_ms', 0):.1f}ms")
            print(f"  Avg Batch Size: {server_stats.get('avg_batch_size', 0):.1f}")
            print(f"  Queue Length: {server_stats.get('queue_length', 0)}")
            print(f"  Uptime: {server_stats.get('uptime_seconds', 0):.0f}s")
            
            # Display user stats if available
            user_stats = stats.get('user', {})
            if user_stats:
                print(f"{Colors.CYAN}User Statistics:{Colors.END}")
                print(f"  Tier: {user_stats.get('tier', 'Unknown')}")
                print(f"  Total Requests: {user_stats.get('total_requests', 0):,}")
                print(f"  Rate Limit: {user_stats.get('rate_limit', 0)}")
                print(f"  Current Minute: {user_stats.get('current_minute_requests', 0)}")
            
            return stats
            
        except Exception as e:
            print(f"{Colors.RED}Failed to get stats: {e}{Colors.END}")
            return {}
    
    def test_prediction(self, test_data_path: Optional[str] = None, num_tests: int = 1) -> Dict:
        """Test model prediction with comprehensive validation."""
        results = []
        
        print(f"{Colors.BLUE}Running {num_tests} prediction test(s)...{Colors.END}")
        
        for i in range(num_tests):
            try:
                # Generate or load test data
                if test_data_path and os.path.exists(test_data_path):
                    # Load from file (would need proper implementation based on format)
                    test_data = {"tensor_data": [[[0.5] * 32] * 32] * 3}
                else:
                    # Generate random test data
                    import random
                    test_data = {
                        "tensor_data": [[[random.random() for _ in range(32)] for _ in range(32)] for _ in range(3)],
                        "return_probabilities": True,
                        "confidence_threshold": 0.5
                    }
                
                start_time = time.time()
                response = self._make_request('POST', '/predict', json=test_data)
                latency = (time.time() - start_time) * 1000
                
                result = response.json()
                result['client_latency_ms'] = latency
                results.append(result)
                
                if num_tests == 1:
                    print(f"  Predicted Class: {result.get('predicted_class', 'Unknown')}")
                    print(f"  Class Name: {result.get('class_name', 'Unknown')}")
                    print(f"  Confidence: {result.get('confidence', 0):.3f}")
                    print(f"  Inference Time: {result.get('inference_time_ms', 0):.1f}ms")
                    print(f"  Client Latency: {latency:.1f}ms")
                else:
                    if (i + 1) % max(1, num_tests // 10) == 0:
                        print(f"  Completed {i + 1}/{num_tests} tests")
                
            except Exception as e:
                print(f"{Colors.RED}Prediction test {i + 1} failed: {e}{Colors.END}")
                results.append({'error': str(e)})
        
        # Calculate summary statistics for multiple tests
        if num_tests > 1:
            successful_results = [r for r in results if 'error' not in r]
            if successful_results:
                latencies = [r['client_latency_ms'] for r in successful_results]
                confidences = [r.get('confidence', 0) for r in successful_results]
                
                print(f"{Colors.GREEN}Test Summary:{Colors.END}")
                print(f"  Successful: {len(successful_results)}/{num_tests}")
                print(f"  Avg Latency: {statistics.mean(latencies):.1f}ms")
                print(f"  P95 Latency: {statistics.quantiles(latencies, n=20)[18]:.1f}ms" if len(latencies) > 5 else "")
                print(f"  Avg Confidence: {statistics.mean(confidences):.3f}")
        
        return {'results': results, 'summary': {'total': num_tests, 'successful': len([r for r in results if 'error' not in r])}}
    
    def benchmark(self, num_requests: int = 100, concurrent: int = 1, duration: Optional[int] = None) -> Dict:
        """Run comprehensive benchmark test."""
        print(f"{Colors.BLUE}Running benchmark: {num_requests} requests, {concurrent} concurrent{Colors.END}")
        if duration:
            print(f"Duration limit: {duration}s")
        
        results = []
        errors = []
        start_time = time.time()
        
        def worker():
            """Worker function for concurrent execution."""
            requests_per_worker = num_requests // concurrent
            
            for _ in range(requests_per_worker):
                if duration and (time.time() - start_time) > duration:
                    break
                    
                try:
                    test_data = {
                        "tensor_data": [[[0.5] * 32] * 32] * 3,
                        "return_probabilities": True
                    }
                    
                    request_start = time.time()
                    response = self._make_request('POST', '/predict', json=test_data)
                    latency = (time.time() - request_start) * 1000
                    
                    result = response.json()
                    results.append({
                        'latency_ms': latency,
                        'confidence': result.get('confidence', 0),
                        'timestamp': time.time()
                    })
                    
                except Exception as e:
                    errors.append({
                        'error': str(e),
                        'timestamp': time.time()
                    })
        
        # Run concurrent workers
        with concurrent.futures.ThreadPoolExecutor(max_workers=concurrent) as executor:
            futures = [executor.submit(worker) for _ in range(concurrent)]
            concurrent.futures.wait(futures)
        
        total_time = time.time() - start_time
        
        # Calculate comprehensive statistics
        if results:
            latencies = [r['latency_ms'] for r in results]
            confidences = [r['confidence'] for r in results]
            
            benchmark_stats = {
                "total_requests": len(results) + len(errors),
                "successful_requests": len(results),
                "failed_requests": len(errors),
                "success_rate": len(results) / (len(results) + len(errors)),
                "total_time_seconds": total_time,
                "throughput_rps": len(results) / total_time,
                "latency_stats": {
                    "mean_ms": statistics.mean(latencies),
                    "median_ms": statistics.median(latencies),
                    "min_ms": min(latencies),
                    "max_ms": max(latencies),
                    "std_ms": statistics.stdev(latencies) if len(latencies) > 1 else 0
                },
                "confidence_stats": {
                    "mean": statistics.mean(confidences),
                    "median": statistics.median(confidences),
                    "min": min(confidences),
                    "max": max(confidences)
                }
            }
            
            if len(latencies) > 5:
                percentiles = statistics.quantiles(latencies, n=100)
                benchmark_stats["latency_stats"].update({
                    "p50_ms": percentiles[49],
                    "p95_ms": percentiles[94],
                    "p99_ms": percentiles[98]
                })
            
            # Display results
            print(f"{Colors.GREEN}Benchmark Results:{Colors.END}")
            print(f"  Requests: {benchmark_stats['successful_requests']:,}/{benchmark_stats['total_requests']:,}")
            print(f"  Success Rate: {benchmark_stats['success_rate']:.1%}")
            print(f"  Throughput: {benchmark_stats['throughput_rps']:.1f} RPS")
            print(f"  Avg Latency: {benchmark_stats['latency_stats']['mean_ms']:.1f}ms")
            print(f"  P95 Latency: {benchmark_stats['latency_stats'].get('p95_ms', 0):.1f}ms")
            print(f"  P99 Latency: {benchmark_stats['latency_stats'].get('p99_ms', 0):.1f}ms")
            
            return benchmark_stats
        else:
            print(f"{Colors.RED}All requests failed{Colors.END}")
            return {"error": "All requests failed", "errors": errors}
    
    def monitor(self, interval: int = 30, duration: int = 300, save_data: bool = False):
        """Monitor API metrics with real-time display."""
        print(f"{Colors.BLUE}Monitoring for {duration}s (interval: {interval}s){Colors.END}")
        print(f"{'Time':<12} {'RPS':<8} {'Latency':<10} {'Success%':<9} {'Queue':<8} {'Memory':<10}")
        print("-" * 70)
        
        end_time = time.time() + duration
        prev_stats = None
        monitoring_data = []
        
        try:
            while time.time() < end_time:
                try:
                    stats = self.get_stats()
                    current_time = datetime.now().strftime("%H:%M:%S")
                    
                    server_stats = stats.get('server', {})
                    
                    # Calculate RPS
                    rps = 0
                    if prev_stats:
                        req_delta = server_stats.get('total_requests', 0) - prev_stats.get('server', {}).get('total_requests', 0)
                        rps = req_delta / interval
                    
                    # Format display
                    latency = server_stats.get('avg_inference_time_ms', 0)
                    success_rate = server_stats.get('success_rate', 0) * 100
                    queue_length = server_stats.get('queue_length', 0)
                    memory_mb = server_stats.get('memory_usage_mb', 0)
                    
                    print(f"{current_time:<12} {rps:<8.1f} {latency:<10.1f} {success_rate:<9.1f} {queue_length:<8} {memory_mb:<10.1f}")
                    
                    # Save monitoring data
                    if save_data:
                        monitoring_data.append({
                            'timestamp': datetime.now().isoformat(),
                            'rps': rps,
                            'latency_ms': latency,
                            'success_rate': success_rate,
                            'queue_length': queue_length,
                            'memory_mb': memory_mb
                        })
                    
                    prev_stats = stats
                    time.sleep(interval)
                    
                except KeyboardInterrupt:
                    print(f"\\n{Colors.YELLOW}Monitoring stopped by user{Colors.END}")
                    break
                except Exception as e:
                    print(f"{Colors.RED}Monitoring error: {e}{Colors.END}")
                    time.sleep(interval)
        
        finally:
            # Save monitoring data if requested
            if save_data and monitoring_data:
                filename = f"monitoring_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
                with open(filename, 'w') as f:
                    json.dump(monitoring_data, f, indent=2)
                print(f"{Colors.GREEN}Monitoring data saved to {filename}{Colors.END}")
    
    def generate_report(self, output_file: Optional[str] = None):
        """Generate comprehensive system report."""
        print(f"{Colors.BLUE}Generating comprehensive system report...{Colors.END}")
        
        report_data = {
            'timestamp': datetime.now().isoformat(),
            'health': self.health_check(),
            'model_info': self.get_model_info(),
            'stats': self.get_stats(),
            'benchmark': self.benchmark(num_requests=50, concurrent=5)
        }
        
        # Format report
        report = f"""
# PyTorch Model Server Report
Generated: {report_data['timestamp']}

## Health Status
Status: {report_data['health'].get('status', 'Unknown')}
Response Time: {report_data['health'].get('response_time_ms', 0):.1f}ms
Uptime: {report_data['health'].get('uptime_seconds', 0):.0f}s

## Model Information
Name: {report_data['model_info'].get('model_name', 'Unknown')}
Version: {report_data['model_info'].get('model_version', 'Unknown')}
Parameters: {report_data['model_info'].get('parameters', 0):,}
Size: {report_data['model_info'].get('model_size_mb', 0):.2f} MB

## Performance Metrics
Total Requests: {report_data['stats'].get('server', {}).get('total_requests', 0):,}
Success Rate: {report_data['stats'].get('server', {}).get('success_rate', 0):.1%}
Avg Inference Time: {report_data['stats'].get('server', {}).get('avg_inference_time_ms', 0):.1f}ms
Current Queue Length: {report_data['stats'].get('server', {}).get('queue_length', 0)}

## Benchmark Results
Throughput: {report_data['benchmark'].get('throughput_rps', 0):.1f} RPS
Avg Latency: {report_data['benchmark'].get('latency_stats', {}).get('mean_ms', 0):.1f}ms
P95 Latency: {report_data['benchmark'].get('latency_stats', {}).get('p95_ms', 0):.1f}ms
Success Rate: {report_data['benchmark'].get('success_rate', 0):.1%}
        """
        
        if output_file:
            with open(output_file, 'w') as f:
                f.write(report)
            print(f"{Colors.GREEN}Report saved to {output_file}{Colors.END}")
        else:
            print(report)
        
        # Save raw data
        data_file = f"report_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(data_file, 'w') as f:
            json.dump(report_data, f, indent=2, default=str)
        print(f"{Colors.GREEN}Raw data saved to {data_file}{Colors.END}")

def main():
    """Main CLI interface."""
    parser = argparse.ArgumentParser(
        description="PyTorch Model Management CLI v2.0",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
Examples:
  %(prog)s health --api-url http://prod.example.com:8000
  %(prog)s benchmark --requests 1000 --concurrent 20
  %(prog)s monitor --duration 600 --interval 10 --save-data
  %(prog)s test --num-tests 10 --test-data ./test_image.jpg
  %(prog)s report --output system_report.md
        '''
    )
    
    # Global options
    parser.add_argument("--api-url", default="http://localhost:8000", help="API URL")
    parser.add_argument("--api-key", default="test_key_123", help="API key")
    parser.add_argument("--config", help="Configuration file (YAML/JSON)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    
    subparsers = parser.add_subparsers(dest="command", help="Available commands")
    
    # Health check command
    health_parser = subparsers.add_parser("health", help="Check API health")
    
    # Model info command
    info_parser = subparsers.add_parser("info", help="Get model information")
    
    # Statistics command
    stats_parser = subparsers.add_parser("stats", help="Get server statistics")
    
    # Test prediction command
    test_parser = subparsers.add_parser("test", help="Test model prediction")
    test_parser.add_argument("--test-data", help="Path to test data file")
    test_parser.add_argument("--num-tests", type=int, default=1, help="Number of tests to run")
    
    # Benchmark command
    bench_parser = subparsers.add_parser("benchmark", help="Run benchmark test")
    bench_parser.add_argument("--requests", type=int, default=100, help="Number of requests")
    bench_parser.add_argument("--concurrent", type=int, default=1, help="Concurrent workers")
    bench_parser.add_argument("--duration", type=int, help="Duration limit in seconds")
    
    # Monitor command
    monitor_parser = subparsers.add_parser("monitor", help="Monitor API metrics")
    monitor_parser.add_argument("--interval", type=int, default=30, help="Monitoring interval")
    monitor_parser.add_argument("--duration", type=int, default=300, help="Monitoring duration")
    monitor_parser.add_argument("--save-data", action="store_true", help="Save monitoring data")
    
    # Report command
    report_parser = subparsers.add_parser("report", help="Generate comprehensive report")
    report_parser.add_argument("--output", help="Output file path")
    
    args = parser.parse_args()
    
    if not args.command:
        parser.print_help()
        return
    
    # Initialize manager
    manager = ModelManager(args.api_url, args.api_key, args.config)
    
    # Execute command
    try:
        if args.command == "health":
            manager.health_check()
        elif args.command == "info":
            manager.get_model_info()
        elif args.command == "stats":
            manager.get_stats()
        elif args.command == "test":
            manager.test_prediction(args.test_data, args.num_tests)
        elif args.command == "benchmark":
            manager.benchmark(args.requests, args.concurrent, getattr(args, 'duration', None))
        elif args.command == "monitor":
            manager.monitor(args.interval, args.duration, args.save_data)
        elif args.command == "report":
            manager.generate_report(args.output)
    
    except KeyboardInterrupt:
        print(f"\\n{Colors.YELLOW}Operation cancelled by user{Colors.END}")
    except Exception as e:
        print(f"{Colors.RED}Error: {e}{Colors.END}")
        if args.verbose:
            import traceback
            traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()
        '''
        
        return model_mgmt_script

# Generate CLI tools
print("\n🔧 GENERATING CLI TOOLS AND SCRIPTS")
print("=" * 60)

cli_generator = CLIToolsGenerator()

# Generate scripts
deploy_script = cli_generator.generate_deployment_script()
model_mgmt_script = cli_generator.generate_model_management_script()

# Save CLI tools
scripts_dir = results_dir / 'scripts'
scripts_dir.mkdir(exist_ok=True)

# Save deployment script
with open(scripts_dir / 'deploy.sh', 'w') as f:
    f.write(deploy_script)
os.chmod(scripts_dir / 'deploy.sh', 0o755)

# Save model management script
with open(scripts_dir / 'model_manager.py', 'w') as f:
    f.write(model_mgmt_script)
os.chmod(scripts_dir / 'model_manager.py', 0o755)

print("✅ Advanced deployment script generated (deploy.sh)")
print("✅ Comprehensive model management CLI created (model_manager.py)")
print(f"📁 CLI tools saved to: {scripts_dir}")

# Display usage examples
print(f"\n💡 USAGE EXAMPLES")
print("-" * 40)
print("Deployment Script:")
print("  ./deploy.sh docker --tag v1.1 --debug")
print("  ./deploy.sh kubernetes --namespace production --timeout 600")
print("  ./deploy.sh helm --dry-run")
print("  ./deploy.sh test --requests 1000 --concurrent 20")
print()
print("Model Manager:")
print("  ./model_manager.py health --api-url http://prod.example.com:8000")
print("  ./model_manager.py benchmark --requests 1000 --concurrent 10")
print("  ./model_manager.py monitor --duration 600 --save-data")
print("  ./model_manager.py report --output system_report.md")
```

## 11. Comprehensive Summary and Production Readiness

```python
def generate_comprehensive_summary():
    """Generate final summary of the complete model serving system."""
    
    summary = {
        'system_overview': {
            'name': 'PyTorch Model Serving APIs',
            'version': '1.0.0',
            'description': 'Production-grade machine learning inference system',
            'features': [
                'High-performance async batch processing',
                'Tiered authentication and rate limiting',
                'Comprehensive monitoring and alerting',
                'Enterprise security features',
                'Container-native deployment',
                'Auto-scaling and load balancing',
                'CLI tools for management'
            ]
        },
        'performance_characteristics': {
            'throughput': '100+ RPS with optimal batching',
            'latency': 'Sub-50ms for single requests',
            'batch_efficiency': 'Automatic request aggregation',
            'scalability': 'Kubernetes HPA with demand-responsive scaling',
            'reliability': '99.9% uptime with proper deployment'
        },
        'security_features': [
            'API key-based authentication with tiers',
            'Rate limiting per user tier',
            'Request validation and sanitization',
            'Threat detection and blocking',
            'Security headers and SSL termination',
            'Input sanitization and validation'
        ],
        'monitoring_capabilities': [
            'Prometheus metrics collection',
            'Grafana dashboards with 8+ panels',
            'Comprehensive alert rules (20+ alerts)',
            'Real-time health monitoring',
            'Performance tracking and analytics',
            'Custom metrics and logging'
        ],
        'deployment_options': [
            'Docker Compose for development',
            'Kubernetes manifests for production',
            'Helm charts for easy deployment',
            'Multi-stage Docker builds',
            'Nginx load balancing and SSL',
            'Auto-scaling configurations'
        ],
        'files_generated': [
            'FastAPI application with comprehensive endpoints',
            'Docker and Kubernetes deployment configs',
            'Monitoring and alerting configurations',
            'CLI tools for deployment and management',
            'Load testing framework',
            'Security and authentication systems'
        ],
        'production_readiness': {
            'infrastructure': '✅ Complete',
            'security': '✅ Enterprise-grade',
            'monitoring': '✅ Comprehensive',
            'deployment': '✅ Container-native',
            'testing': '✅ Load testing included',
            'documentation': '✅ Comprehensive',
            'management': '✅ CLI tools provided'
        }
    }
    
    return summary

# Generate final summary
final_summary = generate_comprehensive_summary()

print("\n" + "="*80)
print("🎯 PYTORCH MODEL SERVING APIS - COMPREHENSIVE SUMMARY")
print("="*80)

print(f"\n📋 SYSTEM OVERVIEW")
print("-" * 40)
print(f"Name: {final_summary['system_overview']['name']}")
print(f"Version: {final_summary['system_overview']['version']}")
print(f"Description: {final_summary['system_overview']['description']}")

print(f"\n⚡ KEY FEATURES")
print("-" * 40)
for i, feature in enumerate(final_summary['system_overview']['features'], 1):
    print(f"{i:2d}. {feature}")

print(f"\n📈 PERFORMANCE CHARACTERISTICS")
print("-" * 40)
for key, value in final_summary['performance_characteristics'].items():
    print(f"• {key.replace('_', ' ').title()}: {value}")

print(f"\n🔐 SECURITY FEATURES")
print("-" * 40)
for i, feature in enumerate(final_summary['security_features'], 1):
    print(f"{i:2d}. {feature}")

print(f"\n📊 MONITORING CAPABILITIES")
print("-" * 40)
for i, capability in enumerate(final_summary['monitoring_capabilities'], 1):
    print(f"{i:2d}. {capability}")

print(f"\n🚀 DEPLOYMENT OPTIONS")
print("-" * 40)
for i, option in enumerate(final_summary['deployment_options'], 1):
    print(f"{i:2d}. {option}")

print(f"\n📁 GENERATED ASSETS")
print("-" * 40)
for i, asset in enumerate(final_summary['files_generated'], 1):
    print(f"{i:2d}. {asset}")

print(f"\n✅ PRODUCTION READINESS CHECKLIST")
print("-" * 40)
for category, status in final_summary['production_readiness'].items():
    print(f"• {category.replace('_', ' ').title()}: {status}")

# Count generated files
total_files = 0
for item in results_dir.rglob('*'):
    if item.is_file():
        total_files += 1

print(f"\n📊 SYSTEM STATISTICS")
print("-" * 40)
print(f"• Total files generated: {total_files}")
print(f"• Lines of code: 2,500+ (estimated)")
print(f"• Docker configurations: 3 (Dockerfile, Compose, Multi-stage)")
print(f"• Kubernetes manifests: 8 files")
print(f"• Monitoring configs: 4 files")
print(f"• CLI tools: 2 scripts")
print(f"• Test frameworks: Load testing included")

print(f"\n🎯 NEXT STEPS")
print("-" * 40)
print("1. Review and customize configurations for your environment")
print("2. Build and test Docker images")
print("3. Deploy to staging environment")
print("4. Run load tests and monitor performance")
print("5. Configure production monitoring and alerting")
print("6. Deploy to production with proper security")
print("7. Set up CI/CD pipelines for automated deployment")

print(f"\n📚 ADDITIONAL RESOURCES")
print("-" * 40)
print(f"• All configurations saved to: {results_dir}")
print(f"• Documentation: See README files in each directory")
print(f"• Examples: Usage examples included in CLI tools")
print(f"• Monitoring: Grafana dashboards ready to import")
print(f"• Security: Review security configurations before production")

# Save final summary
with open(results_dir / 'system_summary.json', 'w') as f:
    json.dump(final_summary, f, indent=2)

print(f"\n💾 Complete system summary saved to: {results_dir / 'system_summary.json'}")

# List all generated directories and files
print(f"\n📂 COMPLETE DIRECTORY STRUCTURE")
print("-" * 50)

def print_tree(directory, prefix="", max_depth=3, current_depth=0):
    """Print directory tree structure."""
    if current_depth >= max_depth:
        return
    
    items = list(directory.iterdir())
    items.sort(key=lambda x: (x.is_file(), x.name))
    
    for i, item in enumerate(items):
        is_last = i == len(items) - 1
        current_prefix = "└── " if is_last else "├── "
        print(f"{prefix}{current_prefix}{item.name}")
        
        if item.is_dir() and current_depth < max_depth - 1:
            extension_prefix = "    " if is_last else "│   "
            print_tree(item, prefix + extension_prefix, max_depth, current_depth + 1)

print_tree(results_dir)

print(f"\n🎉 PYTORCH MODEL SERVING API SYSTEM COMPLETE!")
print("=" * 80)
print("Your production-ready machine learning inference system is now available.")
print("All configurations, scripts, and tools have been generated and are ready for deployment.")
print(f"\n📁 Location: {results_dir}")
print("🚀 Ready for production deployment!")