# In [None]:
# infrastructure_monitor.py - Comprehensive system monitoring
import json
import logging
import smtplib
import socket
import sqlite3
import subprocess
import threading
import time
from collections import defaultdict, deque
from datetime import datetime, timedelta
from email.mime.text import MIMEText

import psutil
import requests

class EnterpriseMonitor:
    def __init__(self, config_file='monitor_config.json'):
        self.config = self.load_config(config_file)
        self.logger = self.setup_logging()
        self.alerts_db = self.setup_database()
        self.metrics_history = defaultdict(lambda: deque(maxlen=100))
        self.alert_thresholds = self.config.get('thresholds', {})
        
    def load_config(self, config_file):
        """Load monitoring configuration"""
        try:
            with open(config_file, 'r') as f:
                return json.load(f)
        except FileNotFoundError:
            return {
                'check_interval': 60,
                'alert_email': 'admin@company.com',
                'smtp_server': 'smtp.company.com',
                'endpoints': [],
                'thresholds': {
                    'cpu_percent': 80,
                    'memory_percent': 85,
                    'disk_percent': 90,
                    'response_time': 5000
                }
            }
    
    def setup_logging(self):
        """Configure logging for monitoring system"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler('system_monitor.log'),
                logging.StreamHandler()
            ]
        )
        return logging.getLogger('InfrastructureMonitor')
    
    def setup_database(self):
        """Setup SQLite database for storing alerts and metrics"""
        conn = sqlite3.connect('monitoring.db', check_same_thread=False)
        cursor = conn.cursor()
        
        # Create tables
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS system_metrics (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                metric_type TEXT,
                metric_name TEXT,
                value REAL,
                hostname TEXT
            )
        ''')
        
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS alerts (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                alert_type TEXT,
                severity TEXT,
                message TEXT,
                resolved BOOLEAN DEFAULT FALSE,
                hostname TEXT
            )
        ''')
        
        conn.commit()
        return conn
    
    def collect_system_metrics(self):
        """Collect a snapshot of host metrics, persist it and return it.

        Samples CPU, memory, root-disk, load, network, process-count and
        boot-time figures through psutil and adds one usage percentage per
        partition reported by ``psutil.disk_partitions()``. Every numeric
        value is inserted into the ``system_metrics`` table and appended to
        ``self.metrics_history``.

        Returns:
            dict mapping metric name to its value. All values are numeric
            except ``boot_time``, which is a ``datetime`` and is therefore
            returned but not stored.
        """
        hostname = socket.gethostname()
        timestamp = datetime.now()

        # Take a single snapshot so sent/received belong to the same instant.
        # psutil documents that this can be None on hosts without interfaces.
        net_io = psutil.net_io_counters()

        metrics = {
            # interval=1 makes psutil block for about one second while sampling.
            'cpu_percent': psutil.cpu_percent(interval=1),
            'memory_percent': psutil.virtual_memory().percent,
            'disk_percent': psutil.disk_usage('/').percent,
            'load_average': psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else 0,
            'network_sent': net_io.bytes_sent if net_io is not None else 0,
            'network_recv': net_io.bytes_recv if net_io is not None else 0,
            'processes_count': len(psutil.pids()),
            'boot_time': datetime.fromtimestamp(psutil.boot_time()),
        }

        # Add disk space for all mounted drives
        for partition in psutil.disk_partitions():
            try:
                usage = psutil.disk_usage(partition.mountpoint)
            except OSError:
                # Covers PermissionError (a subclass) as well as mount points
                # that cannot be queried, e.g. a drive that is not ready.
                continue
            if usage.total <= 0:
                # Zero-size filesystems would otherwise divide by zero.
                continue
            device_key = partition.device.replace("/", "_")
            metrics[f'disk_{device_key}_percent'] = usage.used / usage.total * 100

        # Only numeric values are stored; this skips the boot_time datetime.
        numeric = [
            (name, value)
            for name, value in metrics.items()
            if isinstance(value, (int, float))
        ]

        # Store metrics in database
        cursor = self.alerts_db.cursor()
        cursor.executemany(
            'INSERT INTO system_metrics (metric_type, metric_name, value, hostname) VALUES (?, ?, ?, ?)',
            [('system', name, value, hostname) for name, value in numeric]
        )

        # Store in memory for trend analysis
        for name, value in numeric:
            self.metrics_history[name].append({
                'timestamp': timestamp,
                'value': value
            })

        self.alerts_db.commit()
        self.logger.info(f"Collected {len(metrics)} system metrics")
        return metrics
    
    def check_service_health(self, services):
        """Check health of critical services"""
        results = {}
        
        for service in services:
            try:
                if service['type'] == 'http':
                    start_time = time.time()
                    response = requests.get(
                        service['url'], 
                        timeout=service.get('timeout', 10),
                        headers=service.get('headers', {})
                    )
                    response_time = (time.time() - start_time) * 1000
                    
                    results[service['name']] = {
                        'status': 'healthy' if response.status_code == 200 else 'unhealthy',
                        'response_time': response_time,
                        'status_code': response.status_code
                    }
                    
                elif service['type'] == 'port':
                    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                    sock.settimeout(service.get('timeout', 5))
                    result = sock.connect_ex((service['host'], service['port']))
                    sock.close()
                    
                    results[service['name']] = {
                        'status': 'healthy' if result == 0 else 'unhealthy',
                        'port': service['port']
                    }
                    
                elif service['type'] == 'process':
                    process_running = any(
                        service['process_name'] in p.info['name'] 
                        for p in psutil.process_iter(['name'])
                    )
                    
                    results[service['name']] = {
                        'status': 'healthy' if process_running else 'unhealthy',
                        'process_name': service['process_name']
                    }
                    
            except Exception as e:
                results[service['name']] = {
                    'status': 'error',
                    'error': str(e)
                }
                self.logger.error(f"Error checking service {service['name']}: {str(e)}")
        
        return results
    
    def analyze_trends(self, metric_name, window_minutes=30):
        """Analyze metric trends for predictive alerting"""
        if metric_name not in self.metrics_history:
            return None
        
        history = list(self.metrics_history[metric_name])
        if len(history) < 5:
            return None
        
        # Filter to recent window
        cutoff_time = datetime.now() - timedelta(minutes=window_minutes)
        recent_data = [
            point for point in history 
            if point['timestamp'] >= cutoff_time
        ]
        
        if len(recent_data) < 3:
            return None
        
        values = [point['value'] for point in recent_data]
        
        # Simple trend analysis
        trend = {
            'current': values[-1],
            'average': sum(values) / len(values),
            'min': min(values),
            'max': max(values),
            'trend_direction': 'increasing' if values[-1] > values[0] else 'decreasing',
            'volatility': max(values) - min(values)
        }
        
        return trend
    
    def evaluate_alerts(self, metrics, service_health):
        """Evaluate metrics against thresholds and generate alerts"""
        alerts = []
        hostname = socket.gethostname()
        
        # System metric alerts
        for metric, value in metrics.items():
            if metric in self.alert_thresholds:
                threshold = self.alert_thresholds[metric]
                
                if isinstance(value, (int, float)) and value > threshold:
                    severity = 'critical' if value > threshold * 1.2 else 'warning'
                    
                    # Check if this is a recurring alert
                    trend = self.analyze_trends(metric)
                    trend_info = f" (Trend: {trend['trend_direction']})" if trend else ""
                    
                    alert = {
                        'type': 'system_metric',
                        'severity': severity,
                        'metric': metric,
                        'value': value,
                        'threshold': threshold,
                        'message': f"{metric} is {value:.2f}% (threshold: {threshold}%){trend_info}",
                        'hostname': hostname
                    }
                    alerts.append(alert)
        
        # Service health alerts
        for service_name, health in service_health.items():
            if health['status'] != 'healthy':
                alert = {
                    'type': 'service_health',
                    'severity': 'critical' if health['status'] == 'error' else 'warning',
                    'service': service_name,
                    'status': health['status'],
                    'message': f"Service {service_name} is {health['status']}",
                    'hostname': hostname
                }
                
                if 'response_time' in health:
                    response_threshold = self.alert_thresholds.get('response_time', 5000)
                    if health['response_time'] > response_threshold:
                        alert['message'] += f" (Response time: {health['response_time']:.0f}ms)"
                
                alerts.append(alert)
        
        # Store alerts in database
        if alerts:
            cursor = self.alerts_db.cursor()
            for alert in alerts:
                cursor.execute(
                    'INSERT INTO alerts (alert_type, severity, message, hostname) VALUES (?, ?, ?, ?)',
                    (alert['type'], alert['severity'], alert['message'], alert['hostname'])
                )
            self.alerts_db.commit()
        
        return alerts
    
    def send_alert_notifications(self, alerts):
        """Send alert notifications via email"""
        if not alerts:
            return
        
        critical_alerts = [a for a in alerts if a['severity'] == 'critical']
        warning_alerts = [a for a in alerts if a['severity'] == 'warning']
        
        subject = f"Infrastructure Alert - {len(critical_alerts)} Critical, {len(warning_alerts)} Warnings"
        
        body = f"""
        Infrastructure Monitoring Alert Report
        Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
        Hostname: {socket.gethostname()}
        
        CRITICAL ALERTS ({len(critical_alerts)}):
        {'=' * 50}
        """
        
        for alert in critical_alerts:
            body += f"- {alert['message']}\n"
        
        body += f"""
        
        WARNING ALERTS ({len(warning_alerts)}):
        {'=' * 50}
        """
        
        for alert in warning_alerts:
            body += f"- {alert['message']}\n"
        
        body += """
        
        This alert was generated automatically by the Enterprise Infrastructure Monitor.
        Please check the monitoring dashboard for detailed metrics and trends.
        """
        
        try:
            msg = MimeText(body)
            msg['Subject'] = subject
            msg['From'] = 'monitor@company.com'
            msg['To'] = self.config['alert_email']
            
            server = smtplib.SMTP(self.config['smtp_server'], 587)
            server.starttls()
            server.send_message(msg)
            server.quit()
            
            self.logger.info(f"Alert notification sent for {len(alerts)} alerts")
            
        except Exception as e:
            self.logger.error(f"Failed to send alert notification: {str(e)}")
    
    def generate_health_report(self):
        """Build a summary of recent metrics and alerts from the database.

        Returns:
            dict with ``timestamp`` (ISO string), ``hostname``,
            ``metrics_summary`` (average and maximum per metric over the last
            hour), ``alerts_summary`` (alert counts per type and severity over
            the last 24 hours), ``system_uptime`` (string) and
            ``monitoring_status``.
        """
        # Per-metric average and peak over the last hour.
        recent_metrics_sql = '''
            SELECT metric_name, AVG(value) as avg_value, MAX(value) as max_value
            FROM system_metrics 
            WHERE timestamp >= datetime('now', '-1 hour')
            GROUP BY metric_name
        '''

        # Alert counts by type and severity over the last day.
        recent_alerts_sql = '''
            SELECT alert_type, severity, COUNT(*) as count
            FROM alerts 
            WHERE timestamp >= datetime('now', '-24 hours')
            GROUP BY alert_type, severity
        '''

        db_cursor = self.alerts_db.cursor()

        db_cursor.execute(recent_metrics_sql)
        metric_rows = db_cursor.fetchall()

        db_cursor.execute(recent_alerts_sql)
        alert_rows = db_cursor.fetchall()

        report = {}
        report['timestamp'] = datetime.now().isoformat()
        report['hostname'] = socket.gethostname()

        metrics_summary = []
        for name, average, peak in metric_rows:
            metrics_summary.append({'metric': name, 'avg': average, 'max': peak})
        report['metrics_summary'] = metrics_summary

        alerts_summary = []
        for alert_type, severity, count in alert_rows:
            alerts_summary.append({'type': alert_type, 'severity': severity, 'count': count})
        report['alerts_summary'] = alerts_summary

        # Uptime is the time elapsed since the boot time reported by psutil.
        booted_at = datetime.fromtimestamp(psutil.boot_time())
        report['system_uptime'] = str(datetime.now() - booted_at)
        report['monitoring_status'] = 'active'

        return report
    
    def start_monitoring(self):
        """Start the monitoring loop"""
        self.logger.info("Starting enterprise infrastructure monitoring...")
        
        while True:
            try:
                # Collect system metrics
                metrics = self.collect_system_metrics()
                
                # Check service health
                services = self.config.get('services', [])
                service_health = self.check_service_health(services)
                
                # Evaluate alerts
                alerts = self.evaluate_alerts(metrics, service_health)
                
                # Send notifications if needed
                if alerts:
                    self.send_alert_notifications(alerts)
                
                # Log current status
                self.logger.info(
                    f"Monitoring cycle complete. "
                    f"Metrics: {len(metrics)}, "
                    f"Services: {len(service_health)}, "
                    f"Alerts: {len(alerts)}"
                )
                
                # Wait for next cycle
                time.sleep(self.config.get('check_interval', 60))
                
            except KeyboardInterrupt:
                self.logger.info("Monitoring stopped by user")
                break
            except Exception as e:
                self.logger.error(f"Error in monitoring loop: {str(e)}")
                time.sleep(30)  # Wait before retrying

# Usage example
if __name__ == "__main__":
    # Example configuration, written only when no configuration exists yet
    config = {
        'check_interval': 60,
        'alert_email': 'admin@company.com',
        'smtp_server': 'smtp.company.com',
        'thresholds': {
            'cpu_percent': 80,
            'memory_percent': 85,
            'disk_percent': 90,
            'response_time': 5000
        },
        'services': [
            {
                'name': 'Web Server',
                'type': 'http',
                'url': 'http://localhost:80/health',
                'timeout': 10
            },
            {
                'name': 'Database',
                'type': 'port',
                'host': 'localhost',
                'port': 5432,
                'timeout': 5
            },
            {
                'name': 'API Gateway',
                'type': 'process',
                'process_name': 'nginx'
            }
        ]
    }

    # Mode 'x' creates the file exclusively, so an existing (possibly
    # hand-edited) configuration is never overwritten by this example.
    try:
        with open('monitor_config.json', 'x', encoding='utf-8') as f:
            json.dump(config, f, indent=2)
    except FileExistsError:
        pass

    # Start monitoring
    monitor = EnterpriseMonitor()
    monitor.start_monitoring()
