# Exploratory Data Analysis - IoT Edge Allocation

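This notebook provides visualization and analysis tools for:
- Simulation data from iFogSim
- Training metrics from RL agents
- Network topology visualization
- Performance comparison across methods

**Inputs:** `../data/raw/sim_results.csv` (simulation output; most sections need it) and `../logs/experiment/experiment_results.json` (experiment results; optional — section 6 only prints a notice when it is missing).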

In [None]:
# Imports
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D

# Make the project root importable so the `src` package resolves
sys.path.append('..')

# Custom modules
from src.utils.data_loader import IoTDataLoader, compute_statistics
from src.utils.graph_utils import IoTGraphBuilder, compute_graph_statistics

# Settings
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

%matplotlib inline
%load_ext autoreload
%autoreload 2

## 1. Load and Explore Simulation Data

In [None]:
# Load raw simulation data
data_path = Path('../data/raw/sim_results.csv')

if data_path.exists():
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df):,} records")
    print(f"\nColumns: {list(df.columns)}")
    print(f"\nTime range: {df['timestamp'].min():.1f}s - {df['timestamp'].max():.1f}s")
    print(f"Number of nodes: {df['node_id'].nunique()}")
    # A bare expression nested inside `if` is not the cell's trailing
    # expression, so Jupyter would not render it. display() (provided by the
    # IPython kernel) shows the preview explicitly.
    display(df.head())
else:
    # NOTE: `df` stays undefined in this branch; the analysis cells below need it.
    print("❌ Raw data not found. Run simulation first!")

In [None]:
# Summary statistics
# Columns analysed throughout the rest of the notebook.
feature_cols = ['cpu_util', 'mem_util', 'energy', 'latency', 'bandwidth', 'queue_len']

# Bound to `feature_stats` rather than `stats`: the topology section stores the
# graph-statistics dict in `stats` and the summary report indexes that dict, so
# sharing the name would let a re-run of this cell silently break the report.
feature_stats = compute_statistics(df, feature_cols)
feature_stats

## 2. Visualize Feature Distributions

In [None]:
# Distribution plots
# savefig() does not create missing directories, so make sure the target exists.
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Size the grid from the feature list (2x3 for the six default features)
# instead of hard-coding it, so adding a feature does not raise IndexError.
n_cols = 3
n_rows = max(1, -(-len(feature_cols) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, feature_cols):
    # Drop NaNs up front: they make the histogram's automatic bin range
    # non-finite. The mean is unchanged because pandas skips NaNs by default.
    values = df[feature].dropna()
    mean_value = values.mean()

    ax.hist(values, bins=50, alpha=0.7, color='steelblue', edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.2f}')
    ax.set_title(f'{feature} Distribution')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(alpha=0.3)

# Hide any grid cells left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(feature_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig(figures_dir / 'feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 3. Time-Series Analysis

In [None]:
# Aggregate metrics over time: mean of every feature across nodes per timestamp
time_series = df.groupby('timestamp')[feature_cols].mean().reset_index()

# savefig() does not create missing directories, so make sure the target exists.
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# CPU and Memory utilization
axes[0].plot(time_series['timestamp'], time_series['cpu_util'], label='CPU Utilization', linewidth=2)
axes[0].plot(time_series['timestamp'], time_series['mem_util'], label='Memory Utilization', linewidth=2)
axes[0].set_ylabel('Utilization')
axes[0].set_title('Resource Utilization Over Time')
axes[0].legend()
axes[0].grid(alpha=0.3)

# Latency and Energy (different units, so they get separate y-axes)
ax2 = axes[1]
ax2.plot(time_series['timestamp'], time_series['latency'], label='Latency (ms)', color='orange', linewidth=2)
ax2.set_ylabel('Latency (ms)', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')

ax2_twin = ax2.twinx()
ax2_twin.plot(time_series['timestamp'], time_series['energy'], label='Energy (J)', color='green', linewidth=2)
ax2_twin.set_ylabel('Energy (J)', color='green')
ax2_twin.tick_params(axis='y', labelcolor='green')
ax2.set_title('Latency and Energy Over Time')
ax2.grid(alpha=0.3)

# Both lines carry labels but live on different axes; merge their handles into
# one legend so this panel is labelled like the other two. It is attached to
# the twin axis, which is drawn on top, so the lines do not cover the legend.
latency_handles, latency_labels = ax2.get_legend_handles_labels()
energy_handles, energy_labels = ax2_twin.get_legend_handles_labels()
ax2_twin.legend(latency_handles + energy_handles, latency_labels + energy_labels, loc='upper left')

# Queue length
axes[2].plot(time_series['timestamp'], time_series['queue_len'], label='Queue Length', color='purple', linewidth=2)
axes[2].set_xlabel('Time (s)')
axes[2].set_ylabel('Queue Length')
axes[2].set_title('Queue Length Over Time')
axes[2].legend()
axes[2].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(figures_dir / 'time_series.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Network Topology Visualization

In [None]:
# Build topology graph
builder = IoTGraphBuilder()
graph = builder.build_hierarchical_topology(
    num_sensors=8,
    num_fog=10,
    num_cloud=1,
)

# Graph statistics: header line followed by one indented "name: value" row per
# entry. `stats` is kept as the variable name because the summary report reads it.
stats = compute_graph_statistics(graph)
report_lines = ["Graph Statistics:"]
report_lines.extend(f"  {name}: {metric}" for name, metric in stats.items())
print("\n".join(report_lines))

In [None]:
# Visualize topology
# Fixed seed keeps the spring layout identical between runs.
pos = nx.spring_layout(graph, k=2, iterations=50, seed=42)

# Single source of truth for how each node type is drawn, shared by the node
# loop and the legend so the two cannot drift apart.
node_styles = {
    'sensor': {'color': 'lightblue', 'size': 300, 'marker_size': 10, 'label': 'Sensor'},
    'fog': {'color': 'orange', 'size': 600, 'marker_size': 12, 'label': 'Fog Node'},
    'cloud': {'color': 'red', 'size': 1000, 'marker_size': 14, 'label': 'Cloud Server'},
}

# Color and size by node type; any type other than sensor/fog is drawn as cloud.
node_colors = []
node_sizes = []
for node in graph.nodes():
    style = node_styles.get(graph.nodes[node]['type'], node_styles['cloud'])
    node_colors.append(style['color'])
    node_sizes.append(style['size'])

# savefig() does not create missing directories, so make sure the target exists.
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

plt.figure(figsize=(14, 10))
nx.draw(
    graph, pos,
    node_color=node_colors,
    node_size=node_sizes,
    with_labels=True,
    font_size=8,
    font_weight='bold',
    edge_color='gray',
    alpha=0.8,
    linewidths=2
)

# Add legend: nx.draw() attaches no labels, so build proxy markers by hand.
# Line2D is imported in the notebook's top import cell.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=style['color'],
           markersize=style['marker_size'], label=style['label'])
    for style in node_styles.values()
]
plt.legend(handles=legend_elements, loc='upper left', fontsize=12)

plt.title('IoT Edge Computing Network Topology', fontsize=16, fontweight='bold')
plt.axis('off')
plt.tight_layout()
plt.savefig(figures_dir / 'topology.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlation Analysis

In [None]:
# Correlation matrix: pairwise correlation between the feature columns
corr = df[feature_cols].corr()

# savefig() does not create missing directories, so make sure the target exists.
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

# Explicit figure/axes instead of the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, cbar_kws={"shrink": 0.8}, ax=ax)
ax.set_title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
fig.tight_layout()
fig.savefig(figures_dir / 'correlation.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Training Results Comparison

In [None]:
# Load experiment results (if available)
# `json` is imported in the notebook's top import cell.
results_path = Path('../logs/experiment/experiment_results.json')
if results_path.exists():
    # Explicit encoding so the file is read the same way on every platform.
    with open(results_path, 'r', encoding='utf-8') as f:
        results = json.load(f)

    # Create comparison plot: one bar per method, error bar = reward std
    methods = list(results.keys())
    mean_rewards = [results[m]['mean_reward'] for m in methods]
    std_rewards = [results[m]['std_reward'] for m in methods]

    # savefig() does not create missing directories, so make sure the target exists.
    figures_dir = Path('../reports/figures')
    figures_dir.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 6))
    x = np.arange(len(methods))
    bars = ax.bar(x, mean_rewards, yerr=std_rewards, capsize=10,
                  color=['steelblue', 'orange', 'green'], alpha=0.7, edgecolor='black')

    ax.set_xlabel('Method', fontsize=12, fontweight='bold')
    ax.set_ylabel('Mean Reward', fontsize=12, fontweight='bold')
    ax.set_title('Performance Comparison Across Methods', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels([m.upper() for m in methods])
    ax.grid(axis='y', alpha=0.3)
    # Extra vertical headroom so the labels placed beyond the error bars fit.
    ax.margins(y=0.15)

    # Add value labels on bars. Anchor each label past the error-bar cap, on the
    # side the bar extends to, so it neither overlaps the error bar nor lands
    # inside the bar when the mean reward is negative.
    for bar, val, std in zip(bars, mean_rewards, std_rewards):
        points_up = val >= 0
        label_y = val + std if points_up else val - std
        ax.text(bar.get_x() + bar.get_width() / 2., label_y,
                f'{val:.1f}±{std:.1f}',
                ha='center', va='bottom' if points_up else 'top', fontweight='bold')

    plt.tight_layout()
    plt.savefig(figures_dir / 'method_comparison.png', dpi=150, bbox_inches='tight')
    plt.show()
else:
    print("❌ Experiment results not found. Run experiments first!")

## 7. Node-Level Analysis

In [None]:
# Analyze performance per node: mean of every feature for each node_id
node_stats = df.groupby('node_id')[feature_cols].mean().reset_index()

# savefig() does not create missing directories, so make sure the target exists.
figures_dir = Path('../reports/figures')
figures_dir.mkdir(parents=True, exist_ok=True)

fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# One row per panel: (column, bar color, y-axis label, title). Driving both
# panels from this table replaces two copy-pasted blocks.
panels = [
    ('cpu_util', 'steelblue', 'Mean CPU Utilization', 'CPU Utilization by Node'),
    ('latency', 'orange', 'Mean Latency (ms)', 'Latency by Node'),
]

for ax, (column, color, ylabel, title) in zip(axes, panels):
    ax.bar(node_stats['node_id'], node_stats[column], color=color, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Node ID')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(figures_dir / 'node_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 8. Export Summary Report

In [None]:
# Generate summary report
# Reads `df` (simulation data) and `stats` (the graph-statistics dict computed
# in the topology section); both cells must have been run before this one.
summary = f"""
=== IoT Edge Allocation - Data Analysis Summary ===

Dataset:
  - Total records: {len(df):,}
  - Time range: {df['timestamp'].min():.1f}s - {df['timestamp'].max():.1f}s
  - Number of nodes: {df['node_id'].nunique()}

Average Metrics:
  - CPU Utilization: {df['cpu_util'].mean():.3f} ± {df['cpu_util'].std():.3f}
  - Memory Utilization: {df['mem_util'].mean():.3f} ± {df['mem_util'].std():.3f}
  - Latency: {df['latency'].mean():.2f} ± {df['latency'].std():.2f} ms
  - Energy: {df['energy'].mean():.2f} ± {df['energy'].std():.2f} J
  - Queue Length: {df['queue_len'].mean():.2f} ± {df['queue_len'].std():.2f}

Network Topology:
  - Nodes: {stats['num_nodes']}
  - Edges: {stats['num_edges']}
  - Density: {stats['density']:.3f}
  - Avg Degree: {stats['avg_degree']:.2f}

Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

print(summary)

# Save to file
report_path = Path('../reports/data_summary.txt')
# Writing does not create missing directories, so make sure the target exists.
report_path.parent.mkdir(parents=True, exist_ok=True)
# The report contains non-ASCII text ('±'); write it as UTF-8 explicitly rather
# than relying on the platform's default encoding.
report_path.write_text(summary, encoding='utf-8')

print("\n✅ Summary report saved to reports/data_summary.txt")

## Conclusion

This notebook provides comprehensive visualization and analysis of:
- Simulation data characteristics
- Network topology structure
- Time-series patterns
- Performance comparisons

Use these insights to:
- Tune hyperparameters
- Design reward functions
- Interpret training results
- Generate publication figures