# Cloud Monitor Review Notebook

## Purpose
This notebook is used for **post-hoc strategy review** of cloud monitoring data.

### Analysis Areas:
1. **Market Regime Analysis**: Which regimes cause IC drops / gate tightening
2. **Feature Stability**: Which features are unstable in certain time periods
3. **Alert Patterns**: When and why alerts occur
4. **Feedback Loop**: Insights for feature engineering, gate design, position sizing

### Data Sources:
- `warehouse/monitoring/hourly_runlog.jsonl` (local or pulled from EC2)
- `cloud_logs/YYYY-MM-DD/` (daily backups from EC2)

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences *every* warning for the whole kernel session
# (including pandas / matplotlib deprecation notices). Narrow or remove the
# filter when investigating unexpected results.
warnings.filterwarnings('ignore')

# Display settings: never truncate columns or wrap wide frames
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Try the new name first and fall back to the legacy one so this cell also runs
# on older installations; if neither exists the default style is kept.
for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

print("Cloud Monitor Review Notebook loaded")
print(f"Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Load Runlog Data

In [None]:
def load_runlog(path: str) -> pd.DataFrame:
    """
    Load hourly_runlog.jsonl into a DataFrame.

    The file is read as JSON Lines: one JSON object per non-blank line.
    Lines that are not valid JSON, or that decode to something other than a
    JSON object (e.g. a bare number or a list), are skipped and counted; the
    count is printed so data loss is visible instead of silent.

    Args:
        path: Path to the ``.jsonl`` runlog file.

    Returns:
        DataFrame with one row per valid record. If a ``ts_run`` column is
        present it is converted with ``pd.to_datetime`` and the rows are
        sorted by it (index reset). An empty DataFrame is returned when the
        file holds no valid records.

    Raises:
        OSError: If the file cannot be opened.
    """
    records = []
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue
            # Valid JSON that is not an object cannot become a row.
            if not isinstance(record, dict):
                skipped += 1
                continue
            records.append(record)

    # print() rather than warnings.warn(): the notebook silences all warnings.
    if skipped:
        print(f"  Warning: skipped {skipped} malformed line(s) in {path}")

    df = pd.DataFrame(records)
    if 'ts_run' in df.columns:
        df['ts_run'] = pd.to_datetime(df['ts_run'])
        df = df.sort_values('ts_run').reset_index(drop=True)

    return df

def load_multiple_runlogs(base_dir: str, date_range: list = None) -> pd.DataFrame:
    """
    Combine the per-day runlogs found under a cloud_logs/ style directory.

    Every sub-directory of ``base_dir`` is visited in sorted name order. When
    ``date_range`` is given, it is treated as ``[first, last]`` and a
    sub-directory is used only if its name compares (as a string) within that
    inclusive range. Each ``hourly_runlog.jsonl`` found is loaded with
    ``load_runlog`` and a one-line progress message is printed.

    Returns:
        The concatenation of all loaded frames (fresh integer index), or an
        empty DataFrame when no runlog file was found.
    """
    daily_frames = []

    for entry in sorted(Path(base_dir).iterdir()):
        # Plain files sitting next to the date folders are ignored.
        if not entry.is_dir():
            continue

        # Inclusive, purely lexicographic window on the folder name.
        if date_range and not (date_range[0] <= entry.name <= date_range[1]):
            continue

        candidate = entry / 'hourly_runlog.jsonl'
        if not candidate.exists():
            continue

        day_frame = load_runlog(str(candidate))
        daily_frames.append(day_frame)
        print(f"  Loaded {len(day_frame)} records from {entry.name}")

    return pd.concat(daily_frames, ignore_index=True) if daily_frames else pd.DataFrame()

In [None]:
# Option 1: Load from local warehouse (if running on EC2 or synced)
LOCAL_RUNLOG = Path('warehouse/monitoring/hourly_runlog.jsonl')

# Option 2: Load from cloud_logs backup directory
CLOUD_LOGS_DIR = Path('cloud_logs')

# Choose data source. is_file()/is_dir() (rather than exists()) so that a
# directory at the runlog path, or a plain file named cloud_logs, is treated as
# "not available" instead of failing later inside the loader.
if LOCAL_RUNLOG.is_file():
    print("Loading from local runlog...")
    df_runlog = load_runlog(str(LOCAL_RUNLOG))
elif CLOUD_LOGS_DIR.is_dir():
    print("Loading from cloud_logs directory...")
    df_runlog = load_multiple_runlogs(str(CLOUD_LOGS_DIR))
else:
    print("No runlog data found. Run the EC2 monitoring first or pull logs.")
    df_runlog = pd.DataFrame()

if len(df_runlog) > 0:
    print(f"\nTotal records loaded: {len(df_runlog)}")
    # 'ts_run' only exists if the records carried it; do not assume.
    if 'ts_run' in df_runlog.columns:
        print(f"Date range: {df_runlog['ts_run'].min()} to {df_runlog['ts_run'].max()}")
    else:
        print("Date range: unavailable (no 'ts_run' field in the loaded records)")
    print(f"\nColumns: {list(df_runlog.columns)}")
elif LOCAL_RUNLOG.is_file() or CLOUD_LOGS_DIR.is_dir():
    # A source was found but produced nothing; say so instead of staying silent.
    print("Runlog source found, but it contained no usable records.")

In [None]:
# Quick overview: first rows, followed by summary statistics
if df_runlog.shape[0] > 0:
    for make_view in (df_runlog.head, df_runlog.describe):
        display(make_view())

## 2. Execution Success Rate Analysis

In [None]:
if len(df_runlog) > 0 and 'status' in df_runlog.columns:
    # Calendar day of each run; stored on df_runlog so it can be grouped on.
    df_runlog['date'] = df_runlog['ts_run'].dt.date

    # Per day: number of runs with a status, and how many of them succeeded.
    daily_stats = (
        df_runlog.groupby('date')['status']
        .agg(total='count', success=lambda statuses: (statuses == 'SUCCESS').sum())
        .reset_index()
    )
    daily_stats['success_rate'] = daily_stats['success'] / daily_stats['total']

    print("Daily Execution Summary:")
    display(daily_stats)

    # One bar per day, with the 95% target drawn as a reference line.
    bar_positions = range(len(daily_stats))
    day_labels = list(map(str, daily_stats['date']))

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.bar(bar_positions, daily_stats['success_rate'], color='steelblue')
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(day_labels, rotation=45, ha='right')
    ax.set(ylabel='Success Rate', title='Daily Execution Success Rate')
    ax.axhline(y=0.95, color='green', linestyle='--', label='95% target')
    ax.legend()
    plt.tight_layout()
    plt.show()

## 3. IC/IR/PMR Time Series Analysis

In [None]:
# The cell reads 'status' and 'ts_run' as well as 'ic_15d', so require all
# three up front instead of failing with a KeyError halfway through plotting.
if len(df_runlog) > 0 and {'ts_run', 'status', 'ic_15d'}.issubset(df_runlog.columns):
    # Filter successful runs with metrics
    df_metrics = df_runlog[df_runlog['status'] == 'SUCCESS'].copy()

    # One row per panel:
    # (column, legend label, y-axis label, line format, hard-gate threshold)
    metric_panels = [
        ('ic_15d', 'IC 15d', 'IC (15d Rolling)', 'b-', 0.02),
        ('ir_15d', 'IR 15d', 'IR (15d Rolling)', 'g-', 0.5),
        ('pmr_15d', 'PMR 15d', 'PMR (15d Rolling)', 'purple', 0.55),
    ]
    panels = [panel for panel in metric_panels if panel[0] in df_metrics.columns]
    missing_metrics = [panel[0] for panel in metric_panels if panel[0] not in df_metrics.columns]
    if missing_metrics:
        print(f"Skipping metrics missing from the runlog: {missing_metrics}")

    # squeeze=False keeps `axes` two-dimensional even when only one panel is left.
    fig, axes = plt.subplots(len(panels), 1, figsize=(14, 10), sharex=True, squeeze=False)
    axes = axes[:, 0]

    for ax, (col, legend_label, y_label, line_fmt, gate) in zip(axes, panels):
        ax.plot(df_metrics['ts_run'], df_metrics[col], line_fmt, alpha=0.7, label=legend_label)
        ax.axhline(y=gate, color='red', linestyle='--', label=f'Hard Gate ({gate})')
        if col == 'ic_15d':
            # Shade the stretches where IC sits below its gate, from the gate
            # down to the lowest observed IC.
            ax.fill_between(df_metrics['ts_run'], gate, df_metrics[col].min(),
                            where=df_metrics[col] < gate, alpha=0.3, color='red')
            ax.set_title('Rolling IC Performance')
        ax.set_ylabel(y_label)
        ax.legend(loc='upper right')

    axes[-1].set_xlabel('Time')

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\nMetric Summary (15-day rolling):")
    print(df_metrics[[panel[0] for panel in panels]].describe())

## 4. Position Distribution Analysis

In [None]:
# 'status' and 'ts_run' are read below in addition to 'position'; check for all
# of them so a partial runlog skips the cell instead of raising KeyError.
if len(df_runlog) > 0 and {'position', 'status', 'ts_run'}.issubset(df_runlog.columns):
    df_pos = df_runlog[df_runlog['status'] == 'SUCCESS'].copy()
    # The plots show position scaled by 100 and labelled as a percentage.
    position_pct = df_pos['position'] * 100

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Position time series
    axes[0].plot(df_pos['ts_run'], position_pct, 'b-', alpha=0.7)
    axes[0].axhline(y=0, color='gray', linestyle='-', alpha=0.5)
    axes[0].axhline(y=15, color='orange', linestyle='--', label='Initial Weight (15%)')
    axes[0].axhline(y=-15, color='orange', linestyle='--')
    axes[0].axhline(y=30, color='red', linestyle='--', label='Max Weight (30%)')
    axes[0].axhline(y=-30, color='red', linestyle='--')
    axes[0].set_ylabel('Position (%)')
    axes[0].set_xlabel('Time')
    axes[0].set_title('Position Over Time')
    axes[0].legend(loc='upper right')

    # Position histogram. Missing positions are dropped first: a NaN in the
    # data makes the automatic bin range non-finite and hist() raises.
    axes[1].hist(position_pct.dropna(), bins=30, color='steelblue', edgecolor='white', alpha=0.7)
    axes[1].axvline(x=0, color='gray', linestyle='-', alpha=0.5)
    axes[1].set_xlabel('Position (%)')
    axes[1].set_ylabel('Frequency')
    axes[1].set_title('Position Distribution')

    plt.tight_layout()
    plt.show()

    # Extreme positions (pandas reductions skip missing values by default)
    print("\nExtreme Positions:")
    print(f"  Max Long:  {df_pos['position'].max() * 100:.2f}%")
    print(f"  Max Short: {df_pos['position'].min() * 100:.2f}%")
    print(f"  Mean:      {df_pos['position'].mean() * 100:.2f}%")
    print(f"  Std:       {df_pos['position'].std() * 100:.2f}%")

## 5. Alert Pattern Analysis

In [None]:
# Each run's 'alerts' cell is expected to be a list of dicts; anything else is
# tolerated and skipped rather than crashing the cell.
if len(df_runlog) > 0 and {'alerts', 'ts_run'}.issubset(df_runlog.columns):
    alert_fields = ('level', 'gate', 'metric', 'value', 'threshold', 'message')

    # Flatten alerts: one output row per alert, tagged with its run timestamp.
    all_alerts = []
    malformed_alerts = 0
    for ts_run, alerts in zip(df_runlog['ts_run'], df_runlog['alerts']):
        if not isinstance(alerts, list):
            continue  # missing value or unexpected type: no alerts for this run
        for alert in alerts:
            if not isinstance(alert, dict):
                malformed_alerts += 1
                continue
            alert_record = {'ts_run': ts_run}
            alert_record.update((field, alert.get(field)) for field in alert_fields)
            all_alerts.append(alert_record)

    if malformed_alerts:
        print(f"Skipped {malformed_alerts} alert entries that were not JSON objects")

    if all_alerts:
        df_alerts = pd.DataFrame(all_alerts)

        print(f"Total alerts: {len(df_alerts)}")
        print("\nAlerts by Level:")
        print(df_alerts['level'].value_counts())
        print("\nAlerts by Gate:")
        print(df_alerts['gate'].value_counts())

        # Plot alert timeline: one scatter series per level so each gets its
        # own colour and legend entry. Alerts without a level are not plotted.
        fig, ax = plt.subplots(figsize=(14, 4))

        level_colors = {'CRITICAL': 'red', 'WARNING': 'orange', 'INFO': 'blue'}
        for level in df_alerts['level'].dropna().unique():
            mask = df_alerts['level'] == level
            ax.scatter(df_alerts.loc[mask, 'ts_run'],
                       [level] * mask.sum(),
                       c=level_colors.get(level, 'gray'),
                       s=100, alpha=0.7, label=level)

        ax.set_xlabel('Time')
        ax.set_ylabel('Alert Level')
        ax.set_title('Alert Timeline')
        ax.legend()
        plt.tight_layout()
        plt.show()
    else:
        print("No alerts found in the data.")

## 6. Hard Gate Status Analysis

In [None]:
if len(df_runlog) > 0 and 'hard_gate_status' in df_runlog.columns:
    print("Hard Gate Status Distribution:")
    print(df_runlog['hard_gate_status'].value_counts())

    # Time when status changed: compare each row with the one before it.
    # NOTE(review): this is only meaningful if df_runlog is in chronological
    # order; confirm for the multi-directory load path.
    gate_status = df_runlog['hard_gate_status']
    previous_status = gate_status.shift(1)
    # NaN != NaN is True, so without this mask every run of consecutive missing
    # statuses would be reported as a series of "changes".
    both_missing = gate_status.isna() & previous_status.isna()
    df_runlog['status_change'] = (gate_status != previous_status) & ~both_missing
    status_changes = df_runlog[df_runlog['status_change']]

    # The first row with a status always counts as a change (nothing precedes
    # it), so more than one flagged row is needed for an actual transition.
    if len(status_changes) > 1:
        print("\nStatus Change Events:")
        # Show only the context columns that this runlog actually contains.
        event_columns = [
            column
            for column in ['ts_run', 'hard_gate_status', 'ic_15d', 'ir_15d', 'pmr_15d']
            if column in status_changes.columns
        ]
        display(status_changes[event_columns])

## 7. Feature Stability Analysis

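This section requires the hourly feature data. It plots the rolling standard deviation (24-row window) of each model feature as a stability measure; to analyze which features correlate with IC drops, join these results with the runlog on timestamp.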

In [None]:
# Feature stability: rolling dispersion of each model feature over time.
FEATURES_PATH = Path('features_hourly_with_term.parquet')

if not FEATURES_PATH.exists() or len(df_runlog) == 0:
    print("Features data not available for stability analysis.")
else:
    df_features = pd.read_parquet(FEATURES_PATH)
    print(f"Features loaded: {len(df_features)} rows, {len(df_features.columns)} columns")

    # Feature columns used in the model
    model_features = [
        'OIL_CORE_norm_art_cnt',
        'GEOPOL_norm_art_cnt',
        'USD_RATE_norm_art_cnt',
        'SUPPLY_CHAIN_norm_art_cnt',
        'MACRO_norm_art_cnt',
        'cl1_cl2',
        'ovx',
    ]

    # Keep only the model features that this parquet file actually contains.
    available_features = list(filter(lambda name: name in df_features.columns, model_features))
    print(f"\nAvailable model features: {available_features}")

    if len(available_features) > 0:
        # Stability measure: standard deviation over a rolling 24-row window.
        stability_df = df_features[available_features].rolling(24).std()

        fig, ax = plt.subplots(figsize=(14, 6))
        for feature_name in available_features:
            ax.plot(stability_df.index, stability_df[feature_name], alpha=0.7, label=feature_name)
        ax.set(
            xlabel='Time',
            ylabel='Rolling 24h Std',
            title='Feature Stability (Lower = More Stable)',
        )
        # Legend is placed outside the axes so it does not cover the lines.
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

## 8. System Health Summary

In [None]:
# Text summary of the loaded runlog. Every section checks for its own column,
# so a partial runlog yields a partial summary instead of a KeyError.
if len(df_runlog) > 0:
    print("="*60)
    print("SYSTEM HEALTH SUMMARY")
    print("="*60)

    # Execution health
    total_runs = len(df_runlog)  # > 0 is guaranteed by the enclosing check

    print("\n[Execution]")
    print(f"  Total Runs:    {total_runs}")
    if 'status' in df_runlog.columns:
        success_runs = (df_runlog['status'] == 'SUCCESS').sum()
        success_rate = success_runs / total_runs
        print(f"  Success Rate:  {success_rate:.1%}")
    else:
        print("  Success Rate:  n/a (no 'status' column)")

    # IC health: last 24 non-missing IC values, compared with the 0.02 gate
    if 'ic_15d' in df_runlog.columns:
        recent_ic = df_runlog['ic_15d'].dropna().tail(24)
        print("\n[IC Performance - Last 24 observations]")
        if len(recent_ic) > 0:
            ic_below_gate = (recent_ic < 0.02).sum()
            print(f"  Mean IC:       {recent_ic.mean():.4f}")
            print(f"  Below Gate:    {ic_below_gate} times")
        else:
            print("  No IC observations available")

    # Alert health: share of the last 24 rows whose gate status is HEALTHY
    if 'hard_gate_status' in df_runlog.columns:
        recent_status = df_runlog['hard_gate_status'].tail(24)
        healthy_pct = (recent_status == 'HEALTHY').sum() / len(recent_status) if len(recent_status) > 0 else 0
        print("\n[Hard Gate Status - Last 24 observations]")
        print(f"  Healthy Rate:  {healthy_pct:.1%}")

    # Host diversity
    if 'source_host' in df_runlog.columns:
        hosts = df_runlog['source_host'].unique()
        print("\n[Infrastructure]")
        print(f"  Source Hosts:  {list(hosts)}")

    print("\n" + "="*60)

## 9. Actionable Insights

Based on the analysis above, document insights for:

### Feature Engineering
- Which features show high instability?
- Are there time periods where specific features fail?

### Gate Threshold Design
- Are current thresholds too tight/loose?
- Should we add market-regime-specific gates?

### Position Sizing Logic
- Is the position scaling appropriate?
- Should we reduce sizing during high-volatility periods?

In [None]:
# Manual notes section: a Markdown-formatted template, kept as a plain string,
# for recording the findings of this review. The placeholders ("[FILL IN]",
# the empty list items) are meant to be filled in by hand.
insights = """
## Review Notes

### Date: [FILL IN]

### Observations:
1. 
2. 
3. 

### Recommended Actions:
- [ ] 
- [ ] 
- [ ] 

### Follow-up Questions:
- 
"""

# Echo the template; print() shows it as plain text, not rendered Markdown.
print(insights)