# 24/7 Data Integrity Testing

This notebook tests and ensures complete 24-hour coverage for both CMG and weather data.

**Key Features:**
- Uses correct API parameters (startDate/endDate)
- Handles rate limits with retry logic
- Combines multiple endpoints for complete coverage
- Verifies data integrity for each day

In [1]:
import pandas as pd
import numpy as np
import requests
import json
import time
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import our complete fetcher
import sys
sys.path.append('.')
from fetch_complete_hourly_data import CompleteHourlyFetcher, CHILOE_NODES, WEATHER_VARS

# Notebook-wide display setup: show every DataFrame column (no column truncation)
# and apply seaborn's 'whitegrid' style to subsequent plots.
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')

# Marker that the import/setup cell ran to completion
print("Setup complete!")

Setup complete!


## Initialize Fetcher

In [2]:
# Build the fetcher used by every later cell; responses are cached under "cache_integrity"
fetcher = CompleteHourlyFetcher(cache_dir="cache_integrity")
print("Fetcher initialized!")

# List the target nodes of both CMG node groups with a single loop
for label, key in (("Real", "CMG_REAL"), ("PID", "CMG_PID")):
    nodes = CHILOE_NODES[key]
    print(f"\nTarget CMG {label} nodes: {len(nodes)}")
    for node in nodes:
        print(f"  - {node}")

# Only the first five weather variables are listed to keep the output short
weather_preview = ', '.join(WEATHER_VARS[:5])
print(f"\nWeather variables: {len(WEATHER_VARS)}")
print(f"  {weather_preview}...")

Fetcher initialized!

Target CMG Real nodes: 6
  - QUELLON_______013
  - QUELLON_______110
  - CHILOE________220
  - CHILOE________110
  - CHONCHI_______110
  - DALCAHUE______023

Target CMG PID nodes: 6
  - BA S/E CHONCHI 110KV BP1
  - BA S/E CHILOE 110KV BP1
  - BA S/E CHILOE 220KV BP1
  - BA S/E QUELLON 110KV BP1
  - BA S/E QUELLON 13KV BP1
  - BA S/E DALCAHUE 23KV BP1

Weather variables: 13
  temperature_2m, relative_humidity_2m, dew_point_2m, apparent_temperature, precipitation...


## Test Single Day - Complete Fetch

In [3]:
# Fetch one full day and summarise what every source returned
test_date = "2025-08-01"
banner = "=" * 60

print(f"Testing complete fetch for {test_date}")
print(banner)

# fetch_complete_day returns (CMG frames keyed by source, weather frame, integrity report)
cmg_data, weather_data, integrity = fetcher.fetch_complete_day(test_date)

print("\n" + banner)
print("RESULTS SUMMARY")
print(banner)

# CMG results: record count per source, plus a node count for the sources
# whose node-identifying column is known
print("\n📊 CMG Data:")
node_column_by_source = {'real': 'barra_transf', 'pid': 'nmb_barra_info'}
for source, df in cmg_data.items():
    if df.empty:
        continue
    print(f"  {source}: {len(df)} records")

    if source == 'real':
        print(f"    Columns: {list(df.columns[:5])}...")

    node_col = node_column_by_source.get(source)
    if node_col is not None:
        node_count = df[node_col].nunique() if node_col in df.columns else 'N/A'
        print(f"    Nodes: {node_count}")

# Weather results: every column except 'timestamp' counts as a variable
print(f"\n🌤️ Weather Data:")
if not weather_data.empty:
    variable_columns = [col for col in weather_data.columns if col != 'timestamp']
    print(f"  Records: {len(weather_data)}")
    print(f"  Hours: {weather_data['timestamp'].dt.hour.nunique()}")
    print(f"  Variables: {len(variable_columns)}")

# Integrity report: overall flag first, then one line per CMG source and weather
print(f"\n✅ Data Integrity:")
overall_label = 'COMPLETE' if integrity['complete'] else 'INCOMPLETE'
print(f"  Overall: {overall_label}")
for source, info in integrity['cmg'].items():
    status = "✅" if info['complete'] else "⚠️"
    print(f"  {status} CMG {source}: {info['hours']}/24 hours")

weather_info = integrity['weather']
if weather_info:
    status = "✅" if weather_info['complete'] else "⚠️"
    print(f"  {status} Weather: {weather_info['hours']}/24 hours")

2025-08-24 20:03:48,309 - INFO - 
2025-08-24 20:03:48,311 - INFO - FETCHING COMPLETE DATA FOR 2025-08-01
2025-08-24 20:03:48,317 - INFO - Fetching /costo-marginal-real/v4/findByDate for 2025-08-01


Testing complete fetch for 2025-08-01


2025-08-24 20:03:50,988 - INFO -     Page 1: 5 records
2025-08-24 20:03:53,292 - INFO -     Page 2: 4 records
2025-08-24 20:03:56,100 - INFO -     Page 3: 3 records
2025-08-24 20:03:59,056 - INFO -     Page 4: 6 records
2025-08-24 20:04:01,880 - INFO -     Page 5: 3 records
2025-08-24 20:04:03,942 - INFO -     Page 6: 3 records
2025-08-24 20:04:06,623 - INFO -     Page 7: 6 records
2025-08-24 20:04:08,600 - INFO -     Page 8: 3 records
2025-08-24 20:04:11,539 - INFO -     Page 9: 5 records
2025-08-24 20:04:14,571 - INFO -     Page 10: 4 records
2025-08-24 20:04:16,917 - INFO -     Page 11: 3 records
2025-08-24 20:04:20,368 - INFO -     Page 12: 5 records
2025-08-24 20:04:23,750 - INFO -     Page 13: 4 records
2025-08-24 20:04:26,588 - INFO -     Page 14: 3 records
2025-08-24 20:04:29,199 - INFO -     Page 15: 6 records
2025-08-24 20:04:32,998 - INFO -     Page 16: 3 records
2025-08-24 20:04:36,088 - INFO -     Page 17: 5 records
2025-08-24 20:04:38,197 - INFO -     Page 18: 4 records
2


RESULTS SUMMARY

📊 CMG Data:
  real: 440 records
    Columns: ['id_info', 'barra_info', 'barra_transf', 'fecha', 'hra']...
    Nodes: 5
  pid: 182 records
    Nodes: 2
  online: 384 records

🌤️ Weather Data:
  Records: 24
  Hours: 24
  Variables: 13

✅ Data Integrity:
  Overall: COMPLETE
  ⚠️ CMG real: 22/24 hours
  ✅ CMG pid: 24/24 hours
  ✅ CMG online: 24/24 hours
  ✅ Weather: 24/24 hours


## Analyze Hour Coverage

In [None]:
# Analyze which hours we have for each source
def analyze_hour_coverage(cmg_data, weather_data):
    """Collect the distinct hours present in each data source.

    Args:
        cmg_data: dict mapping a source name to a DataFrame. The hour is read
            from an 'hra' column when present, otherwise derived from a
            'fecha_hora' column parsed as datetimes.
        weather_data: DataFrame with a datetime-typed 'timestamp' column
            (it is accessed through the `.dt` accessor).

    Returns:
        dict mapping 'CMG <source>' (and 'Weather' when weather_data is not
        empty) to a sorted list of distinct hour values. Empty CMG frames are
        skipped; a non-empty frame with neither hour column maps to [].

    The input frames are never modified: the hour derived from 'fecha_hora'
    is computed on a temporary Series instead of being written back as a new
    column on the caller's DataFrame.
    """
    coverage = {}

    # Analyze CMG sources
    for source, df in cmg_data.items():
        if df.empty:
            continue

        if 'hra' in df.columns:
            hour_values = df['hra']
        elif 'fecha_hora' in df.columns:
            hour_values = pd.to_datetime(df['fecha_hora']).dt.hour
        else:
            hour_values = None

        # Missing hours (NaN/NaT) are dropped: they are not a valid hour of the
        # day and cannot be ordered reliably by sorted().
        coverage[f'CMG {source}'] = (
            sorted(hour_values.dropna().unique()) if hour_values is not None else []
        )

    # Analyze weather
    if not weather_data.empty:
        coverage['Weather'] = sorted(weather_data['timestamp'].dt.hour.dropna().unique())

    return coverage

coverage = analyze_hour_coverage(cmg_data, weather_data)

# Create visualization (figure height grows with the number of sources)
fig, ax = plt.subplots(figsize=(15, len(coverage) * 0.5 + 1))

# Create matrix: one row per source, one column per hour of day; 1 = data present
sources = list(coverage.keys())
matrix = np.zeros((len(sources), 24))

for i, source in enumerate(sources):
    for hour in coverage[source]:
        matrix[i, hour] = 1

# Plot heatmap.
# vmin/vmax pin the colour scale to the 0..1 availability range. Without them the
# scale is inferred from the data, so a matrix of all ones (every source complete)
# has a zero-width range and is drawn in the lowest (red) colour.
sns.heatmap(matrix,
            vmin=0,
            vmax=1,
            xticklabels=range(24),
            yticklabels=sources,
            cmap='RdYlGn',
            cbar_kws={'label': 'Data Available'},
            linewidths=0.5,
            ax=ax)

ax.set_title(f'Hourly Data Coverage for {test_date}')
ax.set_xlabel('Hour of Day')
ax.set_ylabel('Data Source')

plt.tight_layout()
plt.show()

# Print summary: per-source hour count and, when incomplete, the missing hours
print("Hour Coverage Summary:")
print("="*40)
for source, hours in coverage.items():
    completeness = "✅" if len(hours) == 24 else "⚠️"
    print(f"{completeness} {source}: {len(hours)}/24 hours")
    if len(hours) < 24:
        missing = [h for h in range(24) if h not in hours]
        print(f"   Missing: {missing}")

## Test Multiple Days

In [None]:
# Test multiple days to check consistency
test_dates = [
    "2024-10-01",  # Should use CMG PID only
    "2025-01-01",  # All sources available
    "2025-06-01",  # All sources available
    "2025-08-01",  # Recent date
    (datetime.now() - timedelta(days=10)).strftime('%Y-%m-%d')  # 10 days ago
]

all_integrity = []

print("Testing multiple days for data integrity...")
print("="*60)

for date in test_dates:
    print(f"\n📅 {date}:")

    # Fetch data into loop-local names. The notebook-level cmg_data / weather_data /
    # integrity hold the single-day results for test_date and are read again by the
    # later combine and save cells; reassigning them here would make those cells
    # process the last date of this loop while still labelling it as test_date.
    _day_cmg, _day_weather, day_integrity = fetcher.fetch_complete_day(date)
    all_integrity.append(day_integrity)

    # Quick summary: CMG counts as complete when at least one source is complete
    cmg_complete = any(d.get('complete', False) for d in day_integrity['cmg'].values())
    # The weather entry may be empty/falsy (the single-day cell guards for that),
    # so fall back to an empty dict before reading the flag.
    weather_complete = (day_integrity['weather'] or {}).get('complete', False)

    cmg_status = "✅" if cmg_complete else "⚠️"
    weather_status = "✅" if weather_complete else "⚠️"
    overall_status = "✅" if day_integrity['complete'] else "⚠️"

    print(f"  {cmg_status} CMG | {weather_status} Weather | {overall_status} Overall")

    # Small delay between days to stay under the API rate limit
    time.sleep(2)

# Summary statistics
complete_days = sum(1 for i in all_integrity if i['complete'])
total_days = len(all_integrity)

print("\n" + "="*60)
print("MULTI-DAY SUMMARY")
print("="*60)
print(f"\nComplete days: {complete_days}/{total_days} ({complete_days/total_days*100:.1f}%)")

if complete_days == total_days:
    print("✅ EXCELLENT! All test days have complete 24/7 data!")
elif complete_days > 0:
    print(f"⚠️ Partial success. Check individual days for issues.")
else:
    print("❌ No complete days. Check API availability and parameters.")

## Create Coverage Matrix Visualization

In [None]:
# Create comprehensive coverage matrix
def create_coverage_matrix(integrity_reports):
    """Plot CMG and weather coverage heatmaps for several days and print totals.

    Args:
        integrity_reports: list of report dicts. Each report is read for
            'date', 'cmg' (dict of per-source dicts carrying an 'hours' count)
            and 'weather' (dict carrying an 'hours' count, or a falsy value).

    Returns:
        None. Two heatmaps are shown and the mean coverage of each matrix is
        printed as a percentage.

    NOTE(review): the reports only carry hour *counts*, so a count of n is
    drawn as hours 0..n-1. The plot shows how much of each day is covered,
    not which specific hours are missing.
    """
    if not integrity_reports:
        # Nothing to plot; heatmaps and means over zero rows are meaningless
        print("No integrity reports to visualize.")
        return

    def _hour_count(info):
        # Clamp to 0..24 so an unexpected count can neither index past the
        # 24 columns nor be treated as a negative slice bound.
        return min(max(int(info.get('hours', 0) or 0), 0), 24)

    dates = [r['date'] for r in integrity_reports]

    # Separate matrices for CMG and Weather
    fig, axes = plt.subplots(2, 1, figsize=(15, 8))

    # CMG Coverage: the union of "first n hours" over all sources is the
    # first max(n) hours
    cmg_matrix = np.zeros((len(dates), 24))
    for i, report in enumerate(integrity_reports):
        covered = max((_hour_count(src) for src in report['cmg'].values()), default=0)
        cmg_matrix[i, :covered] = 1

    # vmin/vmax pin the colour scale to 0..1. Without them a matrix of all ones
    # (full coverage) has a zero-width data range and is drawn in the lowest
    # (red) colour.
    sns.heatmap(cmg_matrix,
                vmin=0,
                vmax=1,
                xticklabels=range(24),
                yticklabels=dates,
                cmap='RdYlGn',
                cbar_kws={'label': 'Data Available'},
                ax=axes[0])
    axes[0].set_title('CMG Data Coverage')
    axes[0].set_xlabel('Hour of Day')
    axes[0].set_ylabel('Date')

    # Weather Coverage
    weather_matrix = np.zeros((len(dates), 24))
    for i, report in enumerate(integrity_reports):
        weather_info = report['weather']
        if weather_info:
            weather_matrix[i, :_hour_count(weather_info)] = 1

    sns.heatmap(weather_matrix,
                vmin=0,
                vmax=1,
                xticklabels=range(24),
                yticklabels=dates,
                cmap='RdYlGn',
                cbar_kws={'label': 'Data Available'},
                ax=axes[1])
    axes[1].set_title('Weather Data Coverage')
    axes[1].set_xlabel('Hour of Day')
    axes[1].set_ylabel('Date')

    plt.tight_layout()
    plt.show()

    # Calculate statistics: share of (day, hour) cells marked as covered
    cmg_coverage = np.mean(cmg_matrix) * 100
    weather_coverage = np.mean(weather_matrix) * 100

    print(f"\nOverall Coverage:")
    print(f"  CMG: {cmg_coverage:.1f}%")
    print(f"  Weather: {weather_coverage:.1f}%")

# Create visualization, but only when at least one integrity report was collected
# (all_integrity is the list filled by the multi-day test cell)
if all_integrity:
    create_coverage_matrix(all_integrity)

## Combine Data Sources for Complete Coverage

In [None]:
def _fill_missing_hours(combined_hours, df, hours, value_col, source):
    """Add one entry per hour in `hours` that `combined_hours` does not have yet.

    `hours` is a Series aligned with `df`. For each new hour the first matching
    row of `df` supplies the value (None when `value_col` is absent).
    """
    # NaN hours are skipped: they match no row, so .iloc[0] would raise
    for hour in hours.dropna().unique():
        if hour in combined_hours:
            continue
        first_row = df[hours == hour].iloc[0]
        combined_hours[hour] = {
            'source': source,
            'value': first_row.get(value_col, None)
        }


def combine_cmg_sources(cmg_data):
    """
    Combine multiple CMG sources to achieve complete 24-hour coverage
    Priority: Real > PID > Online

    Args:
        cmg_data: dict with optional 'real', 'pid' and 'online' DataFrames.
            'real' and 'online' are read through their 'hra' column, 'pid'
            through 'fecha_hora' parsed as datetimes. A source that is
            missing, empty, or lacks its hour column is skipped.

    Returns:
        dict mapping hour -> {'source': <source name>, 'value': <value>},
        where each hour is taken from the highest-priority source that has it.

    The input frames are not modified (the PID hour is computed on a temporary
    Series rather than stored as a new column on the caller's frame).

    NOTE(review): only the first row found for an hour is used, so when a frame
    holds several nodes the value comes from whichever node is listed first.
    NOTE(review): real/online values come from 'cmg_clp_kwh_' and PID values
    from 'cmg_usd_mwh'; judging by the column names these are different units,
    so confirm before comparing 'value' across sources.
    """
    combined_hours = {}

    # Start with Real (most accurate)
    df_real = cmg_data.get('real')
    if df_real is not None and not df_real.empty and 'hra' in df_real.columns:
        _fill_missing_hours(combined_hours, df_real, df_real['hra'],
                            'cmg_clp_kwh_', 'real')

    # Fill gaps with PID
    df_pid = cmg_data.get('pid')
    if df_pid is not None and not df_pid.empty and 'fecha_hora' in df_pid.columns:
        pid_hours = pd.to_datetime(df_pid['fecha_hora']).dt.hour
        _fill_missing_hours(combined_hours, df_pid, pid_hours,
                            'cmg_usd_mwh', 'pid')

    # Fill remaining gaps with Online
    df_online = cmg_data.get('online')
    if df_online is not None and not df_online.empty and 'hra' in df_online.columns:
        _fill_missing_hours(combined_hours, df_online, df_online['hra'],
                            'cmg_clp_kwh_', 'online')

    return combined_hours

# Merge the sources held in cmg_data and report which source fills each hour
if cmg_data:
    combined = combine_cmg_sources(cmg_data)

    print("Combined CMG Coverage:")
    print("="*40)
    print(f"Total hours covered: {len(combined)}/24")

    if len(combined) != 24:
        missing = [h for h in range(24) if h not in combined]
        print(f"⚠️ Missing hours: {missing}")
    else:
        print("✅ COMPLETE 24-hour coverage achieved!")

    # Tally hours per source, keeping first-seen order
    source_counts = {}
    for entry in combined.values():
        source_counts.setdefault(entry['source'], 0)
        source_counts[entry['source']] += 1

    print("\nData sources used:")
    for source, count in source_counts.items():
        print(f"  {source}: {count} hours")

    # One unit-height bar per hour, coloured by the source that supplied it
    # (red when no source has the hour, gray for an unrecognised source)
    fig, ax = plt.subplots(figsize=(15, 3))

    colors = {'real': 'green', 'pid': 'yellow', 'online': 'orange'}
    for hour in range(24):
        color = colors.get(combined[hour]['source'], 'gray') if hour in combined else 'red'
        ax.bar(hour, 1, color=color, width=0.8)

    ax.set_xlim(-0.5, 23.5)
    ax.set_xticks(range(24))
    ax.set_xlabel('Hour of Day')
    ax.set_title(f'Combined CMG Coverage for {test_date}')
    ax.set_yticks([])

    # Legend built from (colour, label) pairs
    from matplotlib.patches import Patch
    legend_spec = (('green', 'Real'), ('yellow', 'PID'),
                   ('orange', 'Online'), ('red', 'Missing'))
    legend_elements = [Patch(facecolor=face, label=text) for face, text in legend_spec]
    ax.legend(handles=legend_elements, loc='upper right')

    plt.tight_layout()
    plt.show()

## Save Complete Dataset

In [None]:
# Function to save complete dataset with integrity guarantees
def save_complete_dataset(cmg_data, weather_data, date, output_dir="complete_data"):
    """Save complete dataset with metadata

    Writes one CSV per non-empty CMG source, one CSV for non-empty weather
    data, and a JSON metadata file describing what was saved.

    Args:
        cmg_data: dict mapping a source name to a DataFrame.
        weather_data: DataFrame of weather records; None is treated like an
            empty frame (no weather CSV, 0 records in the metadata).
        date: date label used in every file name and stored in the metadata.
        output_dir: target directory; created, including any missing parent
            directories, when it does not exist.

    Returns:
        None. Every written path is printed.
    """
    output_dir = Path(output_dir)
    # parents=True so a nested output path works on a fresh checkout
    output_dir.mkdir(parents=True, exist_ok=True)

    # Save CMG data (empty frames are skipped but still counted in the metadata)
    for source, df in cmg_data.items():
        if not df.empty:
            filename = output_dir / f"cmg_{source}_{date}.csv"
            df.to_csv(filename, index=False)
            print(f"💾 Saved: {filename}")

    # Save weather data
    has_weather = weather_data is not None and not weather_data.empty
    if has_weather:
        filename = output_dir / f"weather_{date}.csv"
        weather_data.to_csv(filename, index=False)
        print(f"💾 Saved: {filename}")

    # Save metadata
    metadata = {
        'date': date,
        'timestamp': datetime.now().isoformat(),
        'cmg_sources': list(cmg_data.keys()),
        'cmg_records': {k: len(v) for k, v in cmg_data.items()},
        'weather_records': len(weather_data) if weather_data is not None else 0,
        'weather_variables': list(weather_data.columns) if has_weather else []
    }

    metadata_file = output_dir / f"metadata_{date}.json"
    # Explicit encoding keeps the file independent of the platform default
    with open(metadata_file, 'w', encoding='utf-8') as f:
        json.dump(metadata, f, indent=2)
    print(f"📄 Saved metadata: {metadata_file}")

# Save the test data under the test_date label.
# NOTE(review): cmg_data / weather_data are whatever the most recent
# fetch_complete_day call assigned to those names - confirm they still hold the
# data for test_date before relying on the saved files.
if cmg_data and weather_data is not None:
    save_complete_dataset(cmg_data, weather_data, test_date)
    print("\n✅ Complete dataset saved successfully!")