# Download and Filter INSTANCE Dataset for Norcia 2016 Earthquakes

This notebook helps download and filter the INSTANCE dataset for the Norcia 2016 earthquake sequence.

**To use in Google Colab:**
1. Upload this notebook to Google Colab
2. Run the cells in order
3. The dataset and the filtered outputs will be saved to your Google Drive under `MyDrive/datasets/norcia/`

## Setup and Mount Google Drive

In [None]:
# Check if running in Google Colab and mount Drive
import os
import sys

# Detect Colab by attempting the import only. Keeping the mount and the
# directory creation out of the `try` body means an ImportError raised while
# mounting is not mistaken for "not running in Colab".
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("⚠ Not running in Google Colab")
    print("  Upload this notebook to Colab for best performance")
else:
    # Set the flag before mounting so later cells can still read IN_COLAB
    # even if the mount itself raises.
    IN_COLAB = True
    print("✓ Running in Google Colab")

    drive.mount('/content/drive')
    print("✓ Google Drive mounted at /content/drive")

    # Create dataset directory
    os.makedirs('/content/drive/MyDrive/datasets/norcia', exist_ok=True)
    print("✓ Created dataset directory: /content/drive/MyDrive/datasets/norcia")

## Install Required Libraries

In [None]:
# Install required packages
!pip install -q pandas numpy h5py matplotlib

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt

print("Libraries loaded successfully")

## Option 1: Download Full INSTANCE Dataset (Colab Recommended)

**Note:** The full dataset is large (>100GB). Only run this if your Google Drive has enough free space to hold it.

In [None]:
# INSTANCE dataset information
# Emit the dataset reference and the manual download steps as a single banner
print("\n".join([
    "INSTANCE Dataset Information:",
    "=" * 60,
    "DOI: http://doi.org/10.13127/instance",
    "Description: Italian seismic waveform dataset",
    "",
    "To download the full dataset:",
    "1. Visit the DOI link above",
    "2. Find the download links for:",
    "   - instance_events_counts.hdf5 (waveform data)",
    "   - instance_metadata.csv (metadata)",
    "3. Replace the URLs in the next cell",
]))

In [None]:
# Download full dataset (only if in Colab with Drive mounted)
if IN_COLAB:
    # REPLACE THESE WITH ACTUAL URLS FROM http://doi.org/10.13127/instance
    METADATA_URL = "https://example.com/instance_metadata.csv"  # Replace with actual URL
    WAVEFORM_URL = "https://example.com/instance_events_counts.hdf5"  # Replace with actual URL
    
    download_full = input("Download full dataset? (yes/no): ").lower() == 'yes'
    
    if download_full:
        print("Downloading metadata...")
        !wget -P /content/drive/MyDrive/datasets/norcia/ "{METADATA_URL}"
        
        print("\nDownloading waveform data (this may take a while)...")
        !wget -P /content/drive/MyDrive/datasets/norcia/ "{WAVEFORM_URL}"
        
        print("✓ Download complete!")
else:
    print("Please run this notebook in Google Colab for large downloads")

## Option 2: Metadata-First Approach (Filter Before Downloading)

In [None]:
# Step 1: Load metadata (smaller file ~100MB)
def load_metadata(metadata_path=None):
    """Load the INSTANCE metadata CSV into a DataFrame.

    Args:
        metadata_path: Optional path to the metadata CSV. When omitted, the
            Google Drive dataset directory is used in Colab and
            ``instance_metadata.csv`` in the working directory otherwise.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` (after printing a hint)
        when the file does not exist.
    """
    if metadata_path is None:
        if IN_COLAB:
            metadata_path = '/content/drive/MyDrive/datasets/norcia/instance_metadata.csv'
        else:
            metadata_path = 'instance_metadata.csv'

    # Only the read itself is guarded, so errors in the reporting below are
    # never mistaken for a missing file.
    try:
        metadata = pd.read_csv(metadata_path)
    except FileNotFoundError:
        print("⚠ Metadata file not found")
        print(f"  Looked for: {metadata_path}")
        print("  Please download from http://doi.org/10.13127/instance")
        return None

    print(f"✓ Metadata loaded: {len(metadata)} total traces")
    print(f"\nColumns: {list(metadata.columns)[:10]}...")
    return metadata

# Load metadata. `metadata` is None when the CSV could not be found, which the
# filtering cell checks before using it.
metadata = load_metadata()

In [None]:
# Step 2: Filter to Norcia 2016 earthquake sequence
def filter_norcia_events(metadata):
    """Select the traces that belong to the Norcia 2016 earthquake sequence.

    Keeps rows whose origin time lies between 2016-08-01 and the end of
    2017-01-31 and whose epicenter lies within 42.5-43.2 degrees latitude and
    12.8-13.5 degrees longitude. Prints the trace count, the covered date
    range and a rough estimate of the waveform storage required.

    Args:
        metadata: DataFrame with ``source_origin_time``,
            ``source_latitude_deg`` and ``source_longitude_deg`` columns,
            or ``None``.

    Returns:
        A new DataFrame holding the matching rows, with
        ``source_origin_time`` parsed to datetimes. The input frame is left
        untouched. ``None`` is returned when ``metadata`` is ``None`` or a
        required column is missing.
    """
    if metadata is None:
        return None

    required_columns = ['source_origin_time', 'source_latitude_deg', 'source_longitude_deg']
    missing_columns = [col for col in required_columns if col not in metadata.columns]
    if missing_columns:
        print(f"⚠ Metadata is missing required columns: {missing_columns}")
        return None

    # Parse into a separate Series so the caller's DataFrame is not modified
    origin_time = pd.to_datetime(metadata['source_origin_time'])

    # Norcia 2016 sequence parameters
    # Main event: October 30, 2016 M6.5
    # Location: ~42.8°N, 13.1°E
    #
    # The upper bound is exclusive at 2017-02-01: a bare '2017-01-31' is read
    # as midnight, which would drop every event later on January 31.
    # Bounds stay as strings so pandas parses them to match the column.
    in_time_window = (origin_time >= '2016-08-01') & (origin_time < '2017-02-01')
    in_region = (
        metadata['source_latitude_deg'].between(42.5, 43.2) &
        metadata['source_longitude_deg'].between(12.8, 13.5)
    )
    selected = in_time_window & in_region

    # assign() returns a new frame, so callers may add columns to the result
    # without affecting (or getting warnings about) the full metadata table.
    norcia = metadata.loc[selected].assign(source_origin_time=origin_time.loc[selected])

    print(f"✓ Filtered to Norcia 2016 sequence:")
    print(f"  Total traces: {len(norcia)}")
    print(f"  Date range: {norcia['source_origin_time'].min()} to {norcia['source_origin_time'].max()}")

    # Estimate storage requirements
    n_traces = len(norcia)
    duration_s = 120  # typical trace duration
    sample_rate = 100  # Hz
    n_channels = 3  # 3-component
    bytes_per_sample = 4  # float32

    size_gb = (n_traces * duration_s * sample_rate * n_channels * bytes_per_sample) / (1024**3)
    print(f"\n  Estimated waveform size: ~{size_gb:.1f} GB")
    print(f"  ({n_traces} traces × {duration_s}s × {sample_rate}Hz × {n_channels} channels)")

    return norcia

# Apply the Norcia filter when metadata was loaded; otherwise carry None forward
norcia_metadata = filter_norcia_events(metadata) if metadata is not None else None

In [None]:
# Step 3: Save filtered metadata for targeted download
# Write the filtered trace list to CSV; skipped when there is nothing to save
if not (norcia_metadata is None or norcia_metadata.empty):
    output_path = (
        '/content/drive/MyDrive/datasets/norcia/norcia_metadata.csv'
        if IN_COLAB
        else 'norcia_metadata.csv'
    )

    norcia_metadata.to_csv(output_path, index=False)
    print(f"✓ Filtered metadata saved to: {output_path}")
    print("\nYou can now:")
    print(
        "1. Use this CSV to request only Norcia waveforms from INSTANCE",
        "2. Share this filtered list with the data provider",
        "3. Use trace IDs to extract specific waveforms from the full dataset",
        sep="\n",
    )

## Visualize Norcia Events

In [None]:
# Plot event locations and timeline
# Plot epicenters and the daily event count without modifying norcia_metadata
if norcia_metadata is not None and not norcia_metadata.empty:
    # Rows of the metadata are traces, while events are identified by
    # source_id (as in the summary below). Collapse to one row per event so
    # the map and the timeline show events rather than trace counts.
    has_event_ids = 'source_id' in norcia_metadata.columns
    if has_event_ids:
        norcia_events = norcia_metadata.drop_duplicates(subset='source_id')
    else:
        norcia_events = norcia_metadata

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Map of epicenters
    scatter = ax1.scatter(
        norcia_events['source_longitude_deg'],
        norcia_events['source_latitude_deg'],
        c=norcia_events['source_magnitude'],
        s=norcia_events['source_magnitude']**2 * 10,
        cmap='hot_r',
        alpha=0.6,
        edgecolors='black',
        linewidth=0.5
    )
    ax1.set_xlabel('Longitude')
    ax1.set_ylabel('Latitude')
    ax1.set_title('Norcia 2016 Earthquake Sequence - Epicenters')
    ax1.grid(True, alpha=0.3)
    plt.colorbar(scatter, ax=ax1, label='Magnitude')

    # Timeline: group on a standalone Series of dates instead of adding a
    # 'date' column, so re-running the cell leaves norcia_metadata unchanged.
    event_dates = pd.to_datetime(norcia_events['source_origin_time']).dt.date
    events_per_day = norcia_events.groupby(event_dates).size()

    ax2.bar(events_per_day.index, events_per_day.values, color='steelblue', alpha=0.7)
    ax2.set_xlabel('Date')
    # Without source_id the rows cannot be collapsed, so label what is counted
    ax2.set_ylabel('Number of Events' if has_event_ids else 'Number of Traces')
    ax2.set_title('Temporal Distribution of Events')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(True, alpha=0.3)

    # Mark main event (Oct 30, 2016)
    main_event_date = pd.to_datetime('2016-10-30').date()
    if main_event_date in events_per_day.index:
        ax2.axvline(main_event_date, color='red', linestyle='--', alpha=0.7, label='M6.5 Main Event')
        ax2.legend()

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\nSummary Statistics:")
    print("="*40)
    print(f"Total events: {norcia_metadata['source_id'].nunique() if has_event_ids else 'N/A'}")
    print(f"Magnitude range: {norcia_metadata['source_magnitude'].min():.1f} - {norcia_metadata['source_magnitude'].max():.1f}")
    print(f"Largest event: M{norcia_metadata['source_magnitude'].max():.1f}")

## Work with Downloaded HDF5 Data

In [None]:
# Load and explore HDF5 waveform data
import h5py

def explore_hdf5(filepath):
    """Print the layout of an HDF5 file and plot its first waveform, if any.

    Every group and dataset in the file is listed as an indented tree
    (datasets with shape and dtype). When the file has a top-level
    ``waveforms`` entry, its shape and dtype are printed and up to three
    columns of its first element are plotted.

    Args:
        filepath: Path of the HDF5 file to inspect.

    Returns:
        True when the file was read without error, False (after printing the
        reason) when it is missing or any other exception occurred.
    """
    def describe(path, node):
        """Print one tree line for ``node``, indented by its depth in the file."""
        pad = "  " * path.count('/')
        leaf = path.split('/')[-1]
        if isinstance(node, h5py.Dataset):
            print(f"{pad}{leaf}: {node.shape} {node.dtype}")
        else:
            print(f"{pad}{leaf}/")

    try:
        with h5py.File(filepath, 'r') as h5:
            print("HDF5 File Structure:")
            print("=" * 40)
            h5.visititems(describe)

            # Nothing further to show without a 'waveforms' entry
            if 'waveforms' not in h5:
                return True

            wave_data = h5['waveforms']
            print(f"\nWaveform array shape: {wave_data.shape}")
            print(f"Data type: {wave_data.dtype}")

            if len(wave_data) == 0:
                return True

            first = wave_data[0]
            fig, panels = plt.subplots(3, 1, figsize=(12, 8), sharex=True)

            # Number of plottable columns; zero for a one-dimensional sample
            n_columns = first.shape[1] if len(first.shape) > 1 else 0
            for idx, panel in enumerate(panels):
                if idx >= n_columns:
                    continue
                panel.plot(first[:, idx], linewidth=0.5)
                panel.set_ylabel(f'Component {idx+1}')
                panel.grid(True, alpha=0.3)

            panels[-1].set_xlabel('Sample')
            panels[0].set_title('Sample Waveform from INSTANCE')
            plt.tight_layout()
            plt.show()
            return True
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return False
    except Exception as err:
        print(f"Error reading HDF5: {err}")
        return False

# Use the Drive copy of the waveform file in Colab, the local copy otherwise
hdf5_path = (
    '/content/drive/MyDrive/datasets/norcia/instance_events_counts.hdf5'
    if IN_COLAB
    else 'instance_events_counts.hdf5'
)

# Explore the file if it exists; a missing file is reported by explore_hdf5
explore_hdf5(hdf5_path)

## Extract Norcia Waveforms from Full Dataset

In [None]:
# Extract only Norcia waveforms from full HDF5
def extract_norcia_waveforms(hdf5_path, norcia_metadata, output_path=None):
    """Copy the waveforms listed in ``norcia_metadata`` into a smaller HDF5 file.

    The source file is walked once, and every dataset whose own name (the last
    component of its path) equals one of the metadata ``trace_id`` values is
    copied to the output under the same group path. The output file is only
    created once at least one match has been found, so an existing output is
    never replaced by an empty file.

    NOTE(review): this assumes each waveform is stored as its own dataset
    named after its trace ID - confirm against the actual INSTANCE file layout.

    Args:
        hdf5_path: Path of the full waveform HDF5 file.
        norcia_metadata: Filtered metadata DataFrame with a ``trace_id``
            column, or ``None``.
        output_path: Destination HDF5 path. Defaults to
            ``norcia_waveforms.hdf5`` in the Drive dataset directory (Colab)
            or the working directory.

    Returns:
        None. Progress, warnings and errors are printed.
    """
    if norcia_metadata is None or norcia_metadata.empty:
        print("No Norcia metadata available for extraction")
        return

    if 'trace_id' not in norcia_metadata.columns:
        print("No trace_id column found in metadata")
        return

    # HDF5 object names are strings, so compare on the string form of each ID
    trace_ids = {str(tid) for tid in norcia_metadata['trace_id'].dropna().unique()}
    print(f"Extracting {len(trace_ids)} unique traces...")

    try:
        if output_path is None:
            if IN_COLAB:
                output_path = '/content/drive/MyDrive/datasets/norcia/norcia_waveforms.hdf5'
            else:
                output_path = 'norcia_waveforms.hdf5'

        with h5py.File(hdf5_path, 'r') as src:
            # First pass: collect the paths of all datasets to copy
            matched_paths = []

            def collect(name, obj):
                """Remember datasets whose leaf name is a requested trace ID."""
                if isinstance(obj, h5py.Dataset) and name.rsplit('/', 1)[-1] in trace_ids:
                    matched_paths.append(name)

            src.visititems(collect)

            if not matched_paths:
                print("⚠ No datasets named after the requested trace IDs were found; nothing written")
                return

            # Second pass: the output is opened (and truncated) only now that
            # there is something to write into it.
            print(f"Creating filtered dataset: {output_path}")
            with h5py.File(output_path, 'w') as dst:
                for name in matched_paths:
                    parent, _, leaf = name.rpartition('/')
                    target = dst.require_group(parent) if parent else dst
                    src.copy(name, target, name=leaf)

        print(f"✓ Norcia waveforms extracted to: {output_path}")
        print(f"  Copied {len(matched_paths)} of {len(trace_ids)} requested traces")

    except Exception as e:
        print(f"Error extracting waveforms: {e}")

# Run extraction when filtered metadata exists. Only the metadata is checked
# here; a missing or unreadable HDF5 file is reported by
# extract_norcia_waveforms itself through its printed error message.
if norcia_metadata is not None:
    extract_norcia_waveforms(hdf5_path, norcia_metadata)

## Next Steps

1. **Download the full INSTANCE dataset** from http://doi.org/10.13127/instance
2. **Use the filtered metadata** to request only Norcia waveforms
3. **Process waveforms** for your specific analysis needs

### Useful Resources:
- INSTANCE documentation: http://doi.org/10.13127/instance
- Norcia 2016 earthquake info: https://earthquake.usgs.gov/earthquakes/eventpage/us1000731j
- ObsPy for seismic processing: https://www.obspy.org/

In [None]:
# Summary
# Closing banner: report the environment and what the filtering step produced
print("\n" + "="*60)
print("Download Summary")
print("="*60)

if IN_COLAB:
    print("✓ Running in Google Colab")
    print(f"  Data directory: /content/drive/MyDrive/datasets/norcia/")
else:
    print("⚠ Not in Colab - limited storage available")
    print("  Upload this notebook to Colab for best results")

if norcia_metadata is None:
    print("\n⚠ No metadata loaded")
    print("  Download from http://doi.org/10.13127/instance")
elif norcia_metadata.empty:
    # The save step skips empty results, so do not claim a file was written
    print("\n⚠ Metadata loaded, but no traces matched the Norcia filter")
    print("  No filtered metadata file was saved")
else:
    # Mirror the save step's path choice so the reported location is the real one
    if IN_COLAB:
        saved_metadata_path = '/content/drive/MyDrive/datasets/norcia/norcia_metadata.csv'
    else:
        saved_metadata_path = 'norcia_metadata.csv'
    print(f"\n✓ Norcia metadata filtered:")
    print(f"  {len(norcia_metadata)} traces identified")
    print(f"  Saved to: {saved_metadata_path}")

print("\nNext: Visit INSTANCE website to download data")