# Aditya-L1 CME Detection - Data Exploration

This notebook explores simulated SWIS-ASPEX particle data and simulated CACTUS CME events to identify patterns and characteristics that are useful for CME detection.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# session (deprecations, numerical warnings, ...) — consider narrowing it.
warnings.filterwarnings('ignore')

# Set plotting style: a matplotlib style sheet plus seaborn's "husl" palette,
# applied globally so every figure below shares the same look.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner marking the start of the notebook output
print("Data exploration notebook for Aditya-L1 CME Detection System")
print("=" * 60)

## 1. Load and Examine Data

In [None]:
# Load simulated data for exploration
import sys
sys.path.append('../')

from src.data_ingestion.swis_ingestion import SWISIngestion
from src.data_ingestion.cactus_ingestion import CACTUSIngestion

# Instantiate the two ingestion helpers that supply the synthetic data
swis_ingestion = SWISIngestion()
cactus_ingestion = CACTUSIngestion()

# NOTE(review): no random seed is set in this notebook, so the simulated data
# presumably differs between runs — confirm how the simulators draw their
# random numbers and seed them if reproducible figures are needed.


def _records_to_frame(records):
    """Build a DataFrame from simulator records and parse its 'timestamp' column."""
    frame = pd.DataFrame(records)
    frame['timestamp'] = pd.to_datetime(frame['timestamp'])
    return frame


# Seven days of particle samples (the duration is passed in minutes)
particle_data = swis_ingestion.simulate_particle_data(duration_minutes=7*24*60)
particle_df = _records_to_frame(particle_data)

# CME events over the same seven days
cme_events = cactus_ingestion.simulate_cme_events(days=7)
cme_df = _records_to_frame(cme_events)

# Quick sanity report: sizes first, then the time coverage of each table
print(f"Particle data shape: {particle_df.shape}")
print(f"CME events shape: {cme_df.shape}")
particle_times = particle_df['timestamp']
cme_times = cme_df['timestamp']
print(f"\nParticle data time range: {particle_times.min()} to {particle_times.max()}")
print(f"CME events time range: {cme_times.min()} to {cme_times.max()}")

In [None]:
# Examine particle data structure
print("Particle Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
particle_df.info()
print("\nParticle Data Sample:")
# Last expression of the cell, so the notebook renders the first rows as a table
particle_df.head()

In [None]:
# Examine CME events structure
print("CME Events Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
cme_df.info()
print("\nCME Events Sample:")
# Last expression of the cell, so the notebook renders the first rows as a table
cme_df.head()

## 2. Particle Data Analysis

In [None]:
# Statistical summary of particle data
print("Particle Data Statistical Summary:")
# Left as the cell's last expression so the notebook renders the summary table.
particle_df.describe()

In [None]:
# Time series plots of particle flux
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('SWIS-ASPEX Particle Data Time Series', fontsize=16)

# Magnetic field magnitude, derived per sample from the bx/by/bz components
squared_field = particle_df['magnetic_field'].apply(
    lambda field: field['bx']**2 + field['by']**2 + field['bz']**2
)
magnetic_field_mag = np.sqrt(squared_field)

# Top-left panel: the three flux channels share one axis and a legend
flux_ax = axes[0, 0]
flux_channels = [
    ('proton_flux', 'Proton'),
    ('electron_flux', 'Electron'),
    ('alpha_flux', 'Alpha'),
]
for flux_column, flux_label in flux_channels:
    flux_ax.plot(particle_df['timestamp'], particle_df[flux_column], label=flux_label, alpha=0.7)
flux_ax.set_title('Particle Flux')
flux_ax.set_ylabel('Flux (particles/cm²/s)')
flux_ax.legend()
flux_ax.grid(True, alpha=0.3)

# Remaining panels each show one series: (axis, values, colour, title, y-label)
single_series_panels = [
    (axes[0, 1], particle_df['velocity'], 'purple',
     'Solar Wind Velocity', 'Velocity (km/s)'),
    (axes[1, 0], particle_df['temperature']/1000, 'red',
     'Temperature', 'Temperature (×1000 K)'),
    (axes[1, 1], magnetic_field_mag, 'orange',
     'Magnetic Field Magnitude', '|B| (nT)'),
]
for panel_ax, series, line_color, panel_title, y_label in single_series_panels:
    panel_ax.plot(particle_df['timestamp'], series, color=line_color, alpha=0.7)
    panel_ax.set_title(panel_title)
    panel_ax.set_ylabel(y_label)
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Distribution plots
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('Particle Data Distributions', fontsize=16)

# One histogram per panel, in row-major order: (values, colour, title, x-label)
histogram_panels = [
    (particle_df['proton_flux'], 'blue',
     'Proton Flux Distribution', 'Flux (particles/cm²/s)'),
    (particle_df['velocity'], 'purple',
     'Velocity Distribution', 'Velocity (km/s)'),
    (particle_df['temperature']/1000, 'red',
     'Temperature Distribution', 'Temperature (×1000 K)'),
    (particle_df['density'], 'green',
     'Density Distribution', 'Density (particles/cm³)'),
]
for hist_ax, (values, bar_color, hist_title, x_label) in zip(axes.flat, histogram_panels):
    hist_ax.hist(values, bins=50, alpha=0.7, color=bar_color)
    hist_ax.set_title(hist_title)
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 3. CME Events Analysis

In [None]:
# CME events summary
print("CME Events Summary:")
event_total = len(cme_df)
print(f"Total events: {event_total}")
events_per_type = cme_df['type'].value_counts()
print(f"Event types: {events_per_type}")

# Descriptive statistics for the two numeric CME attributes
for heading, stat_column in (("Velocity", 'velocity'), ("Confidence", 'confidence')):
    print(f"\n{heading} statistics:")
    print(cme_df[stat_column].describe())

In [None]:
# CME events visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('CME Events Analysis', fontsize=16)

# One colour per CME type, shared by every panel so a type looks the same
# everywhere; unlisted types fall back to a neutral colour instead of failing.
colors = {'halo': 'red', 'partial_halo': 'orange', 'non_halo': 'blue'}
fallback_color = 'gray'

# Event types
type_counts = cme_df['type'].value_counts()
# value_counts() orders the bars by frequency, so each bar's colour is looked
# up by its type label rather than assigned by position.
bar_colors = [colors.get(type_name, fallback_color) for type_name in type_counts.index]
type_counts.plot(kind='bar', ax=axes[0, 0], color=bar_colors)
axes[0, 0].set_title('CME Event Types')
axes[0, 0].set_ylabel('Count')
axes[0, 0].tick_params(axis='x', rotation=45)

# Velocity vs Confidence (scatter) and velocity distribution (histogram), per type
for event_type in cme_df['type'].unique():
    subset = cme_df[cme_df['type'] == event_type]
    type_color = colors.get(event_type, fallback_color)
    axes[0, 1].scatter(subset['velocity'], subset['confidence'],
                       label=event_type, color=type_color, alpha=0.7)
    axes[1, 0].hist(subset['velocity'], bins=20, alpha=0.6,
                    label=event_type, color=type_color)

axes[0, 1].set_title('Velocity vs Confidence')
axes[0, 1].set_xlabel('Velocity (km/s)')
axes[0, 1].set_ylabel('Confidence')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

axes[1, 0].set_title('Velocity Distribution by Event Type')
axes[1, 0].set_xlabel('Velocity (km/s)')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()

# Width distribution
axes[1, 1].hist(cme_df['width'], bins=30, alpha=0.7, color='green')
axes[1, 1].set_title('CME Width Distribution')
axes[1, 1].set_xlabel('Width (degrees)')
axes[1, 1].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 4. Correlation Analysis

In [None]:
# Prepare data for correlation analysis
# Flatten the per-row magnetic field dict into one scalar column per key.
# Note: this adds the columns to particle_df itself; later cells read
# 'magnetic_field_magnitude' from it.
for component in ('bx', 'by', 'bz', 'magnitude'):
    particle_df[f'magnetic_field_{component}'] = particle_df['magnetic_field'].apply(
        lambda field, key=component: field[key]
    )

# Select numerical columns for correlation
numerical_cols = [
    'proton_flux', 'electron_flux', 'alpha_flux', 'velocity',
    'temperature', 'density', 'magnetic_field_magnitude',
]

# Correlation matrix
correlation_matrix = particle_df[numerical_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Particle Data Correlation Matrix')
plt.tight_layout()
plt.show()

## 5. CME Event Detection Patterns

In [None]:
# Analyze particle data around CME events
def analyze_cme_signatures(particle_df, cme_df, window_hours=2):
    """
    Analyze particle data signatures around CME events

    For every CME event, selects the particle samples whose timestamp lies
    within +/- ``window_hours`` of the event timestamp (both ends inclusive)
    and records the maximum and mean of several particle quantities.

    Args:
        particle_df: DataFrame with 'timestamp', 'proton_flux', 'velocity',
            'temperature' and 'magnetic_field_magnitude' columns.
        cme_df: DataFrame with 'timestamp', 'id', 'type', 'velocity' and
            'confidence' columns, one row per CME event.
        window_hours: Half-width of the time window around each event, in hours.

    Returns:
        DataFrame with one row per CME event that has at least one particle
        sample in its window. The signature columns are always present, even
        when no event matches, so callers can safely select columns such as
        'cme_type' on an empty result.
    """
    signature_columns = [
        'cme_id', 'cme_type', 'cme_velocity', 'cme_confidence',
        'max_proton_flux', 'mean_proton_flux',
        'max_velocity', 'mean_velocity',
        'max_temperature', 'mean_temperature',
        'max_magnetic_field', 'mean_magnetic_field',
    ]

    # Loop invariants: the half-window length and the timestamp column
    half_window = timedelta(hours=window_hours)
    timestamps = particle_df['timestamp']

    signatures = []
    for _, cme_event in cme_df.iterrows():
        cme_time = cme_event['timestamp']

        # Particle samples inside [cme_time - half_window, cme_time + half_window];
        # the selection is only read, so no defensive copy is needed.
        in_window = (timestamps >= cme_time - half_window) & (timestamps <= cme_time + half_window)
        window_data = particle_df.loc[in_window]

        # Events without any particle coverage contribute no signature row
        if len(window_data) == 0:
            continue

        signatures.append({
            'cme_id': cme_event['id'],
            'cme_type': cme_event['type'],
            'cme_velocity': cme_event['velocity'],
            'cme_confidence': cme_event['confidence'],
            'max_proton_flux': window_data['proton_flux'].max(),
            'mean_proton_flux': window_data['proton_flux'].mean(),
            'max_velocity': window_data['velocity'].max(),
            'mean_velocity': window_data['velocity'].mean(),
            'max_temperature': window_data['temperature'].max(),
            'mean_temperature': window_data['temperature'].mean(),
            'max_magnetic_field': window_data['magnetic_field_magnitude'].max(),
            'mean_magnetic_field': window_data['magnetic_field_magnitude'].mean()
        })

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(signatures, columns=signature_columns)

# Analyze signatures
# Uses the default window_hours=2. particle_df must already carry the
# 'magnetic_field_magnitude' column added in the correlation-analysis cell.
signatures_df = analyze_cme_signatures(particle_df, cme_df)
print(f"Analyzed {len(signatures_df)} CME event signatures")
# Last expression of the cell, so the notebook renders the first rows
signatures_df.head()

In [None]:
# Compare signatures by CME type
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# One box plot per panel, in row-major order: (signature column, title, y-label)
boxplot_panels = [
    ('max_proton_flux', 'Max Proton Flux by CME Type', 'Max Proton Flux'),
    ('max_velocity', 'Max Velocity by CME Type', 'Max Velocity (km/s)'),
    ('max_temperature', 'Max Temperature by CME Type', 'Max Temperature (K)'),
    ('max_magnetic_field', 'Max Magnetic Field by CME Type', 'Max |B| (nT)'),
]
for box_ax, (signature_column, panel_title, y_label) in zip(axes.flat, boxplot_panels):
    signatures_df.boxplot(column=signature_column, by='cme_type', ax=box_ax)
    # boxplot() sets its own axis title, so the panel title is applied afterwards
    box_ax.set_title(panel_title)
    box_ax.set_ylabel(y_label)

# A grouped pandas boxplot (by=...) also sets a figure-level
# "Boxplot grouped by ..." title on every call, so the figure title is set
# only after all panels are drawn; setting it earlier would be overwritten.
fig.suptitle('CME Signatures by Event Type', fontsize=16)

plt.tight_layout()
plt.show()

## 6. Feature Engineering Insights

In [None]:
# Calculate moving averages and gradients
def calculate_features(df, windows=(5, 10, 30)):
    """
    Calculate basic features for CME detection

    Works on a copy of ``df`` sorted by 'timestamp'; the input frame is not
    modified. For every window size it adds, for 'proton_flux' and 'velocity':

    * ``<column>_ma_<window>``: rolling mean over ``window`` rows
      (``min_periods=1``, so the first rows average over fewer samples);
    * ``<column>_grad_<window>``: difference to the value ``window`` rows
      earlier (NaN for the first ``window`` rows).

    Finally it adds 'proton_electron_ratio' = proton_flux / (electron_flux + 1e-6).

    Args:
        df: DataFrame with 'timestamp', 'proton_flux', 'velocity' and
            'electron_flux' columns.
        windows: Iterable of window sizes, counted in rows (samples), not in
            time units. The default is an immutable tuple so it cannot be
            changed accidentally between calls.

    Returns:
        A new DataFrame holding the original columns plus the feature columns.
    """
    # Small offset that keeps the ratio finite when electron_flux is zero
    ratio_epsilon = 1e-6
    feature_sources = ('proton_flux', 'velocity')

    df = df.copy().sort_values('timestamp')

    for window in windows:
        # Moving averages
        for source in feature_sources:
            df[f'{source}_ma_{window}'] = df[source].rolling(window=window, min_periods=1).mean()

        # Gradients (differences)
        for source in feature_sources:
            df[f'{source}_grad_{window}'] = df[source].diff(periods=window)

    # Ratios
    df['proton_electron_ratio'] = df['proton_flux'] / (df['electron_flux'] + ratio_epsilon)

    return df

# Apply feature engineering
# calculate_features works on a copy, so particle_df keeps its original columns
particle_features = calculate_features(particle_df)
print(f"Added features. New shape: {particle_features.shape}")
# Columns present in the feature frame but not in the input frame
print(f"New columns: {[col for col in particle_features.columns if col not in particle_df.columns]}")

In [None]:
# Visualize feature behavior around CME events
def plot_cme_event_context(particle_df, cme_event, hours_before=4, hours_after=4):
    """
    Plot particle data context around a specific CME event

    Draws a 2x2 figure (proton flux, solar wind velocity, temperature and
    magnetic field magnitude) for the samples between ``hours_before`` hours
    before and ``hours_after`` hours after the event timestamp, with a dashed
    vertical line at the event time. When the 'proton_flux_ma_10' /
    'velocity_ma_10' columns are present they are overlaid on their panels.
    Every curve is labeled so the legend distinguishes the data, the moving
    average and the event marker.

    Args:
        particle_df: DataFrame with 'timestamp', 'proton_flux', 'velocity',
            'temperature' and 'magnetic_field_magnitude' columns.
        cme_event: Mapping or Series with 'timestamp', 'id' and 'type' entries.
        hours_before: Hours of data to show before the event.
        hours_after: Hours of data to show after the event.

    Returns:
        None. Prints a message and returns without plotting when no samples
        fall inside the requested time range.
    """
    cme_time = pd.to_datetime(cme_event['timestamp'])
    start_time = cme_time - timedelta(hours=hours_before)
    end_time = cme_time + timedelta(hours=hours_after)

    # Samples inside [start_time, end_time]; the slice is only read, never modified
    in_window = (particle_df['timestamp'] >= start_time) & (particle_df['timestamp'] <= end_time)
    context_data = particle_df[in_window]

    if len(context_data) == 0:
        print("No data available for this time range")
        return

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f'CME Event Context: {cme_event["id"]} ({cme_event["type"]})', fontsize=16)

    # One entry per panel:
    # (axis, values, line format, legend label, moving-average column or None, title, y-label)
    panels = [
        (axes[0, 0], context_data['proton_flux'], 'b-', 'Proton flux',
         'proton_flux_ma_10', 'Proton Flux', 'Flux (particles/cm²/s)'),
        (axes[0, 1], context_data['velocity'], 'g-', 'Velocity',
         'velocity_ma_10', 'Solar Wind Velocity', 'Velocity (km/s)'),
        (axes[1, 0], context_data['temperature']/1000, 'orange', 'Temperature',
         None, 'Temperature', 'Temperature (×1000 K)'),
        (axes[1, 1], context_data['magnetic_field_magnitude'], 'purple', '|B|',
         None, 'Magnetic Field Magnitude', '|B| (nT)'),
    ]

    for panel_ax, values, line_format, series_label, ma_column, title, y_label in panels:
        panel_ax.plot(context_data['timestamp'], values, line_format,
                      alpha=0.7, label=series_label)
        # The moving average is optional: it only exists on feature-engineered frames
        if ma_column is not None and ma_column in context_data.columns:
            panel_ax.plot(context_data['timestamp'], context_data[ma_column], 'r-',
                          linewidth=2, label='Moving average (window=10)')
        panel_ax.axvline(cme_time, color='red', linestyle='--', alpha=0.8, label='CME Event')
        panel_ax.set_title(title)
        panel_ax.set_ylabel(y_label)
        panel_ax.legend()
        panel_ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# Plot context for a few CME events
if len(cme_df) > 0:
    halo_events = cme_df[cme_df['type'] == 'halo']
    # Prefer the first halo event; otherwise fall back to the first event of any type
    event_to_plot = halo_events.iloc[0] if len(halo_events) > 0 else cme_df.iloc[0]
    plot_cme_event_context(particle_features, event_to_plot)

## 7. Detection Thresholds Analysis

In [None]:
# Analyze optimal thresholds for CME detection
def analyze_thresholds(signatures_df):
    """
    Analyze detection thresholds for different CME types

    Prints, for every CME type in ``signatures_df``, the mean and standard
    deviation of the per-event maxima, followed by suggested detection
    thresholds computed as ``mean - 1 std`` for proton flux and velocity.

    The sample standard deviation is undefined for fewer than two values. The
    statistics lines still report it as-is (``nan``), but the suggested
    threshold then falls back to the mean instead of becoming ``nan``.

    Args:
        signatures_df: DataFrame with 'cme_type', 'max_proton_flux',
            'max_velocity', 'max_temperature' and 'max_magnetic_field' columns.

    Returns:
        None. The analysis is written to stdout.
    """
    print("Threshold Analysis for CME Detection")
    print("=" * 40)

    # Nothing to analyze: avoid a KeyError on an empty or column-less frame
    if len(signatures_df) == 0 or 'cme_type' not in signatures_df.columns:
        print("\nNo CME signatures available for threshold analysis.")
        return

    def one_std_below_mean(values):
        """Return mean - std, or just the mean when std is undefined (< 2 values)."""
        if values.count() < 2:
            return values.mean()
        return values.mean() - values.std()

    # (label, column, number format) for the per-type statistics lines
    reported_stats = [
        ('Max Proton Flux', 'max_proton_flux', '.0f'),
        ('Max Velocity', 'max_velocity', '.0f'),
        ('Max Temperature', 'max_temperature', '.0f'),
        ('Max Magnetic Field', 'max_magnetic_field', '.1f'),
    ]

    for cme_type in signatures_df['cme_type'].unique():
        subset = signatures_df[signatures_df['cme_type'] == cme_type]

        print(f"\n{cme_type.upper()} CMEs ({len(subset)} events):")
        for label, column, number_format in reported_stats:
            print(f"  {label} - Mean: {subset[column].mean():{number_format}}, Std: {subset[column].std():{number_format}}")

        # Suggested thresholds (mean - 1 std)
        flux_threshold = one_std_below_mean(subset['max_proton_flux'])
        velocity_threshold = one_std_below_mean(subset['max_velocity'])

        print("  Suggested thresholds:")
        print(f"    Proton Flux: {flux_threshold:.0f} particles/cm²/s")
        print(f"    Velocity: {velocity_threshold:.0f} km/s")

# Print the per-type statistics and suggested thresholds for the signatures
# computed in section 5 (the function only prints; it returns nothing).
analyze_thresholds(signatures_df)

## 8. Summary and Recommendations

In [None]:
print("DATA EXPLORATION SUMMARY")
print("=" * 50)

print("\n1. DATASET OVERVIEW:")
print(f"   - Particle data points: {len(particle_df):,}")
print(f"   - CME events: {len(cme_df)}")
# Timedelta.days drops the fractional part (a span one minute short of 7 days
# would read "6 days"), so the span is reported in fractional days instead.
time_span = particle_df['timestamp'].max() - particle_df['timestamp'].min()
time_span_days = time_span.total_seconds() / 86400
print(f"   - Time span: {time_span_days:.1f} days")

print("\n2. CME EVENT DISTRIBUTION:")
for event_type, count in cme_df['type'].value_counts().items():
    percentage = (count / len(cme_df)) * 100
    print(f"   - {event_type}: {count} events ({percentage:.1f}%)")

print("\n3. KEY FINDINGS:")
# Only quote a halo mean velocity when halo signatures actually exist; with no
# halo rows the mean would be NaN, and a column-less frame would raise KeyError.
if 'cme_type' in signatures_df.columns:
    halo_velocities = signatures_df[signatures_df['cme_type'] == 'halo']['cme_velocity']
else:
    halo_velocities = None
if halo_velocities is not None and len(halo_velocities) > 0:
    print(f"   - Halo CMEs show higher velocities (mean: {halo_velocities.mean():.0f} km/s)")
else:
    print("   - No halo CME signatures available in this sample")
print("   - Strong correlation between proton flux and electron flux")
print("   - CME events show enhanced particle flux and velocity signatures")

print("\n4. RECOMMENDED DETECTION FEATURES:")
print("   - Proton flux (raw and moving averages)")
print("   - Solar wind velocity")
print("   - Temperature enhancements")
print("   - Magnetic field magnitude")
print("   - Flux gradients and ratios")

print("\n5. NEXT STEPS:")
print("   - Implement feature engineering pipeline")
print("   - Train machine learning models")
print("   - Optimize detection thresholds")
print("   - Validate with real Aditya-L1 data")