# Data Visualization from H5 File

This notebook will explore and visualize the data from the `shot_66441_data.h5` file.

In [3]:
# Install required packages
!uv add h5py pandas matplotlib seaborn plotly numpy

[2mResolved [1m58 packages[0m [2min 7ms[0m[0m
[2mAudited [1m53 packages[0m [2min 0.45ms[0m[0m


In [4]:
# Import required libraries
import h5py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [5]:
# Load and explore the H5 file structure
# file_path = 'data/shot_66441_data.h5'
# file_path = 'data/shot_66674_data.h5'
file_path = 'data/shot_60123_data.h5'

def explore_h5_structure(file_path, level=0, max_level=3):
    """Recursively explore H5 file structure"""
    def print_structure(name, obj):
        indent = "  " * level
        if isinstance(obj, h5py.Dataset):
            print(f"{indent}{name}: Dataset, shape={obj.shape}, dtype={obj.dtype}")
        elif isinstance(obj, h5py.Group):
            print(f"{indent}{name}: Group")
            if level < max_level:
                obj.visititems(lambda n, o: print_structure(n, o) if n.count('/') == level else None)
    
    with h5py.File(file_path, 'r') as f:
        print("H5 File Structure:")
        print("==================")
        f.visititems(print_structure)

explore_h5_structure(file_path)

H5 File Structure:
astra: Group
q: Dataset, shape=(2310, 61), dtype=float64
rhopol: Dataset, shape=(2310, 61), dtype=float64
time: Dataset, shape=(1, 2310), dtype=float64
astra/q: Dataset, shape=(2310, 61), dtype=float64
astra/rhopol: Dataset, shape=(2310, 61), dtype=float64
astra/time: Dataset, shape=(1, 2310), dtype=float64
mhd: Group
amplitude: Dataset, shape=(1, 29021), dtype=float64
time: Dataset, shape=(1, 29021), dtype=float64
mhd/amplitude: Dataset, shape=(1, 29021), dtype=float64
mhd/time: Dataset, shape=(1, 29021), dtype=float64
width: Group
freqs: Dataset, shape=(1, 1900), dtype=float64
mpol: Dataset, shape=(1, 1900), dtype=float64
ntor: Dataset, shape=(1, 1900), dtype=float64
rhoRes: Dataset, shape=(1, 1900), dtype=float64
time: Dataset, shape=(1, 1900), dtype=float64
width: Dataset, shape=(1, 1900), dtype=float64
width/freqs: Dataset, shape=(1, 1900), dtype=float64
width/mpol: Dataset, shape=(1, 1900), dtype=float64
width/ntor: Dataset, shape=(1, 1900), dtype=float64
width

In [6]:
# Extract and print values from the specified datasets
with h5py.File(file_path, 'r') as f:
    # Extract the datasets (they are all within the width group)
    width_time = f['width/time'][:]
    width_width = f['width/width'][:]
    freqs = f['width/freqs'][:]
    mpol = f['width/mpol'][:]
    ntor = f['width/ntor'][:]
    rhoRes = f['width/rhoRes'][:]
    
    print("Dataset Values:")
    print("=" * 50)
    
    print(f"\nwidth/time: shape={width_time.shape}")
    print(f"Values: {width_time}")
    
    print(f"\nwidth/width: shape={width_width.shape}")
    print(f"Values: {width_width}")
    
    print(f"\nfreqs: shape={freqs.shape}")
    print(f"Values: {freqs}")
    
    print(f"\nmpol: shape={mpol.shape}")
    print(f"Values: {mpol}")
    
    print(f"\nntor: shape={ntor.shape}")
    print(f"Values: {ntor}")
    
    print(f"\nrhoRes: shape={rhoRes.shape}")
    print(f"Values: {rhoRes}")
    
    # Also show some basic statistics
    print("\n" + "=" * 50)
    print("Basic Statistics:")
    print("=" * 50)
    
    datasets = {
        'width/time': width_time[0],  # Remove the first dimension since it's (1, 1900)
        'width/width': width_width[0],
        'freqs': freqs[0],
        'mpol': mpol[0],
        'ntor': ntor[0],
        'rhoRes': rhoRes[0]
    }
    
    for name, data in datasets.items():
        print(f"\n{name}:")
        print(f"  Min: {np.min(data):.6f}")
        print(f"  Max: {np.max(data):.6f}")
        print(f"  Mean: {np.mean(data):.6f}")
        print(f"  Std: {np.std(data):.6f}")
        print(f"  First 10 values: {data[:10]}")
        print(f"  Last 10 values: {data[-10:]}")

Dataset Values:

width/time: shape=(1, 1900)
Values: [[   nan    nan    nan ... 2.3375 2.3385    nan]]

width/width: shape=(1, 1900)
Values: [[nan nan nan ... nan nan nan]]

freqs: shape=(1, 1900)
Values: [[          nan           nan           nan ... 1602.17285156
  1358.03222656           nan]]

mpol: shape=(1, 1900)
Values: [[nan nan nan ...  2.  2. nan]]

ntor: shape=(1, 1900)
Values: [[nan nan nan ...  1.  1. nan]]

rhoRes: shape=(1, 1900)
Values: [[       nan        nan        nan ... 0.6192869  0.61923099        nan]]

Basic Statistics:

width/time:
  Min: nan
  Max: nan
  Mean: nan
  Std: nan
  First 10 values: [nan nan nan nan nan nan nan nan nan nan]
  Last 10 values: [   nan    nan 2.3325 2.3335 2.3345 2.3355 2.3365 2.3375 2.3385    nan]

width/width:
  Min: nan
  Max: nan
  Mean: nan
  Std: nan
  First 10 values: [nan nan nan nan nan nan nan nan nan nan]
  Last 10 values: [nan nan nan nan nan nan nan nan nan nan]

freqs:
  Min: nan
  Max: nan
  Mean: nan
  Std: nan
  First

In [7]:
# Analyze the data excluding NaN values
print("=" * 60)
print("FILTERED ANALYSIS (excluding NaN values)")
print("=" * 60)

with h5py.File(file_path, 'r') as f:
    # Extract the datasets and remove the first dimension
    datasets = {
        'width/time': f['width/time'][0],
        'width/width': f['width/width'][0],
        'freqs': f['width/freqs'][0],
        'mpol': f['width/mpol'][0], 
        'ntor': f['width/ntor'][0],
        'rhoRes': f['width/rhoRes'][0]
    }
    
    for name, data in datasets.items():
        # Filter out NaN values
        valid_data = data[~np.isnan(data)]
        
        print(f"\n{name}:")
        print(f"  Total length: {len(data)}")
        print(f"  Valid (non-NaN) values: {len(valid_data)}")
        print(f"  NaN values: {len(data) - len(valid_data)}")
        
        if len(valid_data) > 0:
            print(f"  Min: {np.min(valid_data):.6f}")
            print(f"  Max: {np.max(valid_data):.6f}")
            print(f"  Mean: {np.mean(valid_data):.6f}")
            print(f"  Std: {np.std(valid_data):.6f}")
            print(f"  First 10 valid values: {valid_data[:min(10, len(valid_data))]}")
            print(f"  Last 10 valid values: {valid_data[-min(10, len(valid_data)):]}")
        else:
            print("  No valid (non-NaN) values found!")
            
        # Show the range of indices where valid data exists
        valid_indices = np.where(~np.isnan(data))[0]
        if len(valid_indices) > 0:
            print(f"  Valid data range: indices {valid_indices[0]} to {valid_indices[-1]}")
            print(f"  Data coverage: {len(valid_indices)}/{len(data)} ({100*len(valid_indices)/len(data):.1f}%)")

FILTERED ANALYSIS (excluding NaN values)

width/time:
  Total length: 1900
  Valid (non-NaN) values: 1852
  NaN values: 48
  Min: 0.472500
  Max: 2.338500
  Mean: 1.399349
  Std: 0.535657
  First 10 valid values: [0.4725 0.4735 0.4745 0.4755 0.4765 0.4775 0.4785 0.4795 0.4805 0.4815]
  Last 10 valid values: [2.3235 2.3245 2.3255 2.3325 2.3335 2.3345 2.3355 2.3365 2.3375 2.3385]
  Valid data range: indices 32 to 1898
  Data coverage: 1852/1900 (97.5%)

width/width:
  Total length: 1900
  Valid (non-NaN) values: 1775
  NaN values: 125
  Min: 0.018369
  Max: 0.040140
  Mean: 0.033665
  Std: 0.003448
  First 10 valid values: [0.01836927 0.02002589 0.02156992 0.01917405 0.02116773 0.02334019
 0.02308113 0.02346751 0.02410877 0.02142074]
  Last 10 valid values: [0.02780106 0.02641292 0.02692264 0.02656749 0.026817   0.026555
 0.02563299 0.02560059 0.02518796 0.02329138]
  Valid data range: indices 32 to 1838
  Data coverage: 1775/1900 (93.4%)

freqs:
  Total length: 1900
  Valid (non-NaN) va

In [8]:
# Extract and examine time arrays from all groups
print("=" * 60)
print("TIME ARRAY ANALYSIS FOR ALL GROUPS")
print("=" * 60)

with h5py.File(file_path, 'r') as f:
    # Extract time arrays from each group
    astra_time = f['astra/time'][0]  # Remove first dimension
    width_time = f['width/time'][0]
    mhd_time = f['mhd/time'][0]
    
    # Extract some representative data from each group
    astra_q = f['astra/q'][:, 30]  # Take middle radial position (column 30 out of 61)
    width_width = f['width/width'][0]
    mhd_amplitude = f['mhd/amplitude'][0]
    
    print("Time Array Information:")
    print("-" * 40)
    
    for name, time_array in [('ASTRA', astra_time), ('WIDTH', width_time), ('MHD', mhd_time)]:
        valid_time = time_array[~np.isnan(time_array)]
        print(f"\n{name} group:")
        print(f"  Total points: {len(time_array)}")
        print(f"  Valid points: {len(valid_time)}")
        print(f"  Time range: {valid_time.min():.4f} - {valid_time.max():.4f} seconds")
        print(f"  Duration: {valid_time.max() - valid_time.min():.4f} seconds")
        print(f"  Time step (mean): {np.mean(np.diff(valid_time)):.6f} seconds")
    
    # Store data for plotting
    time_data = {
        'astra': {'time': astra_time, 'data': astra_q, 'label': 'ASTRA q-profile (mid-radius)'},
        'width': {'time': width_time, 'data': width_width, 'label': 'Width'},
        'mhd': {'time': mhd_time, 'data': mhd_amplitude, 'label': 'MHD Amplitude'}
    }
    
    print(f"\nData will be stored for interactive plotting...")
    print(f"ASTRA data shape: {astra_q.shape}")
    print(f"WIDTH data shape: {width_width.shape}")
    print(f"MHD data shape: {mhd_amplitude.shape}")

TIME ARRAY ANALYSIS FOR ALL GROUPS
Time Array Information:
----------------------------------------

ASTRA group:
  Total points: 2310
  Valid points: 2310
  Time range: 0.1000 - 2.4098 seconds
  Duration: 2.3098 seconds
  Time step (mean): 0.001000 seconds

WIDTH group:
  Total points: 1900
  Valid points: 1852
  Time range: 0.4725 - 2.3385 seconds
  Duration: 1.8660 seconds
  Time step (mean): 0.001008 seconds

MHD group:
  Total points: 29021
  Valid points: 29021
  Time range: 0.0100 - 2.9120 seconds
  Duration: 2.9020 seconds
  Time step (mean): 0.000100 seconds

Data will be stored for interactive plotting...
ASTRA data shape: (2310,)
WIDTH data shape: (1900,)
MHD data shape: (29021,)


In [None]:
# Create interactive plot with all three groups aligned by time
from scipy import interpolate

print("Creating interactive plot with time-aligned data...")

with h5py.File(file_path, 'r') as f:
    # Extract time arrays and data
    astra_time = f['astra/time'][0]
    astra_q = f['astra/q'][:, 30]  # Middle radial position
    
    width_time = f['width/time'][0]
    width_width = f['width/width'][0]
    
    mhd_time = f['mhd/time'][0]
    mhd_amplitude = f['mhd/amplitude'][0]
    
    # Define common time window (intersection of all valid ranges)
    # Find the overlapping time range
    astra_valid = astra_time[~np.isnan(astra_time)]
    width_valid = width_time[~np.isnan(width_time)]
    mhd_valid = mhd_time[~np.isnan(mhd_time)]
    
    time_start = max(astra_valid.min(), width_valid.min(), mhd_valid.min())
    time_end = min(astra_valid.max(), width_valid.max(), mhd_valid.max())
    
    print(f"Common time window: {time_start:.4f} to {time_end:.4f} seconds")
    
    # Create common time grid for interpolation
    common_time = np.linspace(time_start, time_end, 1000)
    
    # Interpolate all datasets to common time grid
    datasets = {}
    
    # ASTRA data
    astra_mask = ~np.isnan(astra_time) & ~np.isnan(astra_q)
    if np.sum(astra_mask) > 1:
        astra_interp = interpolate.interp1d(
            astra_time[astra_mask], astra_q[astra_mask], 
            kind='linear', bounds_error=False, fill_value=np.nan
        )
        datasets['ASTRA q-profile'] = astra_interp(common_time)
    
    # WIDTH data
    width_mask = ~np.isnan(width_time) & ~np.isnan(width_width)
    if np.sum(width_mask) > 1:
        width_interp = interpolate.interp1d(
            width_time[width_mask], width_width[width_mask],
            kind='linear', bounds_error=False, fill_value=np.nan
        )
        datasets['Width'] = width_interp(common_time)
    
    # MHD data
    mhd_mask = ~np.isnan(mhd_time) & ~np.isnan(mhd_amplitude)
    if np.sum(mhd_mask) > 1:
        mhd_interp = interpolate.interp1d(
            mhd_time[mhd_mask], mhd_amplitude[mhd_mask],
            kind='linear', bounds_error=False, fill_value=np.nan
        )
        datasets['MHD Amplitude'] = mhd_interp(common_time)

# Create interactive plot using Plotly
fig = make_subplots(
    rows=3, cols=1,
    shared_xaxes=True,
    subplot_titles=list(datasets.keys()),
    vertical_spacing=0.08,
    specs=[[{"secondary_y": False}],
           [{"secondary_y": False}],
           [{"secondary_y": False}]]
)

colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

for i, (name, data) in enumerate(datasets.items(), 1):
    fig.add_trace(
        go.Scatter(
            x=common_time,
            y=data,
            mode='lines',
            name=name,
            line=dict(color=colors[i-1], width=2),
            hovertemplate=f'<b>{name}</b><br>Time: %{{x:.4f}} s<br>Value: %{{y:.6f}}<extra></extra>'
        ),
        row=i, col=1
    )

# Update layout
fig.update_layout(
    height=800,
    title=dict(
        text="Time-Aligned Plasma Diagnostics Data",
        x=0.5,
        font=dict(size=16)
    ),
    showlegend=True,
    hovermode='x unified'
)

# Update x-axis for bottom subplot only
fig.update_xaxes(title_text="Time (seconds)", row=3, col=1)

# Update y-axes
fig.update_yaxes(title_text="q-profile", row=1, col=1)
fig.update_yaxes(title_text="Width", row=2, col=1)
fig.update_yaxes(title_text="Amplitude", row=3, col=1)

fig.show()

print(f"Plot created with {len(common_time)} interpolated points")
print(f"Time range: {common_time[0]:.4f} - {common_time[-1]:.4f} seconds")

Creating interactive plot with time-aligned data...
Common time window: 0.4725 to 2.3385 seconds


Plot created with 1000 interpolated points
Time range: 0.4725 - 2.3385 seconds
