# Data Preprocessing for Solar Radiation Dataset

This notebook preprocesses the NSRDB (National Solar Radiation Database) H5 data and implements two splitting strategies:
1. Time-Based Split - Divides data chronologically
2. Spatial Split - Divides data by geographic locations

Both strategies allow for customization of train/validation/test ratios.

In [1]:
import h5py
import numpy as np
import os
from tqdm.notebook import tqdm
from typing import Dict, List, Tuple, Union, Optional


## Configuration

Set up configuration parameters for data splitting.

In [2]:
# Input data file: the NSRDB H5 file that is opened by load_h5_file / process_data.
# The path is relative, so it is resolved against the kernel's working directory.
INPUT_FILE = "data/NSRDB/vietnam_2016.h5"

# Output directory where the split H5 files are written (also a relative path)
OUTPUT_DIR = "data/processed"

# Create output directory if it doesn't exist
# (exist_ok=True makes re-running this cell safe)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Default split ratios (fractions of time steps or of locations); they sum to 1.0
DEFAULT_TRAIN_RATIO = 0.7
DEFAULT_VAL_RATIO = 0.15
DEFAULT_TEST_RATIO = 0.15


## Load and Explore Data

Load the H5 file and explore its structure.

In [3]:
def load_h5_file(file_path: str) -> h5py.File:
    """Open an H5 file in read-only mode and hand back the open handle.

    The caller owns the returned handle and is responsible for closing it.

    Args:
        file_path: Path to the H5 file

    Returns:
        h5py.File: Open, read-only H5 file object

    Raises:
        Exception: Whatever ``h5py.File`` raised. The error is printed first
            and then re-raised unchanged.
    """
    try:
        handle = h5py.File(file_path, 'r')
    except Exception as e:
        # Report the failing path, then let the original error propagate
        print(f"Error loading file {file_path}: {e}")
        raise
    else:
        return handle

# Open the source file; this handle stays open for the exploration cells below
h5_file = load_h5_file(INPUT_FILE)

# Print one line per top-level entry: name, shape and dtype
print("Datasets in the file:")
for key, dataset in h5_file.items():
    print(f"/{key} {dataset.shape}: {dataset.dtype}")


Datasets in the file:
/air_temperature (8784, 75361): int8
/coordinates (75361, 2): float32
/dhi (8784, 75361): int16
/dni (8784, 75361): int16
/ghi (8784, 75361): int16
/meta (75361,): [('gid', '<i4'), ('latitude', '<f4'), ('longitude', '<f4'), ('country', 'S8'), ('timezone', '<i2'), ('elevation', '<f4')]
/time_index (8784,): |S25
/wind_speed (8784, 75361): int16


In [4]:
# Examine time_index: read the whole dataset into memory and show both ends.
# The values are printed as raw byte strings (the file listing reports dtype |S25),
# so they are not decoded or parsed into datetimes here.
time_index = h5_file['time_index'][:]
print(f"First 5 timestamps: {time_index[:5]}")
print(f"Last 5 timestamps: {time_index[-5:]}")
print(f"Total time steps: {len(time_index)}")


First 5 timestamps: [b'2016-01-01 00:00:00+00:00' b'2016-01-01 01:00:00+00:00'
 b'2016-01-01 02:00:00+00:00' b'2016-01-01 03:00:00+00:00'
 b'2016-01-01 04:00:00+00:00']
Last 5 timestamps: [b'2016-12-31 19:00:00+00:00' b'2016-12-31 20:00:00+00:00'
 b'2016-12-31 21:00:00+00:00' b'2016-12-31 22:00:00+00:00'
 b'2016-12-31 23:00:00+00:00']
Total time steps: 8784


In [5]:
# Examine coordinates: read the whole dataset into memory (one row per location)
# NOTE(review): the two columns look like (latitude, longitude) judging by the
# sample values and the fields of /meta — confirm against the data source.
coordinates = h5_file['coordinates'][:]
print(f"Sample coordinates (first 5 rows):\n{coordinates[:5]}")
print(f"Total locations: {len(coordinates)}")


Sample coordinates (first 5 rows):
[[ 24.   100.  ]
 [ 23.95 100.  ]
 [ 23.9  100.  ]
 [ 23.85 100.  ]
 [ 23.8  100.  ]]
Total locations: 75361


## Data Splitting Strategies

Implement two different data splitting strategies:
1. Time-Based Split
2. Spatial Split

In [6]:
def validate_ratios(train_ratio: float, val_ratio: float, test_ratio: float) -> bool:
    """Check that the split ratios are usable and sum to approximately 1.0.

    Args:
        train_ratio: Ratio for training set
        val_ratio: Ratio for validation set
        test_ratio: Ratio for test set

    Returns:
        bool: True if the ratios already sum to 1.0 (within 1e-10). False if
        every ratio is valid on its own but the sum is not 1.0, in which case
        a warning is printed and the caller is expected to normalize them.

    Raises:
        ValueError: If any ratio is negative, NaN or infinite, or if the
            ratios sum to zero. Such values cannot be repaired by
            normalization (dividing by the sum would keep the sign problem,
            produce NaN, or divide by zero).
    """
    named_ratios = (
        ('train_ratio', train_ratio),
        ('val_ratio', val_ratio),
        ('test_ratio', test_ratio),
    )
    for name, value in named_ratios:
        # The chained comparison is False for negatives, NaN and +inf alike
        if not 0 <= value < float('inf'):
            raise ValueError(
                f"{name} must be a finite, non-negative number, got {value}"
            )

    total = train_ratio + val_ratio + test_ratio
    if total <= 0:
        raise ValueError("At least one split ratio must be greater than zero.")

    if abs(total - 1.0) > 1e-10:
        print(f"Warning: Ratios sum to {total}, not 1.0. Adjusting ratios.")
        return False
    return True


### 1. Time-Based Split

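Split the data chronologically along the time index: the first `train_ratio` of the time steps go to training, the next `val_ratio` to validation, and all remaining time steps to test. Every split keeps all locations.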

In [7]:
def time_based_split(
    h5_file: h5py.File,
    train_ratio: float = DEFAULT_TRAIN_RATIO,
    val_ratio: float = DEFAULT_VAL_RATIO,
    test_ratio: float = DEFAULT_TEST_RATIO
) -> Dict[str, Tuple[int, int]]:
    """Compute contiguous, chronological train/val/test ranges over the time axis.

    Only the length of ``h5_file['time_index']`` is read; no data is copied.

    Args:
        h5_file: H5 file object containing a 'time_index' dataset
        train_ratio: Proportion of time steps for training
        val_ratio: Proportion of time steps for validation
        test_ratio: Proportion of time steps for testing. It only takes part
            in validation/normalization; the test range is whatever remains
            after the train and validation ranges.

    Returns:
        Dict mapping 'train', 'val' and 'test' to (start_idx, end_idx) tuples,
        where end_idx is exclusive.
    """
    # Rescale the ratios so they sum to 1.0 when validation rejects them
    ratios_ok = validate_ratios(train_ratio, val_ratio, test_ratio)
    if not ratios_ok:
        ratio_sum = train_ratio + val_ratio + test_ratio
        train_ratio = train_ratio / ratio_sum
        val_ratio = val_ratio / ratio_sum
        test_ratio = test_ratio / ratio_sum

    num_timesteps = h5_file['time_index'].shape[0]

    # Boundaries are truncated towards zero; rounding leftovers end up in test
    train_end = int(num_timesteps * train_ratio)
    val_end = train_end + int(num_timesteps * val_ratio)

    # Pair consecutive boundaries into half-open (start_idx, end_idx) ranges
    boundaries = (0, train_end, val_end, num_timesteps)
    splits = dict(zip(('train', 'val', 'test'), zip(boundaries[:-1], boundaries[1:])))

    # Report inclusive index ranges and sample counts
    print(f"Time-based split:")
    print(f"  Train: {splits['train'][0]} to {splits['train'][1]-1} ({splits['train'][1] - splits['train'][0]} samples)")
    print(f"  Val:   {splits['val'][0]} to {splits['val'][1]-1} ({splits['val'][1] - splits['val'][0]} samples)")
    print(f"  Test:  {splits['test'][0]} to {splits['test'][1]-1} ({splits['test'][1] - splits['test'][0]} samples)")

    return splits


### 2. Spatial Split

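Split the data by location: grid points are assigned to train/validation/test at random (seeded shuffle), not by contiguous geographic region. Every split keeps the full time range for its own disjoint set of locations.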

In [16]:
def spatial_split(
    h5_file: h5py.File,
    train_ratio: float = DEFAULT_TRAIN_RATIO,
    val_ratio: float = DEFAULT_VAL_RATIO,
    test_ratio: float = DEFAULT_TEST_RATIO,
    random_seed: int = 42
) -> Dict[str, np.ndarray]:
    """Randomly assign location indices to train/val/test sets.

    Only the length of ``h5_file['coordinates']`` is read; no data is copied.
    Locations are shuffled with a seeded generator, cut into three consecutive
    groups, and each group is returned sorted in increasing order.

    Args:
        h5_file: H5 file object containing a 'coordinates' dataset
        train_ratio: Proportion of locations for training
        val_ratio: Proportion of locations for validation
        test_ratio: Proportion of locations for testing. It only takes part
            in validation/normalization; the test set is whatever remains
            after the train and validation sets.
        random_seed: Random seed for reproducibility

    Returns:
        Dict mapping 'train', 'val' and 'test' to sorted arrays of location
        indices. The three arrays are disjoint and together cover every
        location.
    """
    # Validate ratios
    if not validate_ratios(train_ratio, val_ratio, test_ratio):
        # Normalize ratios
        total = train_ratio + val_ratio + test_ratio
        train_ratio /= total
        val_ratio /= total
        test_ratio /= total

    num_locations = h5_file['coordinates'].shape[0]

    # Shuffle with a private legacy RandomState instead of np.random.seed():
    # it yields the same permutation for a given seed, but does not overwrite
    # the global NumPy random state that other code may depend on.
    rng = np.random.RandomState(random_seed)
    location_indices = np.arange(num_locations)
    rng.shuffle(location_indices)

    # Calculate split sizes (truncated; rounding leftovers end up in test)
    train_size = int(num_locations * train_ratio)
    val_size = int(num_locations * val_ratio)
    val_stop = train_size + val_size

    # Sort each group: h5py requires index lists to be in increasing order
    train_indices = np.sort(location_indices[:train_size])
    val_indices = np.sort(location_indices[train_size:val_stop])
    test_indices = np.sort(location_indices[val_stop:])

    splits = {
        'train': train_indices,
        'val': val_indices,
        'test': test_indices
    }

    # Print split information
    print("Spatial split:")
    print(f"  Train: {len(train_indices)} locations ({len(train_indices)/num_locations:.2%})")
    print(f"  Val:   {len(val_indices)} locations ({len(val_indices)/num_locations:.2%})")
    print(f"  Test:  {len(test_indices)} locations ({len(test_indices)/num_locations:.2%})")

    return splits


## Save Splits to H5 Files

Functions to save the generated splits to separate H5 files.

In [10]:
def save_time_based_split(
    h5_file: h5py.File,
    splits: Dict[str, Tuple[int, int]],
    output_dir: str
) -> None:
    """Save time-based splits to separate H5 files.

    One file named ``time_based_<split>.h5`` is written per entry in
    ``splits``. 'time_index' and every 2-D dataset whose first axis has the
    same length as 'time_index' are sliced to the split's time range; all
    other datasets are copied unchanged. HDF5 attributes of the file and of
    each dataset are carried over to the output.

    Args:
        h5_file: Source H5 file
        splits: Dictionary mapping split name to (start_idx, end_idx), with
            end_idx exclusive
        output_dir: Directory to save output files (must already exist)
    """
    def copy_attrs(source, target) -> None:
        # Carry over HDF5 attributes, which create_dataset(data=...) drops.
        # NOTE(review): NSRDB files usually keep scale factors / units there — confirm.
        for attr_name, attr_value in source.attrs.items():
            target.attrs[attr_name] = attr_value

    # Length of the time axis; constant for every split and dataset
    num_timesteps = h5_file['time_index'].shape[0]

    for split_name, (start_idx, end_idx) in tqdm(splits.items(), desc="Saving time-based splits"):
        output_file = os.path.join(output_dir, f"time_based_{split_name}.h5")

        with h5py.File(output_file, 'w') as out_file:
            copy_attrs(h5_file, out_file)

            for key in h5_file:
                dataset = h5_file[key]

                if key == 'time_index':
                    # Time stamps of this split
                    data = dataset[start_idx:end_idx]
                elif len(dataset.shape) == 2 and dataset.shape[0] == num_timesteps:
                    # Time-indexed dataset (e.g., ghi, dni, air_temperature)
                    data = dataset[start_idx:end_idx, :]
                else:
                    # Not time-indexed (coordinates, meta): copy as-is
                    data = dataset[:]

                out_dataset = out_file.create_dataset(key, data=data)
                copy_attrs(dataset, out_dataset)

        print(f"Saved {split_name} split to {output_file}")


In [17]:
def save_spatial_split(
    h5_file: h5py.File,
    splits: Dict[str, np.ndarray],
    output_dir: str,
    time_chunk_size: int = 256
) -> None:
    """Save spatial splits to separate H5 files.

    One file named ``spatial_<split>.h5`` is written per entry in ``splits``.
    'coordinates', 'meta' and every 2-D dataset whose second axis has the same
    length as 'coordinates' are restricted to the split's locations. Every
    other dataset (including 'time_index') is copied unchanged, matching what
    save_time_based_split does for datasets it does not slice. HDF5 attributes
    of the file and of each dataset are carried over to the output.

    Location selection is done in NumPy on contiguous blocks read from the
    source file rather than by passing the index list to h5py, because h5py
    fancy indexing with very long index lists performs poorly.

    Args:
        h5_file: Source H5 file
        splits: Dictionary mapping split name to an integer array of location
            indices
        output_dir: Directory to save output files (must already exist)
        time_chunk_size: Number of rows (time steps) of a location-indexed
            dataset that are read and written at a time; bounds peak memory

    Raises:
        ValueError: If time_chunk_size is smaller than 1
    """
    if time_chunk_size < 1:
        raise ValueError(f"time_chunk_size must be >= 1, got {time_chunk_size}")

    def copy_attrs(source, target) -> None:
        # Carry over HDF5 attributes, which create_dataset(...) does not copy.
        # NOTE(review): NSRDB files usually keep scale factors / units there — confirm.
        for attr_name, attr_value in source.attrs.items():
            target.attrs[attr_name] = attr_value

    # Number of locations in the source; constant for every split and dataset
    num_locations = h5_file['coordinates'].shape[0]

    for split_name, location_indices in tqdm(splits.items(), desc="Saving spatial splits"):
        location_indices = np.asarray(location_indices)
        output_file = os.path.join(output_dir, f"spatial_{split_name}.h5")

        with h5py.File(output_file, 'w') as out_file:
            copy_attrs(h5_file, out_file)

            for key in h5_file:
                dataset = h5_file[key]

                if key in ('coordinates', 'meta'):
                    # Per-location tables: load fully, then select rows in NumPy
                    out_dataset = out_file.create_dataset(
                        key, data=dataset[:][location_indices]
                    )

                elif (
                    key != 'time_index'
                    and len(dataset.shape) == 2
                    and dataset.shape[1] == num_locations
                ):
                    # Location-indexed data (e.g., ghi, dni, air_temperature):
                    # stream contiguous row blocks and pick the columns in memory
                    num_rows = dataset.shape[0]
                    out_dataset = out_file.create_dataset(
                        key,
                        shape=(num_rows, len(location_indices)),
                        dtype=dataset.dtype
                    )
                    if location_indices.size:
                        for row_start in range(0, num_rows, time_chunk_size):
                            row_stop = min(row_start + time_chunk_size, num_rows)
                            block = dataset[row_start:row_stop, :]
                            out_dataset[row_start:row_stop, :] = block[:, location_indices]

                else:
                    # time_index and anything not indexed by location: copy as-is
                    out_dataset = out_file.create_dataset(key, data=dataset[:])

                copy_attrs(dataset, out_dataset)

        print(f"Saved {split_name} split to {output_file}")


## User Interface

A single entry point, `process_data`, lets you choose a splitting strategy and customize the split ratios.

In [18]:
def process_data(
    strategy: str = 'time',  # 'time' or 'spatial'
    train_ratio: float = DEFAULT_TRAIN_RATIO,
    val_ratio: float = DEFAULT_VAL_RATIO,
    test_ratio: float = DEFAULT_TEST_RATIO,
    random_seed: int = 42
) -> None:
    """Split INPUT_FILE with the chosen strategy and write the results to OUTPUT_DIR.

    The source file is opened for the duration of the call and closed again
    afterwards, even when splitting or saving fails.

    Args:
        strategy: Splitting strategy ('time' or 'spatial')
        train_ratio: Proportion for training set
        val_ratio: Proportion for validation set
        test_ratio: Proportion for testing set
        random_seed: Random seed for spatial split (unused for 'time')

    Raises:
        ValueError: If strategy is neither 'time' nor 'spatial'
    """
    # Reject unknown strategies before touching any file
    if strategy != 'time' and strategy != 'spatial':
        raise ValueError(f"Invalid strategy: {strategy}. Must be 'time' or 'spatial'.")

    print(f"Processing data using {strategy}-based split strategy")
    print(f"Ratios - Train: {train_ratio:.2f}, Val: {val_ratio:.2f}, Test: {test_ratio:.2f}")

    # The context manager closes the source file on success and on error
    with load_h5_file(INPUT_FILE) as h5_file:
        if strategy == 'spatial':
            splits = spatial_split(h5_file, train_ratio, val_ratio, test_ratio, random_seed)
            save_spatial_split(h5_file, splits, OUTPUT_DIR)
        else:
            splits = time_based_split(h5_file, train_ratio, val_ratio, test_ratio)
            save_time_based_split(h5_file, splits, OUTPUT_DIR)

        print(f"Data processing complete. Files saved to {OUTPUT_DIR}")


## Run Data Processing

Execute data processing with desired settings.

In [None]:
# Splitting 1: Time-based split with default ratios (0.7 / 0.15 / 0.15)
# Writes time_based_train.h5, time_based_val.h5 and time_based_test.h5 into OUTPUT_DIR.
process_data(strategy='time')


Processing data using time-based split strategy
Ratios - Train: 0.70, Val: 0.15, Test: 0.15
Time-based split:
  Train: 0 to 6147 (6148 samples)
  Val:   6148 to 7464 (1317 samples)
  Test:  7465 to 8783 (1319 samples)


Saving time-based splits:   0%|          | 0/3 [00:00<?, ?it/s]

Saved train split to data/processed/time_based_train.h5
Saved val split to data/processed/time_based_val.h5
Saved test split to data/processed/time_based_test.h5
Data processing complete. Files saved to data/processed


In [None]:
# Splitting 2: Spatial split with default ratios (0.7 / 0.15 / 0.15) and the default random_seed (42)
# Writes spatial_train.h5, spatial_val.h5 and spatial_test.h5 into OUTPUT_DIR.
# NOTE(review): the output recorded for this cell stops at the 0% progress bar with no
# "Saved ..." lines — confirm the run actually finished before relying on these files.
process_data(strategy='spatial')


Processing data using spatial-based split strategy
Ratios - Train: 0.70, Val: 0.15, Test: 0.15
Spatial split:
  Train: 52752 locations (70.00%)
  Val:   11304 locations (15.00%)
  Test:  11305 locations (15.00%)


Saving spatial splits:   0%|          | 0/3 [00:00<?, ?it/s]

## Checking Split Results

This section lets you examine the generated split files.

In [None]:
def check_split_file(file_path: str) -> None:
    """Print a short summary of a split file.

    Lists every top-level entry with its shape and dtype, then the first and
    last 'time_index' values and the number of rows in 'coordinates' when
    those datasets are present. Any error is caught and printed rather than
    raised, so one unreadable file does not stop a loop over many files.

    Args:
        file_path: Path to the split file
    """
    try:
        with h5py.File(file_path, 'r') as f:
            print(f"\nFile: {file_path}")

            # One line per entry: name, shape, dtype
            for key, dataset in f.items():
                print(f"  /{key} {dataset.shape}: {dataset.dtype}")

            # Summaries for the datasets that define the split extents
            has_time_index = 'time_index' in f
            if has_time_index:
                print(f"  Time range: {f['time_index'][0]} to {f['time_index'][-1]}")

            has_coordinates = 'coordinates' in f
            if has_coordinates:
                print(f"  Number of locations: {len(f['coordinates'])}")
    except Exception as e:
        # Best-effort inspection: report the problem and carry on
        print(f"Error checking file {file_path}: {e}")


In [None]:
# Collect the names of all H5 files in the output directory (listing order is kept)
output_files = list(filter(lambda name: name.endswith('.h5'), os.listdir(OUTPUT_DIR)))
print(f"Split files generated: {output_files}")

# Summarize every split file that was found
for file_name in output_files:
    split_path = os.path.join(OUTPUT_DIR, file_name)
    check_split_file(split_path)
