In [2]:
import os
import numpy as np
import h5py
from sklearn.model_selection import train_test_split

# --- CONFIGURATION ---
buf = 1024  # Must match buffer size used when converting .bin to HDF5
test_size = 0.1  # Fraction of data to use as test set
seed = 42  # Random seed for reproducibility

# Path containing the HDF5 files generated from SDR .bin files
h5_folder_fp = "/Users/biratsapkota/Downloads/supported-format-files/"
folder = os.listdir(h5_folder_fp)
folder.sort()

print(f"Found {len(folder)} files in {h5_folder_fp}")


Found 32 files in /Users/biratsapkota/Downloads/supported-format-files/


In [3]:
# Initialize arrays to store full dataset and labels
dataset = np.zeros((1, buf, 2), dtype='f')        # shape: (samples, buf, 2)
dataset_labels = np.zeros((1, 4), dtype='i')      # shape: (samples, num_classes)

print("Initialized empty dataset arrays.")


Initialized empty dataset arrays.


In [4]:
for file in folder:
    file_path = os.path.join(h5_folder_fp, file)

    if os.path.isfile(file_path) and file.endswith(".h5"):
        # Open HDF5 file
        with h5py.File(file_path, 'r') as f:
            name = os.path.splitext(file)[0]
            data = f[name][()]

        print(f"Loaded {data.shape[0]} samples from {file}")

        # Append data to dataset
        dataset = np.concatenate((dataset, data), axis=0)

        # Generate labels from filename
        # Assumes filename starts with multi-hot label like '1010_filename.h5'
        label_str = name.split('_')[0]
        label = [int(c) for c in label_str]
        label = np.array([label] * data.shape[0], dtype='i')  # Repeat label for each sample
        dataset_labels = np.concatenate((dataset_labels, label), axis=0)


Loaded 3905 samples from 0000_day1.h5
Loaded 3905 samples from 0000_day2.h5
Loaded 3905 samples from 0001_day1.h5
Loaded 3905 samples from 0001_day2.h5
Loaded 3905 samples from 0010_day1.h5
Loaded 3905 samples from 0010_day2.h5
Loaded 3905 samples from 0011_day1.h5
Loaded 3905 samples from 0011_day2.h5
Loaded 3905 samples from 0100_day1.h5
Loaded 3905 samples from 0100_day2.h5
Loaded 3905 samples from 0101_day1.h5
Loaded 3905 samples from 0101_day2.h5
Loaded 3905 samples from 0110_day1.h5
Loaded 3905 samples from 0110_day2.h5
Loaded 3905 samples from 0111_day1.h5
Loaded 3905 samples from 0111_day2.h5
Loaded 3905 samples from 1000_day1.h5
Loaded 3905 samples from 1000_day2.h5
Loaded 3905 samples from 1001_day1.h5
Loaded 3905 samples from 1001_day2.h5
Loaded 3905 samples from 1010_day1.h5
Loaded 3905 samples from 1010_day2.h5
Loaded 3905 samples from 1011_day1.h5
Loaded 3905 samples from 1011_day2.h5
Loaded 3905 samples from 1100_day1.h5
Loaded 3905 samples from 1100_day2.h5
Loaded 3905 

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    dataset, dataset_labels, test_size=test_size, random_state=seed
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")


Training set: 112464 samples
Test set: 12497 samples


In [6]:
with h5py.File('./sdr_test.hdf5', 'w') as f_test:
    f_test.create_dataset('X', data=X_test, dtype='f')
    f_test.create_dataset('y', data=y_test, dtype='i')

print("Saved test set: sdr_test.hdf5")


Saved test set: sdr_test.hdf5


In [7]:
with h5py.File('./sdr_train.hdf5', 'w') as f_train:
    f_train.create_dataset('X', data=X_train, dtype='f')
    f_train.create_dataset('y', data=y_train, dtype='i')

print("Saved training set: sdr_train.hdf5")


Saved training set: sdr_train.hdf5
