# Importing External Data into TorchSig: Bring Your Own Data (BYOD) SigMF Example
This notebook shows an example of how to import externally created data into TorchSig using a basic SigMF file format.

This example provides a custom subclass of TorchSig's `FileReader` to read an externally created dataset as a `StaticTorchSigDataset`.

---

In [None]:
!pip install sigmf

In [None]:
import numpy as np
import datetime as dt
import os
from sigmf import SigMFFile, sigmffile
from typing import Tuple, Dict, List

# TorchSig
from torchsig.datasets.datasets import StaticTorchSigDataset
from torchsig.signals.signal_types import Signal
from torchsig.utils.file_handlers import FileReader
from torchsig.transforms.transforms import ComplexTo2D

## Step 1: External Data Generation Process: create synthetic data outside TorchSig workflow

If your data already exists somewhere, you can skip to Step 2.

We will write a sample dataset using SigMF data and metadata file formats.

### External Synthetic Data and Metadata Generation

In [None]:
# Configuration parameters used by the cells below.
root = "datasets/byod_sigmf_example"  # top-level folder that will hold the SigMF files
seed = 1234567890  # seed for the random number generator

# Create the output folder; exist_ok=True makes re-running this cell harmless.
os.makedirs(root, exist_ok=True)

Below, we generate some signals (outside of TorchSig).

In [None]:
# Parameters
fs = 1_000_000  # sample rate in Hz (1 MHz, the same for every signal)
num_samples = 1024  # number of I/Q samples per signal (fixed size)
dataset_size = 8  # number of signals in the dataset
labels = ["BPSK", "QPSK", "Noise"]  # three arbitrary metadata class labels (strings)
modcod = [0, 1, 2]  # three arbitrary metadata integers
rng = np.random.default_rng(seed)  # random number generator, seeded for repeatable draws

In [None]:
# Create user's external data: non-TorchSig synthetic data along with metadata

# One row per signal; each row holds num_samples complex I/Q samples.
signals_array = np.empty((dataset_size, num_samples), dtype=np.complex64)  # data
meta_rows = []  # one metadata dict per signal, in the same order as the rows above

t = np.arange(num_samples) / fs  # timesteps (NOTE(review): not used by the visible cells)

# QPSK constellation, indexed by symbol value 0..3. Built once, outside the
# loop, so the symbols can be mapped with a single array lookup.
qpsk_points = np.array([1 + 1j, 1 - 1j, -1 + 1j, -1 - 1j])

# create synthetic dataset elements
for idx in range(dataset_size):
    # Draw the label first, then the modcod: the order of rng calls determines
    # the generated dataset for a given seed.
    label = rng.choice(labels)
    mc = rng.choice(modcod)

    if label == "BPSK":
        bits = rng.integers(0, 2, num_samples)
        sig = (2 * bits - 1) + 0j  # map {0, 1} -> {-1, +1} as complex values
    elif label == "QPSK":
        bits = rng.integers(0, 4, num_samples)
        sig = qpsk_points[bits]  # integer indexing returns a fresh array (safe to modify)
    else:  # white noise
        sig = (rng.normal(size=num_samples) + 1j * rng.normal(size=num_samples)) * 0.1

    sig /= np.sqrt((np.abs(sig) ** 2).mean())  # normalize power for consistency
    signals_array[idx] = sig.astype(np.complex64)

    # add to metadata; store plain Python types rather than NumPy scalars
    meta_rows.append(dict(index=idx, label=str(label), modcod=int(mc), sample_rate=fs))

In [None]:
# Write and verify basic example SigMF data and metadata files

# SigMF stores samples sequentially, so we flatten the 2D
# data row-wise to simulate a wideband datastream
data_flattened = signals_array.flatten()

# write the aggregate binary data file (.sigmf-data)
data_filename = f"{root}/byod.sigmf-data"
meta_filename = f"{root}/byod.sigmf-meta"
data_flattened.tofile(data_filename)

# create the metadata file (.sigmf-meta)
meta = SigMFFile(
    data_file=data_filename,  # Link to the data file
    global_info={
        SigMFFile.DATATYPE_KEY: "cf32_le",  # Complex float32, little-endian
        SigMFFile.SAMPLE_RATE_KEY: fs,  # Sample rate in Hz
        SigMFFile.VERSION_KEY: "1.2.0",  # SigMF version
        SigMFFile.AUTHOR_KEY: "https://github.com/torchdsp/torchsig",
        SigMFFile.DESCRIPTION_KEY: "BYOD SigMF Example",
        "core:num_channels": 1,  # Specify number of channels
        "core:signal_length": num_samples,  # Number of I/Q samples in each signal
        "core:signal_count": dataset_size,  # Number of signals in data
    },
)

# add capture information (required)
meta.add_capture(
    0,
    metadata={
        SigMFFile.FREQUENCY_KEY: 2_450_000_000,  # specify some arbitrary center frequency in Hz
        # Timezone-aware "now" in UTC (datetime.utcnow() is deprecated since
        # Python 3.12); the "+00:00" offset is rewritten as the "Z" suffix.
        SigMFFile.DATETIME_KEY: dt.datetime.now(dt.timezone.utc)
        .isoformat()
        .replace("+00:00", "Z"),
    },
)

# save signal-specific metadata as annotations
for i, generated_metadata in enumerate(meta_rows):  # metadata for signal i
    sample_start_idx = i * num_samples  # signal's I/Q start index in data file
    meta.add_annotation(
        sample_start_idx,
        num_samples,
        metadata={
            SigMFFile.LABEL_KEY: generated_metadata["label"],
            # the integer modcod is stored as text in the comment field
            SigMFFile.COMMENT_KEY: str(generated_metadata["modcod"]),
        },
    )

# Validate and write the metadata file (.sigmf-meta)
assert not meta.validate()  # sigmf check
meta.tofile(meta_filename)

print("SigMF files created:")
print(f"  Data: {data_filename}")
print(f"  Metadata: {meta_filename}")

# check files: reload from disk and compare against what was written
loaded_sigmf = sigmffile.fromfile(meta_filename)
M = loaded_sigmf.get_global_field("core:signal_count")
N = loaded_sigmf.get_global_field("core:signal_length")
loaded_data = loaded_sigmf.read_samples()  # read all samples
print(f"Meta data size verified: {loaded_data.shape[0] == (M*N)}")
print(f"Data verified: {np.allclose(data_flattened, loaded_data)}")
print(f"Synthetic signals + metadata staged in {root}")

## Step 2: FileReader

To let TorchSig read your data from disk, you must write your own `FileReader` subclass so TorchSig knows how to handle your data. Make sure your `__init__` calls `super().__init__(root=root)`.

In [None]:
class BYODExampleFileHandler(FileReader):
    """File reader for the BYOD SigMF example dataset.

    Expects ``byod.sigmf-data`` and ``byod.sigmf-meta`` inside ``root``. The
    metadata file must carry the global fields ``core:signal_count`` and
    ``core:signal_length`` plus one annotation per signal holding
    ``core:label`` (class name) and ``core:comment`` (modcod as text).
    """

    def __init__(self, root: str):
        """Store the file paths and load the dataset-level metadata.

        Args:
            root: Folder containing the two SigMF files.

        Raises:
            ValueError: If the metadata file cannot be loaded.
        """
        super().__init__(root=root)

        self.data_filename = f"{root}/byod.sigmf-data"
        self.meta_filename = f"{root}/byod.sigmf-meta"
        self.data_size = None  # filled in lazily by __len__
        self.class_list = ["BPSK", "QPSK", "Noise"]
        self.dataset_metadata = self.load_dataset_metadata()

    def __len__(self) -> int:
        """Return the number of signals, read once from ``core:signal_count``.

        Raises:
            ValueError: If the metadata file cannot be loaded.
        """
        if self.data_size is None:
            try:
                loaded_sigmf = sigmffile.fromfile(self.meta_filename)
                self.data_size = loaded_sigmf.get_global_field("core:signal_count")
            except Exception as err:
                # Keep the ValueError contract, but preserve the root cause.
                raise ValueError(f"Error loading {self.meta_filename}") from err

        return self.data_size

    def load_dataset_metadata(self) -> Dict:
        """Build the dataset-level metadata dictionary from the SigMF globals.

        Returns:
            Dict with keys ``num_iq_samples_dataset`` (I/Q samples per signal),
            ``sample_rate``, ``class_list`` and ``num_samples`` (signal count).

        Raises:
            ValueError: If the metadata file cannot be loaded.
        """
        try:
            loaded_sigmf = sigmffile.fromfile(self.meta_filename)
            return {
                "num_iq_samples_dataset": loaded_sigmf.get_global_field(
                    "core:signal_length"
                ),
                "sample_rate": loaded_sigmf.get_global_field(
                    SigMFFile.SAMPLE_RATE_KEY
                ),
                "class_list": self.class_list,
                "num_samples": loaded_sigmf.get_global_field("core:signal_count"),
            }
        except Exception as err:
            raise ValueError(f"Error loading {self.meta_filename}") from err

    def read(self, idx: int) -> Signal:
        """Read the signal at position ``idx`` together with its metadata.

        The annotation at position ``idx`` supplies the sample range, the class
        label and the modcod for the signal.

        Args:
            idx: Position of the signal's annotation in the metadata file.

        Returns:
            A ``Signal`` holding the I/Q samples, no component signals, and a
            metadata dict with ``index``, ``sample_rate``, ``class_name``,
            ``class_index``, ``modcod`` and ``num_signals_max``.

        Raises:
            ValueError: If the file cannot be loaded or anything fails while
                extracting the signal (for example ``idx`` out of range or a
                label missing from ``class_list``); the original exception is
                attached as the cause.
        """
        try:
            sigmf_file = sigmffile.fromfile(
                self.meta_filename
            )  # creates data memory map access
            sample_rate = sigmf_file.get_global_field(SigMFFile.SAMPLE_RATE_KEY)
            annotations = sigmf_file.get_annotations()  # load metadata annotations

            sigmf_signal_meta = annotations[idx]
            metadata = {}
            metadata["index"] = idx
            metadata["sample_rate"] = sample_rate
            metadata["class_name"] = sigmf_signal_meta["core:label"]
            metadata["class_index"] = self.class_list.index(metadata["class_name"])
            # the comment field is returned as stored in the file (text)
            metadata["modcod"] = sigmf_signal_meta["core:comment"]

            # slice this signal's samples out of the aggregate recording
            start_idx = sigmf_signal_meta["core:sample_start"]
            stop_idx = start_idx + sigmf_signal_meta["core:sample_count"]
            data = sigmf_file[start_idx:stop_idx]

            metadata["num_signals_max"] = 1

            return Signal(data, component_signals=[], metadata=metadata)

        except Exception as err:
            raise ValueError(f"Error loading {self.meta_filename}") from err


# Quick check of the handler: build it, report its size, and read one element
test = BYODExampleFileHandler(root)
print(f"Size: {len(test)}")
print(f"Load element 2: {test.read(2)}")

## Step 3: StaticTorchSigDataset

Use `StaticTorchSigDataset` with the custom file handler defined above to interface with the dataset.

In [None]:
root = "datasets/byod_sigmf_example"  # folder containing the SigMF files

# Build the dataset with the custom file handler; target_labels=None here,
# and each item is used below as a single object exposing `.data`.
custom_dataset = StaticTorchSigDataset(
    root=root, file_handler_class=BYODExampleFileHandler, target_labels=None
)
print(f"Dataset size: {len(custom_dataset)}")

# fetch one element and show its samples and its full printout
sample = custom_dataset[4]
print(f"Data: {sample.data}")
print(sample)

In [None]:
# can apply transforms and metadata transforms
root = "datasets/byod_sigmf_example"

custom_dataset_2 = StaticTorchSigDataset(
    root=root,
    file_handler_class=BYODExampleFileHandler,
    transforms=[ComplexTo2D()],
    target_labels=[
        "modcod"
    ],  # transform complex data to 2D format  # return custom label
)
print(f"Dataset size: {len(custom_dataset_2)}")

data, label = custom_dataset_2[4]
print(f"Data element shape: {data.shape}")
print(f"Label: {label}")