# Importing External Data into TorchSig: Bring Your Own Data (BYOD) NumPy
This notebook shows how to import externally created data into TorchSig using a basic example file format: NumPy (`.npy`) signal data, CSV per-sample metadata, and a JSON file with dataset-level information.

---

The main code that the user must write is a subclass of `ExternalFileHandler`, which will be passed into an `ExternalTorchSigDataset`. The `ExternalFileHandler` subclass must implement 3 methods:
| Method | Arguments | Return | Description |
| ------ | --------- | ------ | ----------- |
| `size` | N/A | int | Number of data samples, dataset size |
| `load_dataset_metadata` | N/A | `ExternalDatasetMetadata` | Dataset information, see `datasets/dataset_metadata.py` for more information. |
| `load` | idx: int | (np.ndarray, List[Any]) | Load sample `idx`, which includes data as np.ndarray and targets as a list. |

If you want to apply TorchSig's transforms and impairments to your data, note that `load` must return targets that are in `List[Dict]` format, where each dict describes a signal. Additionally, the dict must have the fields required by each transform, e.g., `FamilyName` target transform requires the signal to have `class_name` in its metadata. It is up to the user to figure out what metadata is needed for what transforms/target transforms they wish to use.

In [None]:
import numpy as np
import os
import csv
import json
from typing import Tuple, Dict, List, Any
import itertools
import pprint

# TorchSig
from torchsig.datasets.datasets import ExternalTorchSigDataset
from torchsig.datasets.dataset_metadata import ExternalDatasetMetadata
from torchsig.utils.file_handlers import ExternalFileHandler
from torchsig.transforms.transforms import ComplexTo2D

## Step 1: External Data Generation Process: create synthetic data outside TorchSig workflow

If your data already exists somewhere, you can skip to Step 2.

We will write a sample dataset using NumPy's `.npy` format for signal data, CSV for per-sample metadata, and JSON for dataset-level information.

### External Synthetic Data and Metadata Generation

In [None]:
# Notebook-wide configuration
seed = 1234567890                      # seed for the random number generator
root = 'datasets/byod_npy_example'     # top-level folder holding the data files

# Create the data folder (and any missing parents); an existing folder is fine.
os.makedirs(name=root, exist_ok=True)

Below, we generate some signals (outside of TorchSig).

In [None]:
# Signal generation parameters

# Reproducible random number generator, seeded from the configuration cell
rng = np.random.default_rng(seed)

# Dataset shape
dataset_size = 8        # number of items in the dataset
num_samples = 1024      # IQ samples per item (same for every item)
fs = 1_000_000          # sample rate in Hz (1 MHz, same for every item)

# Per-item metadata is drawn from these two lists
labels = ['BPSK', 'QPSK', 'Noise']   # three arbitrary class labels (strings)
modcod = [0, 1, 2]                   # three arbitrary integers

In [None]:
# Build the user's external (non-TorchSig) synthetic dataset and its metadata

# Every signal and every metadata row is held in memory until written out below
signals_array = np.empty((dataset_size, num_samples), dtype=np.complex64)
meta_rows = []

t = np.arange(num_samples) / fs  # timesteps

for idx in range(dataset_size):
    # Draw order is part of the result for a fixed seed: label first, then modcod
    label = rng.choice(labels)
    mc = rng.choice(modcod)

    if label not in ("BPSK", "QPSK"):
        # White noise: the real part is drawn before the imaginary part
        noise_re = rng.normal(size=num_samples)
        noise_im = rng.normal(size=num_samples)
        sig = (noise_re + 1j*noise_im) * 0.1
    elif label == "BPSK":
        # Map bits {0, 1} onto symbols {-1, +1}
        bits = rng.integers(0, 2, num_samples)
        sig = (2*bits - 1).astype(np.complex128)
    else:
        # QPSK: map each symbol value 0..3 onto one constellation point
        bits = rng.integers(0, 4, num_samples)
        table = {0:1+1j, 1:1-1j, 2:-1+1j, 3:-1-1j}
        sig = np.array([table[b] for b in bits.tolist()])

    # Scale to unit average power so all classes are comparable
    sig = sig / np.sqrt(np.mean(np.abs(sig)**2))
    signals_array[idx] = sig.astype(np.complex64)

    # One metadata row per signal
    meta_rows.append({
        "index": idx,
        "label": label,
        "modcod": mc,
        "sample_rate": fs,
    })

# Dataset-level information goes into a JSON file
global_metadata = dict(
    size=dataset_size,
    num_samples=num_samples,
    class_labels=labels,
    sample_rate=fs,
)
with open(f"{root}/info.json", 'w') as f:
    json.dump(global_metadata, f, indent=4)

# All signals go into a single npy file
np.save(f"{root}/data.npy", signals_array)

# Per-signal metadata goes into a CSV file; no header row is written
with open(f"{root}/metadata.csv", 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=list(meta_rows[0]))
    for meta_row in meta_rows:
        writer.writerow(meta_row)

print(f"Synthetic signals + metadata staged in {root}")

## Step 2. ExternalFileHandler

To have your data on disk interface with TorchSig, you must write your own `ExternalFileHandler` subclass so TorchSig knows how to handle your data. Make sure your `__init__` calls `super().__init__(root=root)`.

Note that the metadata must at least have:
- `class_name`
- `class_index`

In [None]:
class BYODExampleFileHandler(ExternalFileHandler):
    """File handler for the BYOD NumPy example dataset.

    Reads three files located under ``self.root``:

    - ``info.json``: dataset-level information with the keys ``size``,
      ``num_samples``, ``class_labels`` and ``sample_rate``.
    - ``data.npy``: all signals in one array, indexed by sample along the
      first axis.
    - ``metadata.csv``: one header-less row per sample with the columns
      ``index, label, modcod, sample_rate``.
    """

    def __init__(
        self,
        root: str
    ):
        """Create a handler for the dataset stored under ``root``.

        Args:
            root: Top-level folder containing the dataset files.
        """
        super().__init__(root=root)

        # Order defines the class index reported by `load`.
        self.class_list = ['BPSK', 'QPSK', 'Noise']

    def _load_info(self) -> Dict[str, Any]:
        """Read ``info.json`` and return its contents as a dict."""
        with open(f"{self.root}/info.json", "r") as f:
            return json.load(f)

    def size(self) -> int:
        """Return the number of samples in the dataset.

        Returns:
            The ``size`` entry of ``info.json``.

        Raises:
            ValueError: If ``info.json`` cannot be read or has no ``size``.
        """
        try:
            return self._load_info()["size"]
        except Exception as err:
            # Use self.root (not a notebook-level global) so the message is
            # correct wherever the class is used.
            raise ValueError(f"Error loading {self.root}/info.json") from err

    def load_dataset_metadata(self) -> ExternalDatasetMetadata:
        """Build the dataset-level metadata object from ``info.json``.

        Returns:
            An ``ExternalDatasetMetadata`` populated from ``info.json``.

        Raises:
            ValueError: If ``info.json`` cannot be read, lacks a required
                key, or the metadata object cannot be constructed.
        """
        try:
            dataset_info = self._load_info()

            return ExternalDatasetMetadata(
                # minimum fields required for ExternalDatasetMetadata
                num_iq_samples_dataset = dataset_info["num_samples"],
                sample_rate = dataset_info["sample_rate"],
                class_list = dataset_info["class_labels"],
                num_samples = dataset_info["size"]
            )
        except Exception as err:
            raise ValueError(f"Error loading {self.root}/info.json") from err

    def load(self, idx: int) -> Tuple[np.ndarray, List[Dict]]:
        """Load one sample and its metadata.

        Args:
            idx: Zero-based index of the sample to load.

        Returns:
            A tuple ``(data, [metadata])``: ``data`` is row ``idx`` of
            ``data.npy`` and ``metadata`` is the matching CSV row as a dict,
            with ``index`` and ``modcod`` converted to int, ``sample_rate``
            converted to float, and ``class_name`` / ``class_index`` added.

        Raises:
            ValueError: If the data or metadata cannot be read, or ``idx`` is
                out of bounds. The original exception is attached as the cause.
        """
        try:
            # loads entire data to access an element: inefficient, but acceptable for a
            # small basic example - use memory mapping or another format for better efficiency
            data = np.load(f"{self.root}/data.npy")[idx]

            # newline="" lets the csv module handle line endings itself
            with open(f"{self.root}/metadata.csv", "r", newline="") as f:
                # the file has no header row, so column names are given here
                reader = csv.DictReader(f, fieldnames=["index", "label", "modcod", "sample_rate"])
                # skip ahead to row idx without reading the rest of the file
                row = next(itertools.islice(reader, idx, idx + 1), None)

            if row is None:
                raise IndexError(f"Metadata idx {idx} is out of bounds")

            # CSV values are read back as strings; restore the numeric types
            row["index"] = int(row["index"])
            row["modcod"] = int(row["modcod"])
            row["sample_rate"] = float(row["sample_rate"])
            # add class_name (lower-cased label)
            row["class_name"] = row["label"].lower()
            # add class index (position of the label in self.class_list)
            row["class_index"] = self.class_list.index(row["label"])

            return data, [row]
        except Exception as err:
            raise ValueError(
                f"Error loading sample {idx} from {self.root}/data.npy "
                f"and {self.root}/metadata.csv"
            ) from err

# Exercise each handler method once, on its own, before handing it to TorchSig
test = BYODExampleFileHandler(root=root)

handler_size = test.size()
print(f'Size: {handler_size}')

handler_metadata = test.load_dataset_metadata()
print(f'Metadata: {handler_metadata}')

element_2 = test.load(2)
print(f'Load element 2: {element_2}')

## Step 3: ExternalTorchSigDataset

Use `ExternalTorchSigDataset` and custom file handler (above) to load in data.

In [None]:
# Folder holding the staged example dataset
root = 'datasets/byod_npy_example'

# Wrap the custom file handler in a TorchSig dataset, with no target labels
byod_handler = BYODExampleFileHandler(root)
custom_dataset = ExternalTorchSigDataset(
    file_handler=byod_handler,
    target_labels=None,
)
dataset_length = len(custom_dataset)
print(f"Dataset size: {dataset_length}")

# Fetch one item and show its data and the metadata of each of its signals
sample = custom_dataset[4]
print(f"data: {sample.data}")
full_metadata = list(map(lambda m: m.to_dict(), sample.get_full_metadata()))
print(f"metadata: {full_metadata}")

In [None]:
# Same dataset, now with a transform and a target label applied
root = 'datasets/byod_npy_example'

byod_handler_2 = BYODExampleFileHandler(root)
custom_dataset_2 = ExternalTorchSigDataset(
    file_handler=byod_handler_2,
    transforms=[ComplexTo2D()],
    target_labels=["modcod"],
)
dataset_length_2 = len(custom_dataset_2)
print(f"Dataset size: {dataset_length_2}")

# With target labels set, the item is unpacked into data and target
data, metadata = custom_dataset_2[4]
data_shape = data.shape
print(f"data: {data_shape}")
print(f"metadata: {metadata}")