## **EDA of Proxima Fusion Dataset**

In [24]:
import datasets
import torch
from pprint import pprint

In [25]:
# Importing Dataset
ds = datasets.load_dataset(
    "proxima-fusion/constellaration",
    split="train",
    num_proc=4,
)

### Perform basic filtering and print example of a dataset

In [26]:
ds = ds.select_columns(
    [
        c
        for c in ds.column_names
        if c.startswith("boundary.") or c.startswith("metrics.")
    ]
)
ds = ds.filter(
    lambda x: x == 3,
    input_columns=["boundary.n_field_periods"],
    num_proc=4,
)
ml_ds = ds.remove_columns(
    [
        "boundary.n_field_periods",
        "boundary.is_stellarator_symmetric",  # all same value
        "boundary.r_sin",
        "boundary.z_cos",  # empty
        "boundary.json",
        "metrics.json",
        "metrics.id",  # not needed
    ]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_ds = ml_ds.with_format(
    "torch", device=device
)  # other options: "jax", "tensorflow" etc.

for batch in torch.utils.data.DataLoader(torch_ds, batch_size=4, num_workers=4):
    pprint(batch)
    break

{'boundary.r_cos': tensor([[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00,
          -6.5763e-02, -3.8500e-02,  2.2178e-03,  4.6007e-04],
         [-6.6648e-04, -1.0976e-02,  5.6475e-02,  1.4193e-02,  8.3476e-02,
          -4.6767e-02, -1.3679e-02,  3.9562e-03,  1.0087e-04],
         [-3.5474e-04,  4.7144e-03,  8.3967e-04, -1.9705e-02, -9.4592e-03,
          -5.8859e-03,  1.0172e-03,  9.2020e-04, -2.0059e-04],
         [ 2.9056e-03,  1.6125e-04, -4.0626e-04, -8.0189e-03,  1.3228e-03,
          -5.3636e-04, -7.3536e-04,  3.4558e-05,  1.4845e-04],
         [-1.2475e-04, -4.9942e-04, -2.6091e-04, -5.6161e-04,  8.3187e-05,
          -1.2714e-04, -2.1174e-04,  4.1940e-06, -4.5643e-05]],

        [[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  9.9909e-01,
          -6.8512e-02, -8.1567e-02,  2.5140e-02, -2.4035e-03],
         [-3.4328e-03,  1.6768e-02,  1.2305e-02, -3.6708e-02,  1.0285e-01,
           1.1224e-02, -2.3418e-02, -5.4137e-04,  9.3986e-04],
         [-2.

### Try running the forward model on an example configuration

In [5]:
r_cos_example = batch["boundary.r_cos"][0]
z_sin_example = batch["boundary.z_sin"][0]

In [38]:
from constellaration.geometry.surface_rz_fourier import SurfaceRZFourier

# Construct boundary surface objects from torch tensors
surface = SurfaceRZFourier(r_cos=r_cos_example.numpy(), z_sin=z_sin_example.numpy())

In [39]:
from constellaration.forward_model import forward_model

# Forward model to compute metrics from the boundary surface
metrics, _ = forward_model(boundary=surface)

### Load and save batches of datasets

In [8]:
dataset = []
for big_batch in torch.utils.data.DataLoader(
    torch_ds, batch_size=5000, shuffle=True, num_workers=4
):
    dataset.append(big_batch)
    break

In [10]:
for i, mini_batch in enumerate(dataset):
    mini_batch["boundary.r_cos"] = torch.reshape(
        mini_batch["boundary.r_cos"],
        shape=(
            mini_batch["boundary.r_cos"].shape[0],
            mini_batch["boundary.r_cos"].shape[1]
            * mini_batch["boundary.r_cos"].shape[2],
        ),
    )

    mini_batch["boundary.z_sin"] = torch.reshape(
        mini_batch["boundary.z_sin"],
        shape=(
            mini_batch["boundary.z_sin"].shape[0],
            mini_batch["boundary.z_sin"].shape[1]
            * mini_batch["boundary.z_sin"].shape[2],
        ),
    )

    mini_batch = torch.concatenate(
        [
            mini_batch["boundary.r_cos"],
            mini_batch["boundary.z_sin"],
            mini_batch["metrics.max_elongation"].unsqueeze(-1),
            mini_batch["metrics.aspect_ratio"].unsqueeze(-1),
            mini_batch["metrics.average_triangularity"].unsqueeze(-1),
            mini_batch[
                "metrics.edge_rotational_transform_over_n_field_periods"
            ].unsqueeze(-1),
        ],
        dim=1,
    )

    torch.save(mini_batch, f"../data/batch_{i}.pt")

### PCA analysis for dimensionality reduction

In [52]:
data = dataset[0]["boundary.r_cos"]  # 0.9999 variance with 25 dims
# data = dataset[0]["boundary.z_sin"]  # 0.9999 variance with 28 dims

# perform decomposition on the data
u, s, vh = torch.linalg.svd(data, full_matrices=False)

# project data along singular vectors
data_transformed = torch.matmul(data, vh.T)

# calculate explained variance
explained_variance = torch.var(data_transformed, dim=0) / torch.var(data, dim=0).sum()

torch.cumsum(explained_variance, dim=0)

tensor([0.0048, 0.5646, 0.7232, 0.8158, 0.9041, 0.9492, 0.9698, 0.9794, 0.9847,
        0.9884, 0.9908, 0.9931, 0.9947, 0.9962, 0.9972, 0.9980, 0.9983, 0.9986,
        0.9989, 0.9991, 0.9993, 0.9994, 0.9995, 0.9996, 0.9996, 0.9997, 0.9997,
        0.9998, 0.9998, 0.9999, 0.9999, 0.9999, 0.9999, 0.9999, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])