# Training and Validation Splits for Gaussian Plumes

Since the Gaussian plumes are randomly generated and have IDs incremented from 0,
we can do the training/validation split by simply slicing the sorted list of files.

In [70]:
import json
from pathlib import Path, PurePosixPath
from tempfile import TemporaryDirectory

from azureml.fsspec import AzureMachineLearningFileSystem

from src.azure_wrap.blob_storage_sdk_v2 import DATASTORE_URI, upload_dir
from src.data.generation.plumes.generate import GAUSSIAN_PLUME_BLOB

# Open the datastore and list every .tif found under the "plumes" folder of the
# Gaussian plume blob. The list is sorted because the train/validation split
# below slices it by index.
fs = AzureMachineLearningFileSystem(DATASTORE_URI)
plume_glob = f"{GAUSSIAN_PLUME_BLOB / 'plumes'}/**/*.tif"
plume_files = sorted(fs.glob(plume_glob))

In [74]:
def format_plume_filename(file: str) -> str:
    """Return ``file`` reduced to its final path component, prefixed with ``azureml://``.

    Every directory component of the path is discarded; only the filename is
    kept. This is done because all the other plume files are stored like this.
    """
    basename = PurePosixPath(file).name
    return f"azureml://{basename}"

In [75]:
# Apply the URI formatting to every plume path, then preview the first five.
formatted_plume_files = list(map(format_plume_filename, plume_files))
formatted_plume_files[:5]

['azureml://gaussian_plume_0000001.tif',
 'azureml://gaussian_plume_0000002.tif',
 'azureml://gaussian_plume_0000003.tif',
 'azureml://gaussian_plume_0000004.tif',
 'azureml://gaussian_plume_0000005.tif']

In [76]:
train_percent = 0.8

# Entries before train_idx form the training split; the remainder is validation.
train_idx = int(train_percent * len(formatted_plume_files))
train_files, val_files = formatted_plume_files[:train_idx], formatted_plume_files[train_idx:]

# Displayed as (total, train, validation) counts.
len(formatted_plume_files), len(train_files), len(val_files)

(10000, 8000, 2000)

In [78]:
train_filename = "_gaussian_plume_uris_training.json"
val_filename = "_gaussian_plume_uris_validation.json"

# Dump each split's URI list as JSON into a scratch directory and upload the
# resulting files to the Gaussian plume blob. The scratch directory is deleted
# when the `with` block exits, so the uploads have to happen inside it.
with TemporaryDirectory() as tempdir:
    # TemporaryDirectory yields a plain string; wrap it in a Path (under a new
    # name rather than rebinding `tempdir`) so the `/` joins below work.
    staging_dir = Path(tempdir)

    for split_filename, split_uris in (
        (train_filename, train_files),
        (val_filename, val_files),
    ):
        with open(staging_dir / split_filename, "w") as f:
            json.dump(split_uris, f)

    # NOTE(review): upload_dir is given a single file path rather than a
    # directory here - confirm that the helper supports this.
    for split_filename in (train_filename, val_filename):
        upload_dir(staging_dir / split_filename, GAUSSIAN_PLUME_BLOB.as_posix(), recursive=False)