# Test Local Shared Data

Here we will test the loading of the data form the `DVC Data Registry` and mounted to the devcontaiener.

In [1]:
import dvc.api
import ray
import json

  from .autonotebook import tqdm as notebook_tqdm
2025-11-14 02:15:43,368	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
REPO = "https://github.com/OpenCloudHub/data-registry"

In [5]:
def load_versioned_data(dataset_name, version="v1.0.0"):
    """Load specific dataset version."""

    train_path = dvc.api.get_url(
        "data/fashion-mnist/processed/train/train.parquet", repo=REPO, rev=version
    )
    val_path = dvc.api.get_url(
        "data/fashion-mnist/processed/val/val.parquet", repo=REPO, rev=version
    )

    metrics_content = dvc.api.read(
        "data/fashion-mnist/processed/metadata.json", repo=REPO, rev=version
    )
    metadata = json.loads(metrics_content)

    train_ds = ray.data.read_parquet(train_path)
    val_ds = ray.data.read_parquet(val_path)

    return train_ds, val_ds, metadata

### Compare Versions

#### V3

In [7]:
train_ds_v3, val_ds_v3, metadata_v3 = load_versioned_data("fashion-mnist", "v0.0.3")

Parquet Files Sample 0: 100%|██████████| 1.00/1.00 [00:00<00:00, 3.91 file/s]
Parquet Files Sample 0: 100%|██████████| 1.00/1.00 [00:00<00:00, 23.1 file/s]


In [8]:
train_ds_v3.count()

30000

In [9]:
print("Version v0.0.3 Metadata:", metadata_v3)

Version v0.0.3 Metadata: {'dataset': {'name': 'fashion-mnist', 'description': 'Fashion-MNIST dataset - grayscale images of fashion items', 'source': 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com'}, 'schema': {'features': {'image': {'dtype': 'uint8', 'shape': [28, 28], 'description': '28x28 grayscale image'}, 'label': {'dtype': 'uint8', 'shape': [], 'description': 'Class label (0-9)', 'classes': {'0': 'T-shirt/top', '1': 'Trouser', '2': 'Pullover', '3': 'Dress', '4': 'Coat', '5': 'Sandal', '6': 'Shirt', '7': 'Sneaker', '8': 'Bag', '9': 'Ankle boot'}}}}, 'splits': {'train': {'num_samples': 30000, 'num_features': 2, 'class_distribution': {'T-shirt/top': 2966, 'Trouser': 3023, 'Pullover': 3034, 'Dress': 3018, 'Coat': 3033, 'Sandal': 2971, 'Shirt': 3006, 'Sneaker': 2955, 'Bag': 2961, 'Ankle boot': 3033}}, 'val': {'num_samples': 5000, 'num_features': 2, 'class_distribution': {'T-shirt/top': 487, 'Trouser': 492, 'Pullover': 514, 'Dress': 478, 'Coat': 506, 'Sandal': 519, 'Shirt':

In [10]:
# Test row
row_v3 = train_ds_v3.take(1)[0]

2025-11-14 02:17:50,575	INFO dataset.py:3055 -- Tip: Use `take_batch()` instead of `take() / show()` to return records in pandas or numpy batch format.
2025-11-14 02:17:50,585	INFO logging.py:295 -- Registered dataset logger for dataset dataset_4_0
2025-11-14 02:17:50,585	INFO logging.py:295 -- Registered dataset logger for dataset dataset_4_0
2025-11-14 02:17:50,624	INFO streaming_executor.py:117 -- Starting execution of Dataset dataset_4_0. Full logs are in /tmp/ray/session_2025-11-14_02-16-47_976496_135762/logs/ray-data
2025-11-14 02:17:50,626	INFO streaming_executor.py:118 -- Execution plan of Dataset dataset_4_0: InputDataBuffer[Input] -> TaskPoolMapOperator[ReadParquet] -> LimitOperator[limit=1]
Running 0: 0.00 row [00:00, ? row/s]
- ReadParquet->SplitBlocks(185) 1: 0.00 row [00:00, ? row/s]


- ReadParquet->SplitBlocks(185): Tasks: 1; Actors: 0; Queued blocks: 0; Resources: 1.0 CPU, 1.5MB object store: : 0.00 row [00:01, ? row/s]
- ReadParquet->SplitBlocks(185): Tasks: 1; Actors

In [14]:
for key, value in row_v3.items():
    print(f"{key}: {value}")
    print(f"Type: {type(value)}")

image: [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 4, 0, 0, 147, 125, 106, 113, 27, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1, 0, 0, 0, 3, 0, 27, 198, 59, 0, 0, 16, 124, 40, 0, 2, 1, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 173, 50, 0, 0, 0, 0, 0, 137, 0, 0, 0, 1, 1, 0, 0, 1, 0], [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 3, 140, 0, 0, 0, 0, 10, 0, 101, 45, 0, 4, 0, 0, 1, 1, 0, 0], [0, 0, 0, 0, 0, 1, 0, 0, 4, 0, 29, 145, 0, 0, 2, 2, 3, 0, 43, 101, 0, 7, 0, 0, 0, 1, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 105, 139, 0, 0, 0, 0, 1, 0, 7, 155, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 3, 5, 0, 1, 2, 0, 0, 184, 125, 0, 3, 1, 2, 4, 0, 0, 144, 8, 0, 0, 0, 0, 0, 3, 1], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 231, 44, 0, 0, 0, 0, 0, 0, 0, 152, 36, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 99, 99, 82, 84, 83, 59, 75, 207, 68, 37, 61, 61, 63, 61, 59, 54, 183, 

In [None]:
# Shape of image array
print("Image shape:", row_v3["image"].shape)

AttributeError: 'list' object has no attribute 'shape'

: 

In [None]:
ray.shutdown()

In [1]:
import mlflow

In [2]:
info = mlflow.models.get_model_info("models:/ci.fashion-mnist-classifier/15")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 426.16it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 426.16it/s]


In [7]:
info.run_id

'c7aa952cb8a842349976bab43fdfb9e5'

In [8]:
client = mlflow.tracking.MlflowClient()
run = client.get_run(info.run_id)

In [10]:
data_version = run.data.tags.get("dvc_data_version")
data_version

'v0.0.3'