# Importing data from medmnist

To configure the data catalog, run the below cells. These cells will download the data from the medmnist dataset and save it in the `data` directory as a dataframe encapsuled in a pickle file.


In [None]:
# !pip install medmnist

In [1]:
from medmnist import DermaMNIST

In [2]:
data_train = DermaMNIST(split="train", download=True, root="../data/01_raw/")

Downloading https://zenodo.org/records/10519652/files/dermamnist.npz?download=1 to ../data/01_raw/dermamnist.npz


100%|██████████| 19.7M/19.7M [00:52<00:00, 379kB/s] 


In [3]:
import numpy as np

ds = np.load("../data/01_raw/dermamnist.npz")

In [4]:
train_images = ds["train_images"]
train_labels = ds["train_labels"]
val_images = ds["val_images"]
val_labels = ds["val_labels"]
test_images = ds["test_images"]
test_labels = ds["test_labels"]

In [5]:
train_len = len(train_images)
val_len = len(val_images)
test_len = len(test_images)

In [6]:
# generate ids array
train_ids = [f"train_{i}" for i in range(train_len)]
val_ids = [f"val_{i}" for i in range(val_len)]
test_ids = [f"test_{i}" for i in range(test_len)]

In [7]:
train_images = list(train_images)
val_images = list(val_images)
test_images = list(test_images)

In [8]:
train_labels = list(train_labels)
val_labels = list(val_labels)
test_labels = list(test_labels)

In [9]:
# construct a df for each of the splits
import pandas as pd

train_df = pd.DataFrame(
    {
        "id": train_ids,
        "image": train_images,
        "label": train_labels,
    }
)
test_df = pd.DataFrame(
    {
        "id": test_ids,
        "image": test_images,
        "label": test_labels,
    }
)
val_df = pd.DataFrame(
    {
        "id": val_ids,
        "image": val_images,
        "label": val_labels,
    }
)

In [10]:
train_df.to_pickle("../data/01_raw/train.pkl")
test_df.to_pickle("../data/01_raw/test.pkl")
val_df.to_pickle("../data/01_raw/val.pkl")

In [19]:
train_df_loaded = pd.read_pickle("../data/01_raw/train.pkl")

In [24]:
train_df_loaded["image"].iloc[0].shape

(28, 28, 3)

In [15]:
def normalizing_images(data: pd.DataFrame) -> pd.DataFrame:
    data["image"] = data["image"].apply(lambda x: x / 255.0)
    return data

In [17]:
train_df_loaded_new = normalizing_images(train_df_loaded)
train_df_loaded_new

Unnamed: 0,id,image,label
0,train_0,"[[[0.002429834678969627, 0.001707035755478662,...",[0]
1,train_1,"[[[0.003537101114955786, 0.001707035755478662,...",[5]
2,train_2,"[[[0.003521722414455979, 0.002399077277970012,...",[5]
3,train_3,"[[[0.003506343713956171, 0.0019530949634755863...",[5]
4,train_4,"[[[0.0033217993079584776, 0.002891195693963860...",[4]
...,...,...,...
7002,train_7002,"[[[0.0030911188004613607, 0.001953094963475586...",[5]
7003,train_7003,"[[[0.003506343713956171, 0.0018762014609765476...",[5]
7004,train_7004,"[[[0.00030757400999615535, 0.00027681660899653...",[2]
7005,train_7005,"[[[0.002629757785467128, 0.002168396770472895,...",[5]


In [25]:
from torchvision import transforms


def tensoring_resizing(data: pd.DataFrame) -> pd.DataFrame:
    transform = transforms.Compose(
        [transforms.ToPILImage(), transforms.Resize((28, 28)), transforms.ToTensor()]
    )

    data["image"] = data["image"].apply(lambda x: transform(x).permute(1, 2, 0).numpy())
    return data

In [27]:
train_df_loaded_new = tensoring_resizing(train_df_loaded_new)
train_df_loaded_new["image"].iloc[0].shape

(28, 28, 3)

In [37]:
%load_ext kedro.ipython

In [2]:
from kedro.io.data_catalog import DataCatalog

catalog = DataCatalog.from_config("../conf/base/catalog.yml")

AttributeError: 'str' object has no attribute 'items'

In [43]:
catalog.save("train_raw", train_df)
catalog.save("test_raw", test_df)
catalog.save("val_raw", val_df)

In [45]:
df = catalog.load("train_raw")

In [49]:
df["image"][0].shape

[1m([0m[1;36m28[0m, [1;36m28[0m, [1;36m3[0m[1m)[0m

In [1]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
device

device(type='cuda')