In [1]:
%load_ext autoreload
%autoreload 2

import os
from pprint import pprint

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from PIL import Image

from datasets import Dataset, DatasetDict, load_dataset, concatenate_datasets
from huggingface_hub import HfApi
from tqdm import tqdm

import isic
from isic import printenc, println, printdash

SEED = 42

In [2]:
printenc("Archive set processing")

# Load the complete (archive) dataset wrapper and its processed metadata table.
ds_cp = isic.get_complete()
md_cp = ds_cp.get_proc_metadata()
printenc("Dataset integrity", ds_cp.verify_integrity())

# Keep dermoscopic images only, then report what is left per image type.
md_cp = md_cp.loc[md_cp['image_type'] == 'dermoscopic']
printenc("Filter non-dermoscopic", md_cp['image_type'].value_counts())

# Drop the 'indeterminate/malignant' category; every other value is kept.
md_cp = md_cp.loc[md_cp['benign_malignant'] != 'indeterminate/malignant']
printenc(
    "Filter indeterminate/malignant because small sample size & uncertain how to deal with.",
    md_cp['benign_malignant'].value_counts(),
)

# Diagnosis -> binary label (0 = benign, 1 = malignant), used for rows whose
# benign_malignant field is missing.
# GPT generated but verified with statistics in 2024 and complete sets
# Prompted with list of cancer names and required to map benign to 0 and malignant to 1 in python-style dict
_CP_DIAG_TO_LABEL = {
    'actinic keratosis': 0,  # Benign (pre-cancerous, but not malignant)
    'atypical spitz tumor': 0,  # Uncertain but all other cases indeterminate -> label 0
    'basal cell carcinoma': 1,  # Malignant
    'dermatofibroma': 0,  # Benign
    'melanoma': 1,  # Malignant
    'pigmented benign keratosis': 0,  # Benign
    'seborrheic keratosis': 0,  # Benign
    'solar lentigo': 0,  # Benign
    'squamous cell carcinoma': 1,  # Malignant
    'vascular lesion': 0,  # Uncertain but all other cases treated benign -> 0
}


def cp_nan_diag_maps(diag) -> float:
    """Map a diagnosis name to a binary label for rows lacking benign_malignant.

    Args:
        diag: diagnosis string, or a missing value (NaN / None / pd.NA).

    Returns:
        0 for benign, 1 for malignant, or ``np.nan`` when the diagnosis itself
        is missing (the row then carries no usable label at all).

    Raises:
        KeyError: if ``diag`` is a value that has no entry in the mapping.
    """
    # pd.isna also accepts None and pd.NA, which np.isnan rejects with TypeError.
    if pd.isna(diag):
        # the entry has both nan benign_malignant and nan diagnosis
        # it's hopeless.
        return np.nan

    try:
        return _CP_DIAG_TO_LABEL[diag]
    except KeyError:
        raise KeyError(f"no benign/malignant mapping for diagnosis {diag!r}") from None

def cp_label_maps(row, query_inferred: bool = False) -> float:
    """Derive the binary training label for one archive metadata row.

    Args:
        row: mapping/Series providing 'benign_malignant' and 'diagnosis'.
        query_inferred: when True, return the "inferred" flag instead of the
            label.

    Returns:
        The label (0 benign, 1 malignant; whatever ``cp_nan_diag_maps`` yields
        when benign_malignant is missing), or, if ``query_inferred`` is set,
        1 when the label came from the diagnosis and 0 when it came directly
        from benign_malignant.

    Raises:
        AssertionError: if benign_malignant holds a value this function does
            not know how to label.
    """
    status = row['benign_malignant']

    # Sentinels: to make sure we consciously assign these values.
    inferred = -1
    label = -1

    # Test for missing first. pd.isna is safe for str/None/pd.NA, whereas
    # np.isnan raises TypeError on anything that is not a float.
    if pd.isna(status):
        label = cp_nan_diag_maps(row['diagnosis'])
        inferred = 1
    elif status == 'benign':
        label = 0
        inferred = 0
    elif status == 'malignant':
        label = 1
        inferred = 0
    elif status in ('indeterminate', 'indeterminate/benign'):
        # as is done in isic_2024
        label = 0
        inferred = 0

    assert inferred != -1, f"unhandled benign_malignant value {status!r}"
    assert label != -1, f"no label assigned for benign_malignant value {status!r}"

    return inferred if query_inferred else label

# A row is only usable if at least one of benign_malignant / diagnosis is set;
# rows where both are missing cannot be labelled and are dropped here.
md_cp = md_cp.dropna(how='all', subset=['benign_malignant', 'diagnosis'])
printenc(
    "Drop nan diagnosis and nan benign_malignant samples\n",
    "Rows where both columns take nan now:",
    md_cp[md_cp[['benign_malignant', 'diagnosis']].isna().all(axis=1)].value_counts(dropna=False),
)

printdash()
# Row-wise mapping: first the label itself, then the flag telling whether the
# label had to be inferred from the diagnosis.
md_cp['label'] = md_cp.apply(cp_label_maps, axis=1)
md_cp['PROC_label_inferred'] = md_cp.apply(cp_label_maps, axis=1, query_inferred=True)

println("Map complete set benign/malignant and diagnosis to input label, label inferred rows")

print("statistics for labels")
println(md_cp['label'].value_counts(dropna=False))

print("statistics for benign rows")
println(
    md_cp[md_cp['label'] == 0]
    .value_counts(subset=['diagnosis', 'benign_malignant', 'PROC_label_inferred'], dropna=False)
    .sort_index(ascending=True)
)

print("statistics for malignant rows")
println(
    md_cp.loc[md_cp['label'] == 1, ['diagnosis', 'benign_malignant', 'PROC_label_inferred']]
    .value_counts(sort=False, dropna=False)
)

print("statistics of inferred labels")
print(
    md_cp.loc[md_cp['PROC_label_inferred'] == 1, ['benign_malignant', 'label', 'diagnosis']]
    .value_counts(sort=False, dropna=False)
)
printdash()

--------------------
Archive set processing
--------------------


100%|█████| 81722/81722 [00:00<00:00, 112951.28it/s]


--------------------
Dataset integrity
True
--------------------
--------------------
Filter non-dermoscopic
image_type
dermoscopic    81155
Name: count, dtype: int64
--------------------
--------------------
Filter indeterminate/malignant because small sample size & uncertain how to deal with.
benign_malignant
benign                  63867
malignant                8906
indeterminate             150
indeterminate/benign       66
Name: count, dtype: int64
--------------------
--------------------
Drop nan diagnosis and nan benign_malignant samples

Rows where both columns take nan now:
Series([], Name: count, dtype: int64)
--------------------
--------------------
Map complete set benign/malignant and diagnosis to input label, label inferred rows

statistics for labels
label
0    68021
1    13004
Name: count, dtype: int64

statistics for benign rows
diagnosis                           benign_malignant      PROC_label_inferred
AIMP                                benign                0  

In [3]:
printenc("Complete set partitioning (train/val/test 8:1:1)")

ids_cp = md_cp['isic_id'].tolist()

# First cut: 80% training vs. a 20% hold-out (temporarily stored in ids_val_cp).
ids_train_cp, ids_val_cp = train_test_split(
    ids_cp, train_size=0.8, test_size=0.2, random_state=SEED
)
# Second cut: halve the hold-out into validation and test.
ids_val_cp, ids_test_cp = train_test_split(
    ids_val_cp, train_size=0.5, test_size=0.5, random_state=SEED
)

# training instead of train for compatibility with isic archive naming convention
for split_name, split_ids in (('training', ids_train_cp),
                              ('validation', ids_val_cp),
                              ('test', ids_test_cp)):
    md_cp.loc[md_cp['isic_id'].isin(split_ids), 'PROC_use'] = split_name

printenc(
    "Verify train/verify split",
    md_cp['PROC_use'].value_counts(normalize=True, dropna=False),
)

--------------------
Complete set partitioning (train/val/test 8:1:1)
--------------------
--------------------
Verify train/verify split
PROC_use
training      0.800000
test          0.100006
validation    0.099994
Name: proportion, dtype: float64
--------------------


In [4]:
printenc("2024 set processing",
         "TODO: SEE FINAL SECTION FOR DOWNSAMPLED STATISTICS!")

# dataset and metadata
ds_24 = isic.get_2024()
md_24 = ds_24.get_proc_metadata()

printenc("Verify dataset integrity", ds_24.verify_integrity())

# Vectorised lower-casing; unlike a per-element lambda it leaves missing
# values as missing instead of raising.
md_24['benign_malignant'] = md_24['iddx_1'].str.lower()
printenc("Map iddx_1 to benign_malignant",
         md_24[['iddx_1', 'benign_malignant']].value_counts(dropna = False))

md_24['label'] = md_24['target']
printenc("Map target to label",
         md_24[['target', 'label']].value_counts(dropna = False))

# See isic.map_iddx3_diag: supported by ChatGPT response as well as online search
# potentially contentious are 'Solar or actinic keratosis',
# and those mapped to 'other'.
# Some more detailed categories are also collapsed into broader ones (eg types of melanoma)
md_24['diagnosis'] = md_24['iddx_3'].apply(isic.map_iddx3_diag)
md_24['PROC_diag_inferred'] = md_24['iddx_3'].apply(isic.map_iddx3_diag, args = (True,))
# shouldn't we use the competition (2024) format? Well this column isn't present in test...
printenc("Map iddx_3 to diagnosis, isic archive format",
         md_24[['iddx_3', 'diagnosis', 'PROC_diag_inferred']].value_counts(dropna = False, sort = False))

printdash()
print("Downsample benign set to 20:1 benign:malignant")
# Despite its name this is the benign:malignant ratio kept after DOWNsampling.
OVERSAMPLING_RATE = 20
sel_24_ids = [] # sampled ids in 2024 set
num_mal_24 = (md_24['label'] == 1).sum() # number of malignant samples in 24 set
sel_24_ids.extend(md_24[md_24['label'] == 1]['isic_id'].tolist()) # malignant
# benign and potentially indeterminate
# Cap the request at the number of benign rows available: sampling without
# replacement raises if asked for more rows than exist.
sel_24_ids.extend(md_24[md_24['label'] == 0]['isic_id'].sample(
    min(OVERSAMPLING_RATE * num_mal_24, (md_24['label'] == 0).sum()),
    random_state = SEED))
# .copy() makes the selection an independent frame, so that later column
# assignments on md_24 do not act on a slice of the full table.
md_24 = md_24[md_24['isic_id'].isin(sel_24_ids)].copy()

print("new benign/malignant statistics")
println(md_24['label'].value_counts())

col_interest_24 = ['age_approx', 'anatom_site_general', 'clin_size_long_diam_mm', 'sex']
for col in col_interest_24:
    print(f"statistics on column [{col}]")
    println(isic.dtype_based_stats(md_24, col))
printdash()

--------------------
2024 set processing
TODO: SEE FINAL SECTION FOR DOWNSAMPLED STATISTICS!
--------------------


100%|████| 401059/401059 [00:05<00:00, 68492.23it/s]


--------------------
Verify dataset integrity
True
--------------------
--------------------
Map iddx_1 to benign_malignant
iddx_1         benign_malignant
Benign         benign              400552
Malignant      malignant              393
Indeterminate  indeterminate          114
Name: count, dtype: int64
--------------------
--------------------
Map target to label
target  label
0       0        400666
1       1           393
Name: count, dtype: int64
--------------------
--------------------
Map iddx_3 to diagnosis, isic archive format
iddx_3                                              diagnosis                           PROC_diag_inferred
Angiofibroma                                        angiofibroma or fibrous papule      1                          2
Atypical intraepithelial melanocytic proliferation  AIMP                                1                         11
Atypical melanocytic neoplasm                       atypical melanocytic proliferation  1                         

In [5]:
printenc("Assign PROC_use values to 2024 set (train:val 4:1)")

ids_24 = md_24['isic_id'].tolist()
ids_train_24, ids_val_24 = train_test_split(
    ids_24, train_size=0.8, test_size=0.2, random_state=SEED
)

# training instead of train for compatibility with isic archive naming convention
for split_name, split_ids in (('training', ids_train_24), ('validation', ids_val_24)):
    md_24.loc[md_24['isic_id'].isin(split_ids), 'PROC_use'] = split_name

printenc(
    "Verify train/validation split",
    md_24['PROC_use'].value_counts(normalize=True, dropna=False),
)

# Label counts within the validation rows only.
printenc(
    "Check sufficient positive in validation set",
    md_24.loc[md_24['PROC_use'] == 'validation', 'label'].value_counts(dropna=False),
)

--------------------
Assign PROC_use values to 2024 set (train:val 4:1)
--------------------
--------------------
Verify train/validation split
PROC_use
training      0.799952
validation    0.200048
Name: proportion, dtype: float64
--------------------
--------------------
Check sufficient positive in validation set
label
0    1573
1      78
Name: count, dtype: int64
--------------------


In [6]:
printenc("Combine 2024 and archive sets, setting unavailable values -> na")

# Stack the 2024 rows on top of the archive rows and renumber the index from 0;
# columns present in only one of the two frames are filled with NaN.
# https://stackoverflow.com/a/67808713
md_full = pd.concat([md_24, md_cp], axis=0, ignore_index=True)

# Sanity checks on the combined table, printed for visual inspection.
printenc("Verify combined list",
         md_full.loc[:, 'PROC_source'].value_counts(dropna=False))

printenc("isic_id all unique?",
         md_full.loc[:, 'isic_id'].is_unique)

printenc("all labels valid (no nan, 0/1 only)?",
         md_full.loc[:, 'label'].value_counts(dropna=False))

printenc("Overall train/val/test",
         md_full.loc[:, 'PROC_use'].value_counts(dropna=False))

--------------------
Combine 2024 and archive sets, setting unavailable values -> na
--------------------
--------------------
Verify combined list
PROC_source
complete    81025
2024         8253
Name: count, dtype: int64
--------------------
--------------------
isic_id all unique?
True
--------------------
--------------------
all labels valid (no nan, 0/1 only)?
label
0    75881
1    13397
Name: count, dtype: int64
--------------------
--------------------
Overall train/val/test
PROC_use
training      71422
validation     9753
test           8103
Name: count, dtype: int64
--------------------


In [7]:
printenc("Metadata mixed-type cleanup")

# Dump per-column dtype and non-null counts of the combined table before any
# clean-up is applied.
printdash()
print("Before clean-up")
md_full.info(verbose=True)
printdash()

def type_in_col(column, tp):
    """Tell whether at least one element of ``column`` is an instance of ``tp``.

    Args:
        column: pandas Series to inspect element by element.
        tp: type (or tuple of types) accepted by ``isinstance``.

    Returns:
        Truthy if any element matches, falsy otherwise (including for an
        empty Series).
    """
    is_instance_mask = column.apply(lambda value: isinstance(value, tp))
    return is_instance_mask.any()

# After this procedure all nullable types are cast to float64 (including bool);
# on the other hand strings will have '' to represent na.
for col in md_full.columns.values:
    if type_in_col(md_full[col], str):
        # for those str columns that have only the str type fillna won't do a thing.
        # otherwise get rid of nans and change type to string for consistency
        num_nan = md_full[col].isna().sum()
        md_full[col] = md_full[col].fillna('').astype('string')
        counts = md_full[col].value_counts()

        # Every former NaN must now be an empty string, and nothing else may be.
        if '' not in counts:
            assert num_nan == 0, f"column {col!r}: NaNs were lost during fillna"
        else:
            assert num_nan == counts[''], f"column {col!r}: '' count differs from NaN count"

    elif type_in_col(md_full[col], bool):
        # bool mixed with NaN: cast to float so True/False become 1.0/0.0 and NaN stays NaN.
        num_nan = md_full[col].isna().sum()
        md_full[col] = md_full[col].astype(float)
        assert num_nan == md_full[col].isna().sum(), f"column {col!r}: NaN count changed"

    # Whatever branch ran, the column must now hold a single Python type.
    assert len(md_full[col].apply(type).unique()) == 1, f"column {col!r} is still mixed-type"

# NOTE(review): this cell used to call `md_full.convert_dtypes()` here without
# using the return value. convert_dtypes returns a new frame, so the call had
# no effect; it is dropped to keep the dtypes exactly as the loop above leaves
# them. Assign the result only if nullable extension dtypes are really wanted.

printdash()
print('After clean-up')
md_full.info(verbose = True)
printdash()

--------------------
Metadata mixed-type cleanup
--------------------
--------------------
Before clean-up
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89278 entries, 0 to 89277
Data columns (total 76 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   isic_id                       89278 non-null  object 
 1   target                        8253 non-null   float64
 2   patient_id                    48914 non-null  object 
 3   age_approx                    87239 non-null  float64
 4   sex                           87168 non-null  object 
 5   anatom_site_general           72330 non-null  object 
 6   clin_size_long_diam_mm        12209 non-null  float64
 7   image_type                    89278 non-null  object 
 8   tbp_tile_type                 8253 non-null   object 
 9   tbp_lv_A                      8253 non-null   float64
 10  tbp_lv_Aext                   8253 non-null   float64
 11  tbp_lv_B    

In [8]:
printenc("Metadata save and upload")
METADATA_SAVE = 'data/isic/metadata_full.json'
METADATA_REPO = 'metadata.json'

# Serialise the cleaned metadata locally, then push that file to the dataset repo.
md_full.to_json(METADATA_SAVE)

api = HfApi()
api.upload_file(
    repo_id='TobanDjan/isic_full',
    repo_type='dataset',
    path_or_fileobj=METADATA_SAVE,
    path_in_repo=METADATA_REPO,
    commit_message="Update metadata",
)

printenc("Split train/val/test")

# One frame per PROC_use value.
md_train, md_val, md_test = (
    md_full[md_full['PROC_use'] == use_value]
    for use_value in ('training', 'validation', 'test')
)

# Keyed by split name; note 'train' here versus 'training' in PROC_use.
md_splits = dict(train=md_train, validation=md_val, test=md_test)

--------------------
Metadata save and upload
--------------------


No files have been modified since last commit. Skipping to prevent empty commit.


--------------------
Split train/val/test
--------------------


In [9]:
# ignore - legacy upload method for isic_2024 and isic_archive sets.
def get_filename(row):
    """Resolve the image path of a metadata row through its source dataset.

    Rows with PROC_source '2024' are looked up in ``ds_24``, rows with
    'complete' in ``ds_cp``; the lookup key is the row's 'isic_id'.

    Raises:
        AssertionError: if PROC_source is neither value, or the lookup
            returned an empty string.
    """
    source = row['PROC_source']
    dataset = ds_24 if source == '2024' else ds_cp if source == 'complete' else None
    filename = dataset.get_filename(row['isic_id']) if dataset is not None else ""

    assert filename != ""
    return filename

def upload_imagefolder():
    """Legacy upload: build one imagefolder tree per source and push each to the Hub.

    Images are hard-linked into data/isic/final/<source>/<use>/<label>/ (with
    the 'training' use renamed to 'train'), each <source> directory is loaded
    with the "imagefolder" builder and pushed to its own repository.

    Side effects: adds a 'file_name' column to the global ``md_full`` and
    creates the directory tree; fails if data/isic/final/ already exists.
    """
    remote_repos = {'complete': 'isic_archive', '2024': 'isic_2024'}
    base_dir = 'data/isic/final/'

    md_full['file_name'] = md_full['isic_id'].apply(lambda s: s + '.jpg')

    # exist_ok=False: refuse to run on top of a previous export.
    os.makedirs(base_dir, exist_ok=False)

    # separate 2024 and complete sets
    for source in md_full['PROC_source'].unique():
        printdash()
        print(f"Processing subset [{source}]")
        source_subset = md_full[md_full['PROC_source'] == source]
        source_dir = os.path.join(base_dir, source)
        os.makedirs(source_dir, exist_ok=True)

        for use in source_subset['PROC_use'].unique():
            use_subset = source_subset[source_subset['PROC_use'] == use]
            # huggingface convention.
            use_name = 'train' if use == 'training' else use

            use_dir = os.path.join(source_dir, use_name)
            os.makedirs(use_dir, exist_ok=True)

            for label in use_subset['label'].unique():
                label_subset = use_subset[use_subset['label'] == label]
                label_dir = os.path.join(use_dir, str(label))
                os.makedirs(label_dir, exist_ok=True)

                print(f"Processing [{source}] for use [{use_name}], label[{label}]")

                for _, row in tqdm(label_subset.iterrows(), total=len(label_subset)):
                    src_path = get_filename(row)
                    # Hard link rather than os.symlink, see
                    # https://github.com/huggingface/datasets/issues/6764#issue-2215767119
                    os.link(src_path, os.path.join(label_dir, os.path.basename(src_path)))

        dataset = load_dataset("imagefolder", data_dir=source_dir)
        dataset.push_to_hub(f"TobanDjan/{remote_repos[source]}", private=False)

        printdash()

In [10]:
def load_image(row):
    """Open (without decoding) the image belonging to a metadata row.

    The path comes from ``ds_24`` for PROC_source '2024' and from ``ds_cp``
    for 'complete', keyed by the row's 'isic_id'.

    Raises:
        AssertionError: if PROC_source is neither value, or the resolved
            path is empty.
    """
    source = row['PROC_source']

    if source == '2024':
        # bypass ISIC_Dataset.get_image() since it's too slow (generates metadata everytime.)
        # bypass ISIC_Dataset.get_images() since it doesn't fit into my memory
        filename = ds_24.get_filename(row['isic_id'])
    elif source == 'complete':
        filename = ds_cp.get_filename(row['isic_id'])
    else:
        filename = ""

    assert filename != ""

    # DO NOT LOAD HERE, for PIL is LAZY: Image.open only opens the file.
    # https://www.reddit.com/r/learnpython/comments/3e1611/comment/ctajz5z/?utm_source=share&utm_medium=web3x&utm_name=web3xcss&utm_term=1&utm_content=share_button
    return Image.open(filename)

def ds_generator(md):
    """Yield one example dict per metadata row, extended with its opened image.

    Args:
        md: metadata DataFrame; each row is converted to a plain dict.

    Yields:
        A new dict holding the row's columns plus an 'image' entry produced by
        ``load_image``.
    """
    # Iterate over plain records rather than DataFrame.apply, see
    # https://ys-l.github.io/posts/2015/08/28/how-not-to-use-pandas-apply/
    # https://github.com/huggingface/datasets/issues/4796#issuecomment-2048822155
    for record in md.to_dict(orient='records'):
        # make new instance to not load image into row
        # or else might exceed the nofile limit
        yield {**record, 'image': load_image(record)}

HF_TEMP = 'data/isic/temp/hf/'

printenc("Prepare HF DatasetDict")

hf_full = DatasetDict()

# Build one generator-backed Dataset per split, passing the split's metadata
# frame to ds_generator and using HF_TEMP as the cache directory.
for use, md in md_splits.items():
    printdash()
    print(f"Subset {[use]}")

    hf_full[use] = Dataset.from_generator(
        ds_generator,
        gen_kwargs=dict(md=md),
        cache_dir=HF_TEMP,
        keep_in_memory=False,
    )

    # Every metadata row must have produced exactly one example.
    assert len(hf_full[use]) == len(md)

    pprint(hf_full[use])
    printdash()

--------------------
Prepare HF DatasetDict
--------------------
--------------------
Subset ['train']
Dataset({
    features: ['isic_id', 'target', 'patient_id', 'age_approx', 'sex', 'anatom_site_general', 'clin_size_long_diam_mm', 'image_type', 'tbp_tile_type', 'tbp_lv_A', 'tbp_lv_Aext', 'tbp_lv_B', 'tbp_lv_Bext', 'tbp_lv_C', 'tbp_lv_Cext', 'tbp_lv_H', 'tbp_lv_Hext', 'tbp_lv_L', 'tbp_lv_Lext', 'tbp_lv_areaMM2', 'tbp_lv_area_perim_ratio', 'tbp_lv_color_std_mean', 'tbp_lv_deltaA', 'tbp_lv_deltaB', 'tbp_lv_deltaL', 'tbp_lv_deltaLB', 'tbp_lv_deltaLBnorm', 'tbp_lv_eccentricity', 'tbp_lv_location', 'tbp_lv_location_simple', 'tbp_lv_minorAxisMM', 'tbp_lv_nevi_confidence', 'tbp_lv_norm_border', 'tbp_lv_norm_color', 'tbp_lv_perimeterMM', 'tbp_lv_radial_color_std_max', 'tbp_lv_stdL', 'tbp_lv_stdLExt', 'tbp_lv_symm_2axis', 'tbp_lv_symm_2axis_angle', 'tbp_lv_x', 'tbp_lv_y', 'tbp_lv_z', 'attribution', 'copyright_license', 'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 'iddx_4', 'iddx_5',

In [11]:
printenc("Upload to HF")

IMG_PER_SHARD = 1500

# Shards needed per split so that no shard holds more than IMG_PER_SHARD rows.
nshard_dict = {
    split: int(np.ceil(split_md.shape[0] / IMG_PER_SHARD))
    for split, split_md in md_splits.items()
}

# if shard is not specified it will try to do it in one shard with batch size 1000,
# and memory will run out.
# max_shard_size doesn't split as expected.
hf_full.push_to_hub('TobanDjan/isic_full', num_shards=nshard_dict)

--------------------
Upload to HF
--------------------


Uploading the dataset shards:   0%|          | 0/48 [00:00<?, ?it/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Map:   0%|          | 0/1487 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/7 [00:00<?, ?it/s]

Map:   0%|          | 0/1394 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1394 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1393 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/6 [00:00<?, ?it/s]

Map:   0%|          | 0/1351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1351 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1350 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/datasets/TobanDjan/isic_full/commit/b41ec26e16d7863e94c5bf84323e91fdf708fb17', commit_message='Upload dataset (part 00001-of-00002)', commit_description='', oid='b41ec26e16d7863e94c5bf84323e91fdf708fb17', pr_url=None, pr_revision=None, pr_num=None)