In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
import shutil

In [20]:
# Read the metadata table of the full binary training split and preview its first rows.
full_train_metadata_csv = "Dataset/Binary_Full/train/metadata.csv"
train_df_metadata = pd.read_csv(full_train_metadata_csv)
train_df_metadata.head()

Unnamed: 0,lesion_id,file_name,dx,dx_type,age,sex,localization,dataset,label
0,HAM_0000118,benign/ISIC_0027419.jpg,benign,histo,80.0,male,scalp,vidir_modern,benign
1,HAM_0000118,benign/ISIC_0025030.jpg,benign,histo,80.0,male,scalp,vidir_modern,benign
2,HAM_0002730,benign/ISIC_0026769.jpg,benign,histo,80.0,male,scalp,vidir_modern,benign
3,HAM_0002730,benign/ISIC_0025661.jpg,benign,histo,80.0,male,scalp,vidir_modern,benign
4,HAM_0001466,benign/ISIC_0031633.jpg,benign,histo,75.0,male,ear,vidir_modern,benign


In [24]:
# Build a class-balanced subset: 1,000 benign rows plus 1,000 malignant rows
# (2,000 in total). Each class is drawn with the same fixed seed so the
# selection is repeatable.
benign_mask = train_df_metadata["label"].eq("benign")
malignant_mask = train_df_metadata["label"].eq("malignant")
benign_samples = train_df_metadata.loc[benign_mask].sample(n=1000, random_state=42)
malignant_samples = train_df_metadata.loc[malignant_mask].sample(n=1000, random_state=42)

# Stack the benign rows first, then the malignant rows, and renumber the index from 0.
balanced_train_df = pd.concat([benign_samples, malignant_samples], ignore_index=True)

# Persist the subset's metadata under the 2K-sample train folder
# (the folder is created first if it does not exist yet).
os.makedirs("Dataset/Binary_2K_samples/train", exist_ok=True)
balanced_train_df.to_csv("Dataset/Binary_2K_samples/train/metadata.csv", index=False)

In [28]:
# Copy every sampled image from the full training split into the 2K-sample split,
# keeping the same relative path (the metadata preview shows paths such as
# "benign/ISIC_0027419.jpg").
main_dir = "Dataset/Binary_Full/train"
dest_dir = "Dataset/Binary_2K_samples/train/"
os.makedirs(dest_dir, exist_ok=True)

# One destination sub-folder per distinct label value.
for class_name in balanced_train_df["label"].unique():
    class_dir = os.path.join(dest_dir, class_name)
    os.makedirs(class_dir, exist_ok=True)

# Only the file_name column is needed, so iterate over it directly.
for relative_path in balanced_train_df["file_name"]:
    shutil.copy(
        os.path.join(main_dir, relative_path),
        os.path.join(dest_dir, relative_path),
    )

In [38]:
# Turn raw image ids into label-prefixed relative image paths and persist the metadata.
#
# NOTE(review): `df` is not defined in any visible cell, so this cell relies on kernel
# state from elsewhere and will raise NameError on a fresh "Restart & Run All".
# It also overwrites the same metadata.csv that the metadata-loading cell reads, so it
# has to run before that cell for the rewritten `file_name` column to be picked up.


def _label_prefixed_jpg(file_name, label):
    """Return ``"<label>/<file_name>.jpg"``, adding only the parts that are missing.

    Args:
        file_name: Image id, or a relative path that was already converted.
        label: Class label used as the sub-folder name.

    Returns:
        The relative path as a string. A value that already ends in ``.jpg``
        keeps its suffix and a value that already starts with ``"<label>/"``
        keeps its prefix, so applying the function twice gives the same result
        as applying it once.
    """
    path = str(file_name)
    folder = f"{label}/"
    if not path.endswith(".jpg"):
        path += ".jpg"
    if not path.startswith(folder):
        path = folder + path
    return path


df = df.rename(columns={"image_id": "file_name"})
# Unconditional concatenation made this cell unsafe to re-run: a second execution
# produced paths like "benign/benign/ISIC_0027419.jpg.jpg" and wrote them to disk.
# The helper only adds what is missing, so the first run is unchanged and later
# runs leave the column as it is.
df["file_name"] = [
    _label_prefixed_jpg(name, label)
    for name, label in zip(df["file_name"], df["label"])
]
df.to_csv("Dataset/Binary_Full/train/metadata.csv", index=False)

In [39]:
# Preview the first rows to check the rewritten `file_name` column.
df.head()

Unnamed: 0,lesion_id,file_name,dx,dx_type,age,sex,localization,dataset,label
0,HAM_0000118,benign/ISIC_0027419.jpg,bkl,histo,80.0,male,scalp,vidir_modern,benign
1,HAM_0000118,benign/ISIC_0025030.jpg,bkl,histo,80.0,male,scalp,vidir_modern,benign
2,HAM_0002730,benign/ISIC_0026769.jpg,bkl,histo,80.0,male,scalp,vidir_modern,benign
3,HAM_0002730,benign/ISIC_0025661.jpg,bkl,histo,80.0,male,scalp,vidir_modern,benign
4,HAM_0001466,benign/ISIC_0031633.jpg,bkl,histo,75.0,male,ear,vidir_modern,benign


In [29]:
from datasets import load_dataset

In [30]:
# Load the 2K-sample folder with the Hugging Face "imagefolder" builder. The recorded
# output shows the metadata.csv columns exposed as features next to `image`.
# NOTE(review): the recorded output also contains a 1,511-row test split, but no visible
# cell creates Dataset/Binary_2K_samples/test -- presumably prepared elsewhere; confirm.
dataset = load_dataset("imagefolder", data_dir="Dataset/Binary_2K_samples")

Resolving data files:   0%|          | 0/2001 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1512 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/2001 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/1512 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [31]:
# Display the available splits with their feature names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['lesion_id', 'image', 'dx', 'dx_type', 'age', 'sex', 'localization', 'dataset', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['lesion_id', 'image', 'dx', 'dx_type', 'age', 'sex', 'localization', 'dataset', 'label'],
        num_rows: 1511
    })
})

In [32]:
# Show the summary of the training split (feature names and row count).
# This does not fetch a sample; use dataset["train"][0] to get the first one.
dataset["train"]

Dataset({
    features: ['lesion_id', 'image', 'dx', 'dx_type', 'age', 'sex', 'localization', 'dataset', 'label'],
    num_rows: 2000
})

In [33]:
# Upload every split to the Hugging Face Hub under this repo name.
# private=False makes the uploaded dataset publicly visible.
# NOTE(review): presumably requires an authenticated Hub session set up outside this
# notebook -- no login call is visible here.
# NOTE(review): the repo name says "HM10000" while the lesion ids are prefixed "HAM_";
# possibly "HAM10000" was intended -- confirm before renaming, the repo is already published.
dataset.push_to_hub("binary-2K-samples-skin-lesion-HM10000", private=False)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 1.08MB /  277MB            

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   1%|          | 1.55MB /  278MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Map:   0%|          | 0/1511 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/16 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   6%|5         | 25.2MB /  420MB            

CommitInfo(commit_url='https://huggingface.co/datasets/preetsojitra/binary-2K-samples-skin-lesion-HM10000/commit/f441c31208bc8866639b0f7e8d62eb930ff2c881', commit_message='Upload dataset', commit_description='', oid='f441c31208bc8866639b0f7e8d62eb930ff2c881', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/preetsojitra/binary-2K-samples-skin-lesion-HM10000', endpoint='https://huggingface.co', repo_type='dataset', repo_id='preetsojitra/binary-2K-samples-skin-lesion-HM10000'), pr_revision=None, pr_num=None)