# Notebook: Create Dataset

This notebook is used to create two subsets for training and evaluation. Both for training Stable Diffusion and the classification model
<br>
**Contributors:** Nils Hellwig 

## Import Packages

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import shutil
import os

## Parameters

In [2]:
DATASET_METADATA_PATH = "../Datasets/raw_dataset.csv"
RAW_DATASET_PATH = "../Datasets/raw_dataset/"
DATASET_PATH = "../Datasets/dataset/"
SEED = 42

In [3]:
LABEL_PROMPTS = {
                 "akiec": "Actinic keratoses and intraepithelial carcinoma / Bowen's disease",
                 "bcc":"basal cell carcinoma",
                 "df": "dermatofibroma",
                 "mel": "melanoma",
                 "nv": "melanocytic nevi",
                 "vasc": "vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage)",
                 "bkl": "benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses)"
                }

## Settings

In [4]:
np.random.seed(SEED)
random.seed(SEED)

## Code

### Create Subsets

In [5]:
# Load the CSV file as a DataFrame
df = pd.read_csv(DATASET_METADATA_PATH, delimiter=";").drop(columns=['Unnamed: 0'])
df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0027419.jpg
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025030.jpg
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0026769.jpg
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern,dataset/ISIC_0025661.jpg
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern,dataset/ISIC_0031633.jpg
...,...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033084.jpg
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033550.jpg
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern,dataset/ISIC_0033536.jpg
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern,dataset/ISIC_0032854.jpg


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['dx'], random_state=SEED)

In [7]:
train_df

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
8050,HAM_0005972,ISIC_0033319,nv,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0033319.jpg
4898,HAM_0004902,ISIC_0030823,nv,follow_up,40.0,male,trunk,vidir_molemax,dataset/ISIC_0030823.jpg
9695,HAM_0005282,ISIC_0028730,akiec,histo,65.0,male,lower extremity,rosendahl,dataset/ISIC_0028730.jpg
4090,HAM_0000475,ISIC_0027299,nv,follow_up,40.0,male,lower extremity,vidir_molemax,dataset/ISIC_0027299.jpg
8625,HAM_0000949,ISIC_0032444,nv,histo,65.0,male,back,rosendahl,dataset/ISIC_0032444.jpg
...,...,...,...,...,...,...,...,...,...
2360,HAM_0000940,ISIC_0032692,vasc,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0032692.jpg
3409,HAM_0005629,ISIC_0029317,nv,follow_up,45.0,female,upper extremity,vidir_molemax,dataset/ISIC_0029317.jpg
8736,HAM_0004025,ISIC_0025983,nv,histo,20.0,female,abdomen,rosendahl,dataset/ISIC_0025983.jpg
2399,HAM_0004542,ISIC_0027256,vasc,consensus,0.0,female,back,vidir_modern,dataset/ISIC_0027256.jpg


In [8]:
train_df.to_csv(DATASET_PATH + "train.csv")

In [9]:
test_df.to_csv(DATASET_PATH + "test.csv")

### Create Folder with images

In [10]:
def create_subset(df_subset, subset_name):
    # Iterate through each row of the DataFrame
    for index, row in df_subset.iterrows():
        # Get the name of the image and its corresponding label from the DataFrame
        image_name = row['image_id']
        label = row['dx']
    
        # Define the source and target paths for copying the image
        source_path = RAW_DATASET_PATH + image_name + ".jpg"
        target_path = DATASET_PATH + subset_name + "/" + label + '/' + image_name + ".jpg"
    
        # Create the subfolder if it doesn't already exist
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
    
        # Copy the image to the target subfolder
        shutil.copyfile(source_path, target_path)

In [11]:
create_subset(train_df, "train")
create_subset(test_df, "test")

### Create Metadata

In [12]:
def get_metadata_hf(df_subset):
    metadata_hf = df_subset.copy()
    metadata_hf.loc[:, 'image_id'] = metadata_hf['dx'] + '/' + metadata_hf['image_id'] + '.jpg'
    metadata_hf['label'] = metadata_hf['dx']
    metadata_hf['dx'] = metadata_hf['dx'].replace(LABEL_PROMPTS)
    metadata_hf.loc[:, 'dx'] = metadata_hf['dx'] + " " +  metadata_hf['sex'] + " " +  metadata_hf['localization'] + " " + metadata_hf["age"].astype(str)
    metadata_hf = metadata_hf.rename(columns={"image_id": "file_name", "dx": "text"})
    metadata_hf = metadata_hf[["file_name", "text", "label"]]
    return metadata_hf

In [13]:
train_metadata_hf = get_metadata_hf(train_df).reset_index(drop=True)
train_metadata_hf.to_csv(DATASET_PATH + "train" + "/metadata.csv")
train_metadata_hf

Unnamed: 0,file_name,text,label
0,nv/ISIC_0033319.jpg,melanocytic nevi female lower extremity 35.0,nv
1,nv/ISIC_0030823.jpg,melanocytic nevi male trunk 40.0,nv
2,akiec/ISIC_0028730.jpg,Actinic keratoses and intraepithelial carcinom...,akiec
3,nv/ISIC_0027299.jpg,melanocytic nevi male lower extremity 40.0,nv
4,nv/ISIC_0032444.jpg,melanocytic nevi male back 65.0,nv
...,...,...,...
8007,vasc/ISIC_0032692.jpg,"vascular lesions (angiomas, angiokeratomas, py...",vasc
8008,nv/ISIC_0029317.jpg,melanocytic nevi female upper extremity 45.0,nv
8009,nv/ISIC_0025983.jpg,melanocytic nevi female abdomen 20.0,nv
8010,vasc/ISIC_0027256.jpg,"vascular lesions (angiomas, angiokeratomas, py...",vasc


In [14]:
test_metadata_hf = get_metadata_hf(test_df).reset_index(drop=True)
test_metadata_hf.to_csv(DATASET_PATH + "test" + "/metadata.csv")
test_metadata_hf

Unnamed: 0,file_name,text,label
0,nv/ISIC_0030038.jpg,melanocytic nevi female back 30.0,nv
1,nv/ISIC_0025442.jpg,melanocytic nevi male lower extremity 25.0,nv
2,mel/ISIC_0027204.jpg,melanoma male neck 70.0,mel
3,nv/ISIC_0032165.jpg,melanocytic nevi male chest 70.0,nv
4,bkl/ISIC_0033185.jpg,benign keratosis-like lesions (solar lentigine...,bkl
...,...,...,...
1998,nv/ISIC_0034116.jpg,melanocytic nevi female trunk 35.0,nv
1999,bcc/ISIC_0026453.jpg,basal cell carcinoma female back 55.0,bcc
2000,mel/ISIC_0029885.jpg,melanoma male back 35.0,mel
2001,mel/ISIC_0033226.jpg,melanoma male upper extremity 65.0,mel
