# Notebook: Create Synthetic Samples With Trained SD Model

This notebook generates new synthetic skin-lesion images with the trained Stable Diffusion (SD) model.
<br>
**Contributors:** Nils Hellwig 

## Import Packages

In [113]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import random
import shutil
import os
import torch
from diffusers import StableDiffusionPipeline

## Parameters

In [114]:
# Dataset locations, relative to the notebook's working directory.
# DATASET_METADATA_PATH and RAW_DATASET_PATH are not referenced by the cells
# of this notebook; they are kept for parity with the other notebooks.
DATASET_METADATA_PATH = "../Datasets/raw_dataset.csv"
RAW_DATASET_PATH = "../Datasets/raw_dataset/"
# Folder that holds train.csv (the metadata of the training split).
DATASET_PATH = "../Datasets/dataset/"
# Root folder for the generated images; one sub-folder per label is created.
SYNTH_DATASET_PATH = "../Datasets/synth_dataset/"
# Seed shared by all random number generators.
SEED = 42
# Number of prompts (and therefore images) generated per diagnosis label.
N_EXAMPLES_FOR_LABEL = 1000
# Directory of the trained Stable Diffusion pipeline.
MODEL_PATH = "/mnt/data/stable_diffusion_2_skin/"
# Table of generated prompts. Previously "/Datasets/...", i.e. the filesystem
# root; now it sits next to the other dataset artefacts.
OUTPUT_CSV = "../Datasets/generative_prompts.csv"

In [115]:
# Maps every diagnosis code found in the "dx" column to the text that opens
# the generation prompt for that class.
# NOTE(review): the "vasc" and "bkl" texts open a parenthesis that is never
# closed. They are left untouched because the strings are passed to the model
# verbatim -- presumably they match the captions used during training; confirm
# before "fixing" them.
LABEL_PROMPTS = {
    "akiec": "Actinic keratoses and intraepithelial carcinoma / Bowen's disease",
    "bcc": "basal cell carcinoma",
    "df": "dermatofibroma",
    "mel": "melanoma",
    "nv": "melanocytic nevi",
    "vasc": "vascular lesions (angiomas, angiokeratomas, pyogenic granulomas and hemorrhage",
    "bkl": "benign keratosis-like lesions (solar lentigines / seborrheic keratoses and lichen-planus like keratoses",
}

## Settings

In [116]:
# Seed every random number generator the notebook relies on, so that
# "Restart Kernel -> Run All" starts from the same random state each time.
# torch is seeded as well: the pipeline call further down passes no explicit
# generator, so its sampling noise comes from torch's global RNG.
# (Called first because it returns a Generator object, which would otherwise
# be echoed as the cell output.)
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [117]:
# Load the trained pipeline from MODEL_PATH in half precision, with the
# safety checker switched off, and move it to the GPU. The trailing
# expression echoes the pipeline configuration as the cell output.
pipe = StableDiffusionPipeline.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16,
    use_auth_token=True,
    safety_checker=None,
)
pipe.to("cuda")



StableDiffusionPipeline {
  "_class_name": "StableDiffusionPipeline",
  "_diffusers_version": "0.15.0.dev0",
  "feature_extractor": [
    "transformers",
    "CLIPFeatureExtractor"
  ],
  "requires_safety_checker": false,
  "safety_checker": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "DDIMScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}

## Code

### Load Train Dataframe

In [118]:
# Read the metadata table of the training split (train.csv inside
# DATASET_PATH) and show it as the cell output.
train_df = pd.read_csv(f"{DATASET_PATH}train.csv")
train_df

Unnamed: 0.1,Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,8050,HAM_0005972,ISIC_0033319,nv,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0033319.jpg
1,4898,HAM_0004902,ISIC_0030823,nv,follow_up,40.0,male,trunk,vidir_molemax,dataset/ISIC_0030823.jpg
2,9695,HAM_0005282,ISIC_0028730,akiec,histo,65.0,male,lower extremity,rosendahl,dataset/ISIC_0028730.jpg
3,4090,HAM_0000475,ISIC_0027299,nv,follow_up,40.0,male,lower extremity,vidir_molemax,dataset/ISIC_0027299.jpg
4,8625,HAM_0000949,ISIC_0032444,nv,histo,65.0,male,back,rosendahl,dataset/ISIC_0032444.jpg
...,...,...,...,...,...,...,...,...,...,...
8007,2360,HAM_0000940,ISIC_0032692,vasc,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0032692.jpg
8008,3409,HAM_0005629,ISIC_0029317,nv,follow_up,45.0,female,upper extremity,vidir_molemax,dataset/ISIC_0029317.jpg
8009,8736,HAM_0004025,ISIC_0025983,nv,histo,20.0,female,abdomen,rosendahl,dataset/ISIC_0025983.jpg
8010,2399,HAM_0004542,ISIC_0027256,vasc,consensus,0.0,female,back,vidir_modern,dataset/ISIC_0027256.jpg


In [119]:
# Empty table that will hold one row per image to generate. The column order
# given here is the column order of the final prompt table.
df_generative_prompts = pd.DataFrame(
    {
        column_name: []
        for column_name in ("file_name", "text", "localization", "sex", "age", "dx")
    }
)

In [120]:
import random
import pandas as pd

def get_random_value(df, column_name):
    """Draw one value uniformly at random from a column of ``df``.

    Every row has the same chance of being picked, so values that occur more
    often in the column are returned proportionally more often. The draw uses
    NumPy's global random state (see ``np.random.seed``).

    Args:
        df: DataFrame to sample from.
        column_name: Name of the column whose values are sampled.

    Returns:
        A single value taken from ``df[column_name]``.
    """
    return np.random.choice(df[column_name])

In [121]:
# Number of training rows per diagnosis label; the index holds the labels,
# ordered from the most to the least frequent class.
vc = train_df.loc[:, "dx"].value_counts()

In [122]:
# Build N_EXAMPLES_FOR_LABEL prompts for every diagnosis label.
#
# For each prompt, sex, localization and age are drawn independently from the
# training rows of that label, so each attribute follows its per-label
# distribution in the training data.
# NOTE(review): if a column contains missing values they can be drawn too
# (a missing age would appear as "nan" in the prompt) -- confirm whether
# the training data has any.

# The per-label subsets do not change between iterations, so filter once.
# Iterating vc.index keeps the label order of the original nested loop, which
# keeps the sequence of random draws (and thus the seeded result) unchanged.
rows_by_label = {label: train_df[train_df["dx"] == label] for label in vc.index}

# Collect plain records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies the whole table each time.
prompt_records = []
for i in range(N_EXAMPLES_FOR_LABEL):
    for key, df_dx in rows_by_label.items():
        sex = get_random_value(df_dx, "sex")
        localization = get_random_value(df_dx, "localization")
        age = get_random_value(df_dx, "age")

        prompt = LABEL_PROMPTS[key] + " " + sex + " " + localization + " " + str(age)
        # Relative path of the image inside the synthetic dataset: <label>/<label>_<i>.jpg
        file_name = key + "/" + key + "_" + str(i) + ".jpg"

        prompt_records.append(
            {'file_name': file_name, 'text': prompt, 'sex': sex, 'localization': localization, 'age': age, 'dx': key}
        )

# Reuse the column order of the existing (empty) frame. The table is replaced
# rather than extended, so re-running this cell no longer appends duplicates.
df_generative_prompts = pd.DataFrame(prompt_records, columns=list(df_generative_prompts.columns))


In [123]:
df_generative_prompts

Unnamed: 0,file_name,text,localization,sex,age,dx
0,nv/nv_0.jpg,melanocytic nevi female abdomen 80.0,abdomen,female,80.0,nv
1,mel/mel_0.jpg,melanoma male upper extremity 80.0,upper extremity,male,80.0,mel
2,bkl/bkl_0.jpg,benign keratosis-like lesions (solar lentigine...,face,female,40.0,bkl
3,bcc/bcc_0.jpg,basal cell carcinoma female abdomen 80.0,abdomen,female,80.0,bcc
4,akiec/akiec_0.jpg,Actinic keratoses and intraepithelial carcinom...,neck,male,75.0,akiec
...,...,...,...,...,...,...
6995,bkl/bkl_999.jpg,benign keratosis-like lesions (solar lentigine...,face,male,80.0,bkl
6996,bcc/bcc_999.jpg,basal cell carcinoma male back 70.0,back,male,70.0,bcc
6997,akiec/akiec_999.jpg,Actinic keratoses and intraepithelial carcinom...,lower extremity,male,60.0,akiec
6998,vasc/vasc_999.jpg,"vascular lesions (angiomas, angiokeratomas, py...",upper extremity,female,70.0,vasc


In [None]:
# Persist the prompt table; with pandas' defaults the row index is written as
# the first, unnamed column.
df_generative_prompts.to_csv(path_or_buf=OUTPUT_CSV)

In [124]:
# Generate one image per prompt row and save it to
# SYNTH_DATASET_PATH/<label>/<label>_<i>.jpg.
for file_name, text in zip(df_generative_prompts["file_name"], df_generative_prompts["text"]):
    output_path = SYNTH_DATASET_PATH + file_name
    # Create the label folder on demand. exist_ok=True replaces the former
    # try/except FileExistsError and derives the folder from the path that is
    # actually written.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    image = pipe(prompt=text).images[0]
    image.save(output_path)

 30%|█████████████████████████████                                                                    | 15/50 [00:04<00:10,  3.47it/s]


KeyboardInterrupt: 