# Notebook: Train Classifier

This notebook is used to train a classifier
<br>
**Contributors:** Nils Hellwig 

## Import Packages

In [1]:
from tensorflow.keras.layers import Input, Concatenate, Conv2D, Flatten, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.applications import EfficientNetB0 as BaseModel
from tensorflow.keras.models import Model
from one_hot_encoder import *
import tensorflow as tf
import pandas as pd
import numpy as np
import random
import os

sex ['female', 'male', 'unknown']
localization ['abdomen', 'acral', 'back', 'chest', 'ear', 'face', 'foot', 'genital', 'hand', 'lower extremity', 'neck', 'scalp', 'trunk', 'unknown', 'upper extremity']
dx ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']


In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

## Parameters

In [3]:
DATASET_PATH = "../Datasets/dataset/"
SYNTH_DATASET_PATH = "../Datasets/synth_dataset/"
SEED = 42
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224
IMAGE_CHANNELS = 3
BATCH_SIZE = 32

In [4]:
N_DIM_AGE = 1
N_DIM_SEX = 3
N_DIM_LOCALIZATION = 15

In [5]:
N_CLASSES_PREDICTOR = 7

## Settings

In [6]:
np.random.seed(SEED)
random.seed(SEED)

## Code

### Load Dataframes

In [7]:
train_df_real = pd.read_csv(DATASET_PATH + "train.csv")
train_df_real

Unnamed: 0.1,Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,8050,HAM_0005972,ISIC_0033319,nv,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0033319.jpg
1,4898,HAM_0004902,ISIC_0030823,nv,follow_up,40.0,male,trunk,vidir_molemax,dataset/ISIC_0030823.jpg
2,9695,HAM_0005282,ISIC_0028730,akiec,histo,65.0,male,lower extremity,rosendahl,dataset/ISIC_0028730.jpg
3,4090,HAM_0000475,ISIC_0027299,nv,follow_up,40.0,male,lower extremity,vidir_molemax,dataset/ISIC_0027299.jpg
4,8625,HAM_0000949,ISIC_0032444,nv,histo,65.0,male,back,rosendahl,dataset/ISIC_0032444.jpg
...,...,...,...,...,...,...,...,...,...,...
8007,2360,HAM_0000940,ISIC_0032692,vasc,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0032692.jpg
8008,3409,HAM_0005629,ISIC_0029317,nv,follow_up,45.0,female,upper extremity,vidir_molemax,dataset/ISIC_0029317.jpg
8009,8736,HAM_0004025,ISIC_0025983,nv,histo,20.0,female,abdomen,rosendahl,dataset/ISIC_0025983.jpg
8010,2399,HAM_0004542,ISIC_0027256,vasc,consensus,0.0,female,back,vidir_modern,dataset/ISIC_0027256.jpg


In [8]:
train_df_real = train_df_real.rename(columns={"image_id": "file_name"})
train_df_real["file_name"] = "../Datasets/dataset/train/" + train_df_real["dx"] + "/" + train_df_real["file_name"] + ".jpg"

In [9]:
test_df_real = pd.read_csv(DATASET_PATH + "test.csv")
test_df_real = test_df_real.rename(columns={"image_id": "file_name"})
test_df_real = test_df_real[["dx", "age", "sex", "localization", "file_name"]]
test_df_real["file_name"] = "../Datasets/dataset/test/" + test_df_real["dx"] + "/" + test_df_real["file_name"] + ".jpg"
test_df_real

Unnamed: 0,dx,age,sex,localization,file_name
0,nv,30.0,female,back,../Datasets/dataset/test/nv/ISIC_0030038.jpg
1,nv,25.0,male,lower extremity,../Datasets/dataset/test/nv/ISIC_0025442.jpg
2,mel,70.0,male,neck,../Datasets/dataset/test/mel/ISIC_0027204.jpg
3,nv,70.0,male,chest,../Datasets/dataset/test/nv/ISIC_0032165.jpg
4,bkl,55.0,female,lower extremity,../Datasets/dataset/test/bkl/ISIC_0033185.jpg
...,...,...,...,...,...
1998,nv,35.0,female,trunk,../Datasets/dataset/test/nv/ISIC_0034116.jpg
1999,bcc,55.0,female,back,../Datasets/dataset/test/bcc/ISIC_0026453.jpg
2000,mel,35.0,male,back,../Datasets/dataset/test/mel/ISIC_0029885.jpg
2001,mel,65.0,male,upper extremity,../Datasets/dataset/test/mel/ISIC_0033226.jpg


In [10]:
synth_df = pd.read_csv("../Datasets/generative_prompts.csv")
synth_df["file_name"] = "../Datasets/synth_dataset/" + synth_df["file_name"]
synth_df = synth_df[synth_df["file_name"].apply(lambda x: os.path.isfile(x))]
synth_df

Unnamed: 0.1,Unnamed: 0,file_name,text,localization,sex,age,dx
0,0,../Datasets/synth_dataset/nv/nv_0.jpg,melanocytic nevi female abdomen 80.0,abdomen,female,80.0,nv
1,1,../Datasets/synth_dataset/mel/mel_0.jpg,melanoma male upper extremity 80.0,upper extremity,male,80.0,mel
2,2,../Datasets/synth_dataset/bkl/bkl_0.jpg,benign keratosis-like lesions (solar lentigine...,face,female,40.0,bkl
3,3,../Datasets/synth_dataset/bcc/bcc_0.jpg,basal cell carcinoma female abdomen 80.0,abdomen,female,80.0,bcc
4,4,../Datasets/synth_dataset/akiec/akiec_0.jpg,Actinic keratoses and intraepithelial carcinom...,neck,male,75.0,akiec
...,...,...,...,...,...,...,...
1749,1749,../Datasets/synth_dataset/df/df_249.jpg,dermatofibroma male upper extremity 65.0,upper extremity,male,65.0,df
1750,1750,../Datasets/synth_dataset/nv/nv_250.jpg,melanocytic nevi female upper extremity 30.0,upper extremity,female,30.0,nv
1751,1751,../Datasets/synth_dataset/mel/mel_250.jpg,melanoma male trunk 70.0,trunk,male,70.0,mel
1752,1752,../Datasets/synth_dataset/bkl/bkl_250.jpg,benign keratosis-like lesions (solar lentigine...,lower extremity,male,,bkl


In [11]:
train_df = pd.concat([train_df_real, synth_df])[["dx", "age", "sex", "localization", "file_name"]].reset_index(drop=True)
train_df

Unnamed: 0,dx,age,sex,localization,file_name
0,nv,35.0,female,lower extremity,../Datasets/dataset/train/nv/ISIC_0033319.jpg
1,nv,40.0,male,trunk,../Datasets/dataset/train/nv/ISIC_0030823.jpg
2,akiec,65.0,male,lower extremity,../Datasets/dataset/train/akiec/ISIC_0028730.jpg
3,nv,40.0,male,lower extremity,../Datasets/dataset/train/nv/ISIC_0027299.jpg
4,nv,65.0,male,back,../Datasets/dataset/train/nv/ISIC_0032444.jpg
...,...,...,...,...,...
9761,df,65.0,male,upper extremity,../Datasets/synth_dataset/df/df_249.jpg
9762,nv,30.0,female,upper extremity,../Datasets/synth_dataset/nv/nv_250.jpg
9763,mel,70.0,male,trunk,../Datasets/synth_dataset/mel/mel_250.jpg
9764,bkl,,male,lower extremity,../Datasets/synth_dataset/bkl/bkl_250.jpg


Check nan!

### Create Model

In [12]:
# 1. AGE INPUT
age_input = Input((N_DIM_AGE))

# 2. SEX INPUT
sex_input = Input((N_DIM_SEX))

# 3. LOCALIZATION INPUT
localization_input = Input((N_DIM_LOCALIZATION))

# 4. IMAGE INPUT: Convolution + Flatten for the image
image_input = Input((IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS))
base_model = BaseModel(weights='imagenet', include_top=False, input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS))
base_model.trainable = False
print("enter")

conv_layer = base_model(image_input)
conv_input = GlobalAveragePooling2D()(conv_layer)
print(conv_input)

# Concatenate the convolutional features and the vector input
concat_layer = Concatenate()([age_input, sex_input, localization_input, conv_input])
cl_layer1 = Dense(256, activation="relu")(concat_layer)
cl_layer2 = Dense(256, activation="relu")(cl_layer1)
output = Dense(N_CLASSES_PREDICTOR, activation="softmax")(cl_layer2)

# define a model with a list of two inputs
model = Model(inputs=[age_input, sex_input, localization_input, image_input], outputs=output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

Metal device set to: Apple M1 Max


2023-03-11 19:30:28.550062: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-03-11 19:30:28.550180: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


enter
KerasTensor(type_spec=TensorSpec(shape=(None, 1280), dtype=tf.float32, name=None), name='global_average_pooling2d/Mean:0', description="created by layer 'global_average_pooling2d'")


In [13]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 efficientnetb0 (Functional)    (None, 7, 7, 1280)   4049571     ['input_4[0][0]']                
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                           

### Create Generator

In [14]:
class Generator(tf.keras.utils.Sequence):

    def __init__(self, df, X_col, y_col, batch_size, input_size, shuffle=True):
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        self.data_generator_size = len(self.df)
        self.test_env = test_env

    def on_epoch_end(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
    
    def add_augmentation(self, img):
        if tf.random.uniform(()) > 0.5:
            img = tf.image.flip_left_right(img)
        
        if tf.random.uniform(()) > 0.5:
            img = tf.image.random_flip_up_down(img)
            
        if tf.random.uniform(()) > 0.5:
            img = tf.keras.preprocessing.image.random_rotation(img, 0.2)
        return img
            
    def __get_image(self, image_id):
        image_arr = tf.io.read_file(image_id)
        image_arr = tf.io.decode_jpeg(image_arr)
        image_arr = tf.image.resize(image_arr, [IMAGE_HEIGHT, IMAGE_WIDTH], antialias=True, method="bicubic")
        if self.test_env == False:
            image_arr = self.add_augmentation(image_arr)
        image_arr = preprocess_input(image_arr)
        return image_arr

    def __get_input(self, image_id_batch):
        image_batch = np.array([self.__get_image(image_id) for image_id in image_id_batch]).reshape(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
        age_batch = np.array([self.df.loc[self.df['file_name'] == image_id].age.values[0] for image_id in image_id_batch]).reshape(BATCH_SIZE, 1) / 100
        age_batch = np.nan_to_num(age_batch, copy=True, nan=0.0, posinf=None, neginf=None)
        sex_batch = np.array([one_hot_encode_sex(self.df.loc[self.df['file_name'] == image_id].sex.values[0]) for image_id in image_id_batch]).reshape(BATCH_SIZE, 3)
        localization_batch = np.array([one_hot_encode_localization(self.df.loc[self.df['file_name'] == image_id].localization.values[0]) for image_id in image_id_batch]).reshape(BATCH_SIZE, 15)

        return [age_batch,
                sex_batch, 
                localization_batch,
                image_batch]

    def __get_output(self, label, num_classes):
        return one_hot_encode_dx(label)

    def __get_data(self, batches):
        image_id_batch = batches[self.X_col]
        label_batch = batches[self.y_col]
        
        X_batch = self.__get_input(list(image_id_batch))
        y0_batch = np.asarray([self.__get_output(y, N_CLASSES_PREDICTOR) for y in label_batch])

        return X_batch, y0_batch

    def __getitem__(self, index):
        batches = self.df[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self.__get_data(batches)
        return X, y

    def __len__(self):
        return self.data_generator_size // self.batch_size

In [15]:
train_generator = Generator(train_df, X_col="file_name", y_col="dx", batch_size = BATCH_SIZE, input_size=(IMAGE_HEIGHT, IMAGE_WIDTH))
test_generator = Generator(test_df_real, X_col="file_name", y_col="dx", batch_size = BATCH_SIZE, input_size=(IMAGE_HEIGHT, IMAGE_WIDTH), , test_env = True, shuffle=False)

In [16]:
train_df

Unnamed: 0,dx,age,sex,localization,file_name
0,nv,35.0,female,lower extremity,../Datasets/dataset/train/nv/ISIC_0033319.jpg
1,nv,40.0,male,trunk,../Datasets/dataset/train/nv/ISIC_0030823.jpg
2,akiec,65.0,male,lower extremity,../Datasets/dataset/train/akiec/ISIC_0028730.jpg
3,nv,40.0,male,lower extremity,../Datasets/dataset/train/nv/ISIC_0027299.jpg
4,nv,65.0,male,back,../Datasets/dataset/train/nv/ISIC_0032444.jpg
...,...,...,...,...,...
9761,df,65.0,male,upper extremity,../Datasets/synth_dataset/df/df_249.jpg
9762,nv,30.0,female,upper extremity,../Datasets/synth_dataset/nv/nv_250.jpg
9763,mel,70.0,male,trunk,../Datasets/synth_dataset/mel/mel_250.jpg
9764,bkl,,male,lower extremity,../Datasets/synth_dataset/bkl/bkl_250.jpg


### Train Model

In [17]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_accuracy", patience = 10, restore_best_weights = True, min_delta = 0.001, verbose = 1)

In [None]:
history = model.fit(train_generator, epochs=10, verbose=1, validation_data=test_generator, callbacks=[early_stopping_callback])

Epoch 1/10


2023-03-11 19:30:30.859706: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-03-11 19:30:32.779027: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-03-11 19:34:24.287680: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10