# Notebook: Train Classifier

This notebook is used to train a classifier
<br>
**Contributors:** Nils Hellwig 

## Import Packages

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Concatenate, Conv2D, Flatten, Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.applications import EfficientNetB0 as BaseModel
from tensorflow.keras.models import Model
from one_hot_encoder import *
import pandas as pd
import numpy as np
import random
import os

sex ['female', 'male', 'unknown']
localization ['abdomen', 'acral', 'back', 'chest', 'ear', 'face', 'foot', 'genital', 'hand', 'lower extremity', 'neck', 'scalp', 'trunk', 'unknown', 'upper extremity']
dx ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']


## Parameters

In [2]:
DATASET_PATH = "../Datasets/dataset/"
SYNTH_DATASET_PATH = "../Datasets/synth_dataset/"
SEED = 42
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224
IMAGE_CHANNELS = 3
BATCH_SIZE = 32

In [3]:
N_DIM_AGE = 1
N_DIM_SEX = 3
N_DIM_LOCALIZATION = 15

In [4]:
N_CLASSES_PREDICTOR = 7

## Settings

In [5]:
np.random.seed(SEED)
random.seed(SEED)

In [6]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "True"

## Code

### Load Dataframes

In [7]:
train_df_real = pd.read_csv(DATASET_PATH + "train.csv")
train_df_real

Unnamed: 0.1,Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset,filepath
0,8050,HAM_0005972,ISIC_0033319,nv,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0033319.jpg
1,4898,HAM_0004902,ISIC_0030823,nv,follow_up,40.0,male,trunk,vidir_molemax,dataset/ISIC_0030823.jpg
2,9695,HAM_0005282,ISIC_0028730,akiec,histo,65.0,male,lower extremity,rosendahl,dataset/ISIC_0028730.jpg
3,4090,HAM_0000475,ISIC_0027299,nv,follow_up,40.0,male,lower extremity,vidir_molemax,dataset/ISIC_0027299.jpg
4,8625,HAM_0000949,ISIC_0032444,nv,histo,65.0,male,back,rosendahl,dataset/ISIC_0032444.jpg
...,...,...,...,...,...,...,...,...,...,...
8007,2360,HAM_0000940,ISIC_0032692,vasc,histo,35.0,female,lower extremity,vidir_modern,dataset/ISIC_0032692.jpg
8008,3409,HAM_0005629,ISIC_0029317,nv,follow_up,45.0,female,upper extremity,vidir_molemax,dataset/ISIC_0029317.jpg
8009,8736,HAM_0004025,ISIC_0025983,nv,histo,20.0,female,abdomen,rosendahl,dataset/ISIC_0025983.jpg
8010,2399,HAM_0004542,ISIC_0027256,vasc,consensus,0.0,female,back,vidir_modern,dataset/ISIC_0027256.jpg


In [8]:
train_df_real = train_df_real.rename(columns={"image_id": "file_name"})
train_df_real["file_name"] = "../Datasets/dataset/train/" + train_df_real["dx"] + "/" + train_df_real["file_name"] + ".jpg"

In [9]:
test_df_real = pd.read_csv(DATASET_PATH + "test.csv")
test_df_real = test_df_real.rename(columns={"image_id": "file_name"})
test_df_real = test_df_real[["dx", "age", "sex", "localization", "file_name"]]
test_df_real["file_name"] = "../Datasets/dataset/test/" + test_df_real["dx"] + "/" + test_df_real["file_name"] + ".jpg"
test_df_real

Unnamed: 0,dx,age,sex,localization,file_name
0,nv,30.0,female,back,../Datasets/dataset/test/nv/ISIC_0030038.jpg
1,nv,25.0,male,lower extremity,../Datasets/dataset/test/nv/ISIC_0025442.jpg
2,mel,70.0,male,neck,../Datasets/dataset/test/mel/ISIC_0027204.jpg
3,nv,70.0,male,chest,../Datasets/dataset/test/nv/ISIC_0032165.jpg
4,bkl,55.0,female,lower extremity,../Datasets/dataset/test/bkl/ISIC_0033185.jpg
...,...,...,...,...,...
1998,nv,35.0,female,trunk,../Datasets/dataset/test/nv/ISIC_0034116.jpg
1999,bcc,55.0,female,back,../Datasets/dataset/test/bcc/ISIC_0026453.jpg
2000,mel,35.0,male,back,../Datasets/dataset/test/mel/ISIC_0029885.jpg
2001,mel,65.0,male,upper extremity,../Datasets/dataset/test/mel/ISIC_0033226.jpg


In [10]:
synth_df = pd.read_csv("../Datasets/generative_prompts.csv")
synth_df["file_name"] = "../Datasets/synth_dataset/" + synth_df["file_name"]
synth_df

Unnamed: 0.1,Unnamed: 0,file_name,text,localization,sex,age,dx
0,0,../Datasets/synth_dataset/nv/nv_0,melanocytic nevi female abdomen 80.0,abdomen,female,80.0,nv
1,1,../Datasets/synth_dataset/nv/nv_1,melanocytic nevi female scalp 45.0,scalp,female,45.0,nv
2,2,../Datasets/synth_dataset/nv/nv_2,melanocytic nevi female trunk 45.0,trunk,female,45.0,nv
3,3,../Datasets/synth_dataset/nv/nv_3,melanocytic nevi male lower extremity 45.0,lower extremity,male,45.0,nv
4,4,../Datasets/synth_dataset/nv/nv_4,melanocytic nevi female upper extremity 55.0,upper extremity,female,55.0,nv
...,...,...,...,...,...,...,...
6995,6995,../Datasets/synth_dataset/df/df_995,dermatofibroma male upper extremity 35.0,upper extremity,male,35.0,df
6996,6996,../Datasets/synth_dataset/df/df_996,dermatofibroma female lower extremity 35.0,lower extremity,female,35.0,df
6997,6997,../Datasets/synth_dataset/df/df_997,dermatofibroma male lower extremity 50.0,lower extremity,male,50.0,df
6998,6998,../Datasets/synth_dataset/df/df_998,dermatofibroma female abdomen 65.0,abdomen,female,65.0,df


In [11]:
train_df = pd.concat([train_df_real, synth_df])[["dx", "age", "sex", "localization", "file_name"]].reset_index()
train_df

Unnamed: 0,index,dx,age,sex,localization,file_name
0,0,nv,35.0,female,lower extremity,../Datasets/dataset/train/nv/ISIC_0033319.jpg
1,1,nv,40.0,male,trunk,../Datasets/dataset/train/nv/ISIC_0030823.jpg
2,2,akiec,65.0,male,lower extremity,../Datasets/dataset/train/akiec/ISIC_0028730.jpg
3,3,nv,40.0,male,lower extremity,../Datasets/dataset/train/nv/ISIC_0027299.jpg
4,4,nv,65.0,male,back,../Datasets/dataset/train/nv/ISIC_0032444.jpg
...,...,...,...,...,...,...
15007,6995,df,35.0,male,upper extremity,../Datasets/synth_dataset/df/df_995
15008,6996,df,35.0,female,lower extremity,../Datasets/synth_dataset/df/df_996
15009,6997,df,50.0,male,lower extremity,../Datasets/synth_dataset/df/df_997
15010,6998,df,65.0,female,abdomen,../Datasets/synth_dataset/df/df_998


In [12]:
train_df = train_df.sample(frac=1).reset_index(drop=True)

In [13]:
test_df_real = test_df_real.sample(frac=1).reset_index(drop=True)

Check nan!

### Create Model

In [14]:
# 1. AGE INPUT
age_input = Input((N_DIM_AGE))

# 2. SEX INPUT
sex_input = Input((N_DIM_SEX))

# 3. LOCALIZATION INPUT
localization_input = Input((N_DIM_LOCALIZATION))

# 4. IMAGE INPUT: Convolution + Flatten for the image
image_input = Input((IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS))
base_model = BaseModel(weights='imagenet', include_top=False, input_shape=(IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS))
base_model.trainable = False

conv_layer = base_model(image_input)
conv_input = GlobalAveragePooling2D()(conv_layer)
print(conv_input)

# Concatenate the convolutional features and the vector input
concat_layer = Concatenate()([age_input, sex_input, localization_input, conv_input])
cl_layer1 = Dense(512, activation="relu")(concat_layer)
cl_layer2 = Dense(512, activation="relu")(cl_layer1)
cl_layer3 = Dense(512, activation="relu")(cl_layer2)
output = Dense(N_CLASSES_PREDICTOR, activation="softmax")(cl_layer3)

# define a model with a list of two inputs
model = Model(inputs=[age_input, sex_input, localization_input, image_input], outputs=output)
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

2023-03-22 07:57:00.679831: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-22 07:57:01.134402: E tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] The TF_FORCE_GPU_ALLOW_GROWTH environment variable is set but could not be parsed: "True". Valid values are "true" or "false". Using original config value of 0.
2023-03-22 07:57:01.134437: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46695 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:68:00.0, compute capability: 8.6


KerasTensor(type_spec=TensorSpec(shape=(None, 1280), dtype=tf.float32, name=None), name='global_average_pooling2d/Mean:0', description="created by layer 'global_average_pooling2d'")


In [15]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 efficientnetb0 (Functional)    (None, 7, 7, 1280)   4049571     ['input_4[0][0]']                
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 3)]          0           []                           

### Create Generator

In [26]:
class Generator(tf.keras.utils.Sequence):

    def __init__(self, df, X_col, y_col, batch_size, input_size, shuffle=True, test_env=False):
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.shuffle = shuffle
        self.data_generator_size = len(self.df)
        self.test_env = test_env
        self.used_synth_examples = set()

    def shuffle_df(self):
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)
            
    def on_epoch_end(self):
        self.shuffle_df()
    
    def add_augmentation(self, img):
        if tf.random.uniform(()) > 0.5:
            img = tf.image.flip_left_right(img)
        
        if tf.random.uniform(()) > 0.5:
            img = tf.image.random_flip_up_down(img)
            
        if tf.random.uniform(()) > 0.5:
            img = tf.keras.preprocessing.image.random_rotation(img, 0.2)
        return img
    
    
            
    def __get_image(self, image_id):
        if image_id.startswith("../Datasets/synth_dataset/"):
            for i in range(0, 16):
                if not((image_id + "_"+ str(i)) in self.used_synth_examples):
                    self.used_synth_examples.add(image_id + "_"+ str(i))
                    image_arr = tf.io.read_file(image_id + "_"+ str(i)+".jpg")
                    break
                elif i == 15:
                    print("reset!")
                    self.used_synth_examples = set()
                    self.used_synth_examples.add(image_id + "_"+ str(i))
                    image_arr = tf.io.read_file(image_id + "_"+ str(i)+".jpg")
                    break
                    
        else:
            image_arr = tf.io.read_file(image_id)

        image_arr = tf.io.decode_jpeg(image_arr)
        image_arr = tf.image.resize(image_arr, [IMAGE_HEIGHT, IMAGE_WIDTH], antialias=True, method="bicubic")
        if self.test_env == False:
            image_arr = self.add_augmentation(image_arr)
        image_arr = preprocess_input(image_arr)
        return image_arr

    def __get_input(self, image_id_batch):
        image_batch = np.array([self.__get_image(image_id) for image_id in image_id_batch]).reshape(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, IMAGE_CHANNELS)
        age_batch = np.array([self.df.loc[self.df['file_name'] == image_id].age.values[0] for image_id in image_id_batch]).reshape(BATCH_SIZE, 1) / 100
        age_batch = np.nan_to_num(age_batch, copy=True, nan=0.0, posinf=None, neginf=None)
        sex_batch = np.array([one_hot_encode_sex(self.df.loc[self.df['file_name'] == image_id].sex.values[0]) for image_id in image_id_batch]).reshape(BATCH_SIZE, 3)
        localization_batch = np.array([one_hot_encode_localization(self.df.loc[self.df['file_name'] == image_id].localization.values[0]) for image_id in image_id_batch]).reshape(BATCH_SIZE, 15)

        return [age_batch,
                sex_batch, 
                localization_batch,
                image_batch]

    def __get_output(self, label, num_classes):
        return one_hot_encode_dx(label)

    def __get_data(self, batches):
        image_id_batch = batches[self.X_col]
        label_batch = batches[self.y_col]
    
        X_batch = self.__get_input(list(image_id_batch))
        y0_batch = np.asarray([self.__get_output(y, N_CLASSES_PREDICTOR) for y in label_batch])

        return X_batch, y0_batch

    def __getitem__(self, index):
        batches = self.df[index * self.batch_size:(index + 1) * self.batch_size]
        X, y = self.__get_data(batches)
        return X, y
    
    def __len__(self):
        return self.data_generator_size // self.batch_size

In [27]:
train_generator = Generator(train_df, X_col="file_name", y_col="dx", batch_size = BATCH_SIZE, input_size=(IMAGE_HEIGHT, IMAGE_WIDTH))

In [28]:
test_generator = Generator(test_df_real, X_col="file_name", y_col="dx", batch_size = BATCH_SIZE, input_size=(IMAGE_HEIGHT, IMAGE_WIDTH), test_env = True, shuffle=False)

In [19]:
for bt in train_generator:
    print(bt[0][3][0][0][0][:2], bt[0][3][1][0][0][:2])
    break

[234.48503 184.70955] [202.46703 132.805  ]


### Train Model

In [20]:
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor = "val_accuracy", patience = 25, restore_best_weights = True, min_delta = 0.001, verbose = 1)

In [29]:
history = model.fit(train_generator, epochs=100, verbose=1, validation_data=test_generator, callbacks=[early_stopping_callback])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
 30/469 [>.............................] - ETA: 8:12 - loss: 0.2618 - accuracy: 0.9021reset!
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
reset!
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
reset!
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
reset!
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/10

In [30]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

# Konvertiere die Vorhersagen in Labels (one-hot)
class_indices = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']

y_label = np.array([])
y_pred = np.array([])

for inputs, targets in test_generator:
    y_batch_predict = model.predict(inputs)
    y_batch_label = targets
    #print(y_batch_label[0])
    #print(y_batch_predict[0])
    
    y_batch_predict = np.array(class_indices)[np.argmax(y_batch_predict, axis=1)]
    y_batch_label = np.array(class_indices)[np.argmax(y_batch_label, axis=1)]
    
    
    y_label = np.concatenate([y_label, y_batch_label])
    y_pred = np.concatenate([y_pred, y_batch_predict])


# Berechne Accuracy, Precision und Recall für alle Labels
accuracy = accuracy_score(y_label, y_pred)
classification = classification_report(y_label, y_pred, target_names=class_indices, digits=3, output_dict=True)

# Ausgabe der Ergebnisse für jede Klasse
for class_name in class_indices:
    metrics = classification[class_name]
    print(f'Class: {class_name}\nPrecision: {metrics["precision"]}\nRecall: {metrics["recall"]}\nF1-Score: {metrics["f1-score"]}\nSupport: {metrics["support"]}\n')
    
# Ausgabe der aggregierten Ergebnisse
print('Accuracy:', accuracy)

Class: akiec
Precision: 0.5277777777777778
Recall: 0.6031746031746031
F1-Score: 0.562962962962963
Support: 63

Class: bcc
Precision: 0.7011494252873564
Recall: 0.6039603960396039
F1-Score: 0.648936170212766
Support: 101

Class: bkl
Precision: 0.6894977168949772
Recall: 0.6926605504587156
F1-Score: 0.6910755148741419
Support: 218

Class: df
Precision: 0.6666666666666666
Recall: 0.5217391304347826
F1-Score: 0.5853658536585366
Support: 23

Class: mel
Precision: 0.625
Recall: 0.6531531531531531
F1-Score: 0.6387665198237885
Support: 222

Class: nv
Precision: 0.9308790383170549
Recall: 0.9322799097065463
F1-Score: 0.9315789473684211
Support: 1329

Class: vasc
Precision: 0.92
Recall: 0.8214285714285714
F1-Score: 0.8679245283018867
Support: 28

Accuracy: 0.8412298387096774


In [31]:
model.evaluate(test_generator)



[0.630845844745636, 0.8412298560142517]