In [225]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [226]:
# Diagnoses with fewer than this many rows are pooled into a single 'other' class.
MIN_SAMPLES_PER_CLASS = 12

# Load the metadata table (one row per image, diagnosis in the 'disease' column).
df = pd.read_csv("ddi_metadata.csv")

# Count rows per diagnosis and collect the labels that fall below the threshold.
counts = df['disease'].value_counts()
rare_classes = counts.loc[counts < MIN_SAMPLES_PER_CLASS].index

# Vectorised relabel: rows whose diagnosis is rare become 'other', the rest are kept.
is_rare = df['disease'].isin(rare_classes)
df['disease'] = df['disease'].mask(is_rare, 'other')
print(df['disease'].value_counts())

disease
other                              179
melanocytic-nevi                   119
seborrheic-keratosis                58
verruca-vulgaris                    50
basal-cell-carcinoma                41
epidermal-cyst                      35
mycosis-fungoides                   32
squamous-cell-carcinoma-in-situ     28
dermatofibroma                      22
acrochordon                         19
squamous-cell-carcinoma             17
dysplastic-nevus                    16
seborrheic-keratosis-irritated      14
pyogenic-granuloma                  14
neurofibroma                        12
Name: count, dtype: int64


In [227]:
# Replace each diagnosis string with an integer class id.
# `le` is kept so the ids can be mapped back to names later via `le.classes_`.
le = LabelEncoder().fit(df['disease'])
df['disease'] = le.transform(df['disease'])

# One output unit per distinct diagnosis seen by the encoder.
num_classes = len(le.classes_)

In [228]:
# Hold out 20% of the rows for validation. Stratifying on the encoded label keeps
# the class proportions the same in both splits; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['disease'],
)

for caption, split in (("Train size:", train_df), ("Val size:", val_df)):
    print(caption, len(split))

Train size: 524
Val size: 132


In [229]:
IMG_SIZE = (128,128)  # (height, width) every image is resized to
IMG_DIR = "ddidiversedermatologyimages"  # directory that holds the image files


def load_image(filename, label):
    """Read one image from IMG_DIR and return it with its label.

    Args:
        filename: file name of the image, relative to IMG_DIR.
        label: value passed through unchanged alongside the image.

    Returns:
        A tuple (image, label) where image has been decoded to 3 channels,
        resized to IMG_SIZE and divided by 255.
    """
    img_path = tf.strings.join([IMG_DIR, filename], separator="/")

    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.image.decode_png(raw_bytes, channels=3)
    resized = tf.image.resize(pixels, IMG_SIZE)

    # Scale pixel values from the 0-255 range down to 0-1.
    return resized / 255.0, label


In [230]:
# Random augmentation pipeline; applied to training images only (see `augment=True`
# in the dataset builder).
data_augmentation = models.Sequential()
data_augmentation.add(layers.RandomFlip(mode="horizontal"))   # left-right mirror
data_augmentation.add(layers.RandomRotation(factor=0.1))      # factor is a fraction of a full turn
data_augmentation.add(layers.RandomZoom(height_factor=0.1))   # zoom in/out by up to 10%

In [231]:
def build_dataset(df, batch_size=32, augment=False, shuffle=False):
    """Build a batched, prefetched tf.data pipeline from a metadata frame.

    Args:
        df: DataFrame with a 'DDI_file' column (image file names) and a
            'disease' column (labels).
        batch_size: number of examples per batch.
        augment: if True, run each image through `data_augmentation`.
        shuffle: if True, shuffle the examples (reshuffled on every pass).

    Returns:
        A tf.data.Dataset yielding (image_batch, label_batch) pairs.
    """
    filenames = df['DDI_file'].values
    labels = df['disease'].values

    ds = tf.data.Dataset.from_tensor_slices((filenames, labels))
    if shuffle:
        # Shuffle the lightweight (filename, label) pairs *before* decoding so the
        # shuffle buffer holds short strings instead of decoded float images, and
        # size the buffer to the whole frame so the shuffle is uniform.
        ds = ds.shuffle(buffer_size=max(len(df), 1), reshuffle_each_iteration=True)
    ds = ds.map(load_image, num_parallel_calls=tf.data.AUTOTUNE)
    if augment:
        # Random* layers only transform their input in training mode; say so
        # explicitly rather than relying on the layer's default.
        ds = ds.map(
            lambda x, y: (data_augmentation(x, training=True), y),
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [232]:
# Training pipeline: shuffled and augmented. Validation pipeline: fixed order, no augmentation.
train_ds = build_dataset(train_df, shuffle=True, augment=True)
val_ds = build_dataset(val_df, shuffle=False, augment=False)

In [233]:
# Per-class row counts for the training split, then the validation split.
for split_df in (train_df, val_df):
    print(split_df['disease'].value_counts())

disease
8     143
5      95
10     46
14     40
1      33
4      28
6      25
13     22
2      18
0      15
12     14
3      13
9      11
11     11
7      10
Name: count, dtype: int64
disease
8     36
5     24
10    12
14    10
1      8
6      7
4      7
13     6
0      4
2      4
12     3
9      3
3      3
11     3
7      2
Name: count, dtype: int64


In [234]:
# Small CNN classifier: three Conv -> MaxPool stages followed by a regularised
# dense head with one softmax output per class.
model = models.Sequential([
    # Derive the input shape from IMG_SIZE so the network always matches what
    # load_image produces (3 colour channels).
    layers.Input(shape=(*IMG_SIZE, 3)),
    layers.Conv2D(16, (3,3), activation='relu', padding='same'),
    layers.MaxPooling2D(2,2),
    layers.Conv2D(32, (3,3), activation='relu', padding='same'),
    layers.MaxPooling2D(2,2),
    layers.Conv2D(64, (3,3), activation='relu', padding='same'),
    layers.MaxPooling2D(2,2),
    layers.Flatten(),
    # L2 weight penalty and dropout are both applied to the dense head.
    layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)),
    layers.Dropout(0.5),
    layers.Dense(num_classes, activation='softmax')
])

In [235]:
# Labels are integer class ids (not one-hot), hence the sparse cross-entropy loss.
# The model already ends in a softmax, so the loss is computed on probabilities.
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [236]:
# Stop training once validation loss has not improved for 5 consecutive epochs,
# and ask Keras to restore the weights from the best epoch.
early_stopping_options = dict(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
callback = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [237]:
# Train for at most 50 epochs; the early-stopping callback may end the run sooner.
# `history` keeps the per-epoch metrics returned by fit.
history = model.fit(
    x=train_ds,
    epochs=50,
    validation_data=val_ds,
    callbacks=[callback],
)

# Persist the trained model to disk once fit has returned.
model.save("cnn_model.keras")

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.1597 - loss: 2.8270 - val_accuracy: 0.2727 - val_loss: 2.4940
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.2157 - loss: 2.5593 - val_accuracy: 0.2727 - val_loss: 2.4254
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.2639 - loss: 2.5234 - val_accuracy: 0.2727 - val_loss: 2.4040
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.2750 - loss: 2.4686 - val_accuracy: 0.2727 - val_loss: 2.4066
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.2523 - loss: 2.4554 - val_accuracy: 0.2727 - val_loss: 2.3907
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - accuracy: 0.2363 - loss: 2.4582 - val_accuracy: 0.2727 - val_loss: 2.3625
Epoch 7/50
[1m17/17[0m [32m━━━━