In [None]:
from google.colab import files
uploaded = files.upload()
for fn in uploaded.keys():
  print(f'User uploaded file "{fn}" with length {len(uploaded[fn])} bytes')


Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [None]:
!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d kmader/skin-cancer-mnist-ham10000


403 - Forbidden - Permission 'datasets.get' was denied
Dataset URL: https://www.kaggle.com/datasets/kmader/skin-cancer-mnist-ham10000
License(s): CC-BY-NC-SA-4.0
Downloading skin-cancer-mnist-ham10000.zip to /content
100% 5.19G/5.20G [01:10<00:00, 101MB/s] 
100% 5.20G/5.20G [01:10<00:00, 79.6MB/s]


In [None]:
from zipfile import ZipFile
file_name = "/content/skin-cancer-mnist-ham10000.zip"
with ZipFile(file_name, 'r') as zip:
    zip.extractall()
    print('Dataset extracted successfully')

Dataset extracted successfully


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import VGG16  # Import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf

In [None]:
img_width, img_height = 128, 128
batch_size = 32
epochs = 20
metadata = pd.read_csv('/content/HAM10000_metadata.csv')
image_dirs = [
    '/content/HAM10000_images_part_1',
    '/content/HAM10000_images_part_2',
    '/content/ham10000_images_part_1',
    '/content/ham10000_images_part_2'
]

In [None]:
metadata.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [None]:
image_paths = [os.path.join(image_dir, fname + '.jpg') for image_dir in image_dirs for fname in metadata['image_id']]
labels = metadata['dx'].values
label_to_index = {label: i for i, label in enumerate(np.unique(labels))}
labels = [label_to_index[label] for label in labels]


In [None]:
# print("Length of image_paths:", len(image_paths))
# print("Length of labels:", len(labels))

In [None]:
if len(image_paths) != len(labels):
    if len(image_paths) > len(labels):
        labels = labels + [labels[-1]] * (len(image_paths) - len(labels))
    else:
        image_paths = image_paths + [image_paths[-1]] * (len(labels) - len(image_paths))


In [None]:
# Split the data into train, validation, and test sets
train_data, test_data = train_test_split(pd.DataFrame({'image_path': image_paths, 'label': labels}), test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.15, random_state=42)


In [None]:
# Data generators
train_datagen = ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    rescale=1./255
)

val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)


In [None]:
train_generator = train_datagen.flow_from_dataframe(
    train_data,
    x_col='image_path',
    y_col='label',
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='raw'
)

val_generator = val_datagen.flow_from_dataframe(
    val_data,
    x_col='image_path',
    y_col='label',
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='raw'
)

test_generator = test_datagen.flow_from_dataframe(
    test_data,
    x_col='image_path',
    y_col='label',
    target_size=(img_width, img_height),
    batch_size=batch_size,
    class_mode='raw'
)

Found 13589 validated image filenames.
Found 2443 validated image filenames.
Found 3998 validated image filenames.




In [None]:
# Base VGG16 model
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(img_width, img_height, 3))


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m58889256/58889256[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Fine-tune the last few layers
for layer in base_model.layers:
    layer.trainable = True


In [None]:
# Model architecture
x = base_model.output
x = Flatten()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(512, activation='relu')(x)
x = BatchNormalization()(x)
output = Dense(len(label_to_index), activation='softmax')(x)  # Output layer for multi-class classification


In [None]:
model = Model(inputs=base_model.input, outputs=output)


In [None]:
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
# Callbacks
best_model = ModelCheckpoint('best_model.keras', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
early_stop = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)  # Increased patience
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=0.00001)  # Adjusted patience


In [None]:
history = model.fit(
    train_generator,
    epochs=epochs,
    validation_data=val_generator,
    callbacks=[best_model, early_stop, reduce_lr]
)

Epoch 1/20


  self._warn_if_super_not_called()


[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 520ms/step - accuracy: 0.2436 - loss: 2.1760
Epoch 1: val_accuracy improved from -inf to 0.47524, saving model to best_model.keras
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 617ms/step - accuracy: 0.2438 - loss: 2.1752 - val_accuracy: 0.4752 - val_loss: 1.8430 - learning_rate: 1.0000e-04
Epoch 2/20
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 445ms/step - accuracy: 0.6717 - loss: 1.2119
Epoch 2: val_accuracy improved from 0.47524 to 0.76627, saving model to best_model.keras
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m254s[0m 512ms/step - accuracy: 0.6718 - loss: 1.2116 - val_accuracy: 0.7663 - val_loss: 0.8393 - learning_rate: 1.0000e-04
Epoch 3/20
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 414ms/step - accuracy: 0.7627 - loss: 0.8619
Epoch 3: val_accuracy did not improve from 0.76627
[1m425/425[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [None]:
test_loss, test_acc = model.evaluate(val_generator, steps=val_generator.samples // batch_size)
print("Test Accuracy:", test_acc)

[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 284ms/step - accuracy: 0.7755 - loss: 0.6596
Test Accuracy: 0.765625


In [None]:
# Predict labels for the test set
y_pred = model.predict(test_generator)
y_pred_classes = np.argmax(y_pred, axis=1)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 358ms/step


In [None]:
# Convert true labels to one-hot encoded format
y_true = tf.keras.utils.to_categorical(test_generator.labels, num_classes=len(label_to_index))
y_true_classes = np.argmax(y_true, axis=1)


In [None]:
# Generate classification report
print("Classification Report:")
print(classification_report(y_true_classes, y_pred_classes, target_names=label_to_index.keys()))


Classification Report:
              precision    recall  f1-score   support

       akiec       0.00      0.00      0.00        36
         bcc       0.00      0.00      0.00        56
         bkl       0.00      0.00      0.00       123
          df       0.00      0.00      0.00        14
         mel       0.77      1.00      0.87      3073
          nv       0.00      0.00      0.00       687
        vasc       0.00      0.00      0.00         9

    accuracy                           0.77      3998
   macro avg       0.11      0.14      0.12      3998
weighted avg       0.59      0.77      0.67      3998



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_true_classes, y_pred_classes)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[   0    0    0    0   36    0    0]
 [   0    0    0    0   56    0    0]
 [   0    0    0    0  123    0    0]
 [   0    0    0    0   14    0    0]
 [   0    0    0    0 3073    0    0]
 [   0    0    0    0  687    0    0]
 [   0    0    0    0    9    0    0]]
