<a href="https://colab.research.google.com/github/Tasneem1028/CV-DL/blob/main/Skin_Lesion_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: load the dataset

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from glob import glob
from PIL import Image
from scipy import stats
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import (
    Dense, Dropout, Flatten, Conv2D, MaxPooling2D,
    BatchNormalization, GlobalAveragePooling2D
)
from tensorflow.keras.applications import MobileNet
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.metrics import top_k_categorical_accuracy


In [None]:
# Read the metadata CSV into a DataFrame and preview its first rows.
METADATA_CSV = 'HAM10000_metadata.csv'
skin_df = pd.read_csv(METADATA_CSV)
print(skin_df.head())


In [None]:
# Encode the text diagnosis column 'dx' as integer class ids in a new
# 'label' column. After fitting, le.classes_[i] is the diagnosis string that
# was assigned label i.
le = LabelEncoder()
skin_df['label'] = le.fit_transform(skin_df['dx'])
print(list(le.classes_))

# Spot-check a few random rows to confirm that 'dx' and 'label' line up.
print(skin_df.sample(10))


In [None]:
# Data distribution overview: one 2x2 figure with the diagnosis counts, sex
# counts, lesion localization counts and the age distribution.
fig = plt.figure(figsize=(12, 8))

ax1 = fig.add_subplot(221)
skin_df['dx'].value_counts().plot(kind='bar', ax=ax1)
ax1.set_ylabel('Count')
ax1.set_title('Cell Type')

ax2 = fig.add_subplot(222)
skin_df['sex'].value_counts().plot(kind='bar', ax=ax2)
ax2.set_ylabel('Count', size=15)
ax2.set_title('Sex')

ax3 = fig.add_subplot(223)
# Target the axes explicitly, like the other panels, instead of relying on
# whichever axes happens to be "current".
skin_df['localization'].value_counts().plot(kind='bar', ax=ax3)
ax3.set_ylabel('Count', size=12)
ax3.set_title('Localization')

ax4 = fig.add_subplot(224)
# Rows without an age cannot be plotted or fitted.
sample_age = skin_df[pd.notnull(skin_df['age'])]
ages = sample_age['age']
# sns.distplot is deprecated; draw the same picture (density histogram, KDE
# and a fitted normal curve) with histplot plus an explicit scipy fit.
sns.histplot(ages, stat='density', kde=True, color='red', ax=ax4)
mu, sigma = stats.norm.fit(ages)
age_grid = np.linspace(ages.min(), ages.max(), 200)
ax4.plot(age_grid, stats.norm.pdf(age_grid, mu, sigma), color='black')
ax4.set_title('Age')

plt.tight_layout()
plt.show()

In [None]:
# Row count per encoded label before any balancing.
label_counts = skin_df['label'].value_counts()
print(label_counts)

In [None]:
# Balance the data: draw exactly n_samples rows from every class and stack
# the draws back into a single dataframe.
#
# resample(..., replace=True) samples with replacement, so a class with fewer
# than n_samples rows ends up with repeated rows, while a larger class is
# reduced to n_samples rows.
#
# NOTE(review): the train/val/test split is made from this resampled frame,
# so repeated rows of the same image can land in different splits and
# inflate validation/test scores. Consider splitting first and resampling
# only the training portion.
n_samples = 500

# groupby yields the classes in ascending label order and keeps each class's
# rows in their original order, so the result does not depend on how many
# classes there are.
balanced_parts = [
    resample(class_rows, replace=True, n_samples=n_samples, random_state=42)
    for _, class_rows in skin_df.groupby('label')
]

# Combine back into a single dataframe.
skin_df_balanced = pd.concat(balanced_parts)




In [None]:
# Sanity check: after balancing, every label should report the same count.
balanced_counts = skin_df_balanced['label'].value_counts()
print(balanced_counts)

In [None]:


# Folders that hold the image files.
image_folders = ['HAM10000_images_part_1', 'HAM10000_images_part_2']

# Index every .jpg in both folders by its file name: '<name>.jpg' -> path.
image_path = {}
for folder in image_folders:
    for path in glob(os.path.join(folder, '*.jpg')):
        image_path[os.path.basename(path)] = path

# Resolve each balanced row's own image_id to a file on disk. The file name
# is built on the fly so no dataframe column is modified in place (re-running
# this cell is safe) and no index alignment with another frame is needed.
skin_df_balanced['path'] = (skin_df_balanced['image_id'] + '.jpg').map(image_path.get)

# Fail here, with the offending ids, rather than later when an image is opened.
missing_ids = skin_df_balanced.loc[skin_df_balanced['path'].isna(), 'image_id'].unique()
if len(missing_ids) > 0:
    raise FileNotFoundError(
        f"{len(missing_ids)} image file(s) not found in {image_folders}; "
        f"first missing ids: {list(missing_ids[:5])}"
    )

# Quick checks
print("Total images found:", len(image_path))
print(skin_df_balanced.sample(3))
print(skin_df['label'].head())
print(skin_df['dx'].head())




In [None]:

SIZE = 224


def _load_rgb_array(path, size=SIZE):
    """Return the image at *path* as a (size, size, 3) uint8 RGB array."""
    # The context manager closes the underlying file as soon as the pixels
    # have been read.
    with Image.open(path) as img:
        return np.asarray(img.convert("RGB").resize((size, size)), dtype=np.uint8)


# The balanced frame repeats rows (it was sampled with replacement), so
# decode each distinct file once and let repeated rows share the result.
decoded_images = {p: _load_rgb_array(p) for p in skin_df_balanced['path'].unique()}
skin_df_balanced['image'] = skin_df_balanced['path'].map(decoded_images.get)


In [None]:
# Show three random rows of the balanced frame, now including the image column.
preview_rows = skin_df_balanced.sample(3)
print(preview_rows)

In [None]:
# Show a grid of example images: one row per diagnosis, a few random images
# per row.
# A dedicated name is used so the balancing parameter `n_samples` is not
# silently changed by running this cell.
n_plot_samples = 5
class_groups = skin_df_balanced.sort_values(['dx']).groupby('dx')
n_classes = class_groups.ngroups

# squeeze=False keeps m_axs two-dimensional even with a single row or column.
fig, m_axs = plt.subplots(n_classes, n_plot_samples,
                          figsize=(4 * n_plot_samples, 3 * n_classes),
                          squeeze=False)
for n_axs, (type_name, type_rows) in zip(m_axs, class_groups):
    # Only the first image of each row carries the diagnosis name.
    n_axs[0].set_title(type_name)
    picked_rows = type_rows.sample(n_plot_samples, random_state=1234)
    for c_ax, (_, c_row) in zip(n_axs, picked_rows.iterrows()):
        c_ax.imshow(c_row['image'])
        c_ax.axis('off')
plt.show()



In [None]:
# Stack the per-row images into one array and scale the pixels to 0-1.
# The array is built as float32 and scaled in place: dividing a uint8 array
# by 255. would create a float64 copy with twice the memory footprint.
X = np.asarray(skin_df_balanced['image'].tolist(), dtype=np.float32)
X /= 255.0

Y = skin_df_balanced['label']  # integer class ids
# One-hot encode the labels for the categorical cross-entropy loss.
Y_cat = to_categorical(Y, num_classes=7)

# Hold out 10% of the rows as the test set. Stratifying on the label keeps
# the per-class proportions the same in both parts.
X_trainval, X_test, Y_trainval, Y_test = train_test_split(
    X, Y_cat,
    test_size=0.10,
    stratify=Y,
    random_state=42
)

# Take 0.1111 of the remaining 90% as validation, i.e. roughly 10% of all
# rows, again stratified (here on the one-hot rows).
X_train, X_val, Y_train, Y_val = train_test_split(
    X_trainval, Y_trainval,
    test_size=0.1111,
    stratify=Y_trainval,
    random_state=42
)

print("Train set:", X_train.shape, Y_train.shape)
print("Validation set:", X_val.shape, Y_val.shape)
print("Test set:", X_test.shape, Y_test.shape)

In [None]:
# Augmentation: random transforms applied to the training images.
augmentation_options = dict(
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Define the MobileNet-based model: a frozen ImageNet-pretrained MobileNet
# as feature extractor, followed by global average pooling, dropout and a
# softmax classifier with one unit per class.
image_size = 224
num_classes = 7

base_model = MobileNet(input_shape=(image_size, image_size, 3), include_top=False, weights='imagenet')
base_model.trainable = False  # train only the new classification head

# The rest of the notebook feeds pixels scaled to [0, 1], whereas the
# ImageNet MobileNet weights expect inputs in [-1, 1]. The rescaling is done
# inside the model so training, evaluation and single-image inference all
# stay consistent without changing their preprocessing.
inputs = tf.keras.Input(shape=(image_size, image_size, 3))
x = tf.keras.layers.Rescaling(scale=2.0, offset=-1.0)(inputs)
# training=False keeps the frozen backbone in inference mode.
x = base_model(x, training=False)
x = GlobalAveragePooling2D()(x)
x = Dropout(0.25)(x)
predictions = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=inputs, outputs=predictions)

# Custom accuracy metrics
def top_3_accuracy(y_true, y_pred):
    """Return top-k categorical accuracy of y_pred against y_true with k=3."""
    top3 = top_k_categorical_accuracy(y_true, y_pred, k=3)
    return top3

def top_2_accuracy(y_true, y_pred):
    """Return top-k categorical accuracy of y_pred against y_true with k=2."""
    top2 = top_k_categorical_accuracy(y_true, y_pred, k=2)
    return top2

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy', top_2_accuracy, top_3_accuracy],
)

# Class weights, keyed by encoded label:
#   0 akiec - Actinic Keratosis
#   1 bcc   - Basal Cell Carcinoma
#   2 bkl   - Benign Keratosis
#   3 df    - Dermatofibroma
#   4 mel   - Melanoma
#   5 nv    - Melanocytic nevi
#   6 vasc  - Vascular lesion
# Melanoma is the only class weighted above 1.0.
class_weights = {
    0: 1.0,
    1: 1.0,
    2: 1.0,
    3: 1.0,
    4: 1.7,
    5: 1.0,
    6: 1.0,
}

# Checkpoint and learning rate reduction callbacks, both driven by the
# validation categorical accuracy (higher is better).
filepath = "mobilenet_tl.keras"
checkpoint = ModelCheckpoint(filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')

reduce_lr = ReduceLROnPlateau(monitor='val_categorical_accuracy', factor=0.55, patience=2, verbose=1, mode='max', min_lr=0.00001)

callbacks_list = [checkpoint, reduce_lr]


batch_size = 32
epochs = 10

# steps_per_epoch is left for Keras to take from the iterator's own length,
# so the final partial batch is part of every epoch instead of being cut off
# by len(X_train) // batch_size.
history = model.fit(
    datagen.flow(X_train, Y_train, batch_size=batch_size),
    validation_data=(X_val, Y_val),
    epochs=epochs,
    class_weight=class_weights,
    callbacks=callbacks_list,
    verbose=1
)

# Evaluate on validation set (optional). The values come back in the order
# loss, then the metrics as passed to compile().
val_loss, val_cat_acc, val_top_2_acc, val_top_3_acc = model.evaluate(X_val, Y_val)

print('val_loss:', val_loss)
print('val_cat_acc:', val_cat_acc)
print('val_top_2_acc:', val_top_2_acc)
print('val_top_3_acc:', val_top_3_acc)


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix


# The ModelCheckpoint callback (save_best_only=True) already wrote the model
# with the best validation accuracy to "mobilenet_tl.keras". Save the
# last-epoch model under its own name so that checkpoint is not overwritten.
final_model_path = "mobilenet_tl_final.keras"
model.save(final_model_path)
print(f"Model saved to {final_model_path}")

# Plot the per-epoch curves recorded by model.fit, when training ran in this
# session.
if 'history' not in locals():
    print("Skipping graphs: 'history' variable not found (model was likely loaded, not just trained).")
else:
    recorded = history.history
    # The accuracy key depends on the metric name used at compile time, so
    # try both spellings.
    train_acc_curve = recorded.get('categorical_accuracy', recorded.get('accuracy'))
    val_acc_curve = recorded.get('val_categorical_accuracy', recorded.get('val_accuracy'))
    # Distinct names: the scalar `val_loss` returned by model.evaluate must
    # not be replaced by a per-epoch list.
    train_loss_curve = recorded['loss']
    val_loss_curve = recorded['val_loss']

    epochs_range = range(1, len(train_loss_curve) + 1)

    plt.figure(figsize=(14, 5))

    # Plot Accuracy (skipped when no accuracy metric was recorded)
    plt.subplot(1, 2, 1)
    if train_acc_curve is not None and val_acc_curve is not None:
        plt.plot(epochs_range, train_acc_curve, label='Training Accuracy')
        plt.plot(epochs_range, val_acc_curve, label='Validation Accuracy')
        plt.legend(loc='lower right')
    else:
        print("No accuracy metric found in history; plotting loss only.")
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.grid(True)

    # Plot Loss
    plt.subplot(1, 2, 2)
    plt.plot(epochs_range, train_loss_curve, label='Training Loss')
    plt.plot(epochs_range, val_loss_curve, label='Validation Loss')
    plt.title('Training and Validation Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(loc='upper right')
    plt.grid(True)

    plt.show()

# Score the held-out test set: keep the per-class probabilities for the
# report and confusion matrix, then compute the aggregate metrics.
print("Generating predictions...")
predictions = model.predict(X_test)

# evaluate() returns the loss followed by the metrics in compile() order.
test_loss, test_cat_acc, test_top_2_acc, test_top_3_acc = model.evaluate(X_test, Y_test)

# The header emoji was mis-encoded (UTF-8 bytes shown as Latin-1 characters).
print('\n📌 TEST METRICS')
print(f'Test loss: {test_loss:.4f}')
print(f'Test categorical accuracy: {test_cat_acc:.4f}')
print(f'Test top-2 accuracy: {test_top_2_acc:.4f}')
print(f'Test top-3 accuracy: {test_top_3_acc:.4f}')


# Convert the one-hot ground truth and the predicted probabilities to
# integer labels.
y_true = np.argmax(Y_test, axis=1)
y_pred = np.argmax(predictions, axis=1)

# HAM10000 class names; position i must be the diagnosis encoded as label i.
class_names = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc']
class_ids = list(range(len(class_names)))

print("\n📌 CLASSIFICATION REPORT (HAM10000)\n")
# labels= pins the report to all classes, so it still works when a class is
# missing from y_true/y_pred; zero_division=0 reports 0 for such a class.
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    digits=2,
    zero_division=0
))

print("\n📌 CONFUSION MATRIX\n")
# labels= fixes the matrix to one row/column per class name, so it always
# matches the tick labels even if a class never occurs in y_true or y_pred.
cm = confusion_matrix(y_true, y_pred, labels=list(range(len(class_names))))

# Rows are the true classes, columns the predicted classes.
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


In [None]:
# Ask for the path of the image to classify. input() already returns a str;
# surrounding whitespace and quotes (common when a path is pasted) are
# removed so the file can be opened.
# Note: this rebinds `image_path`, which earlier held the file-name -> path
# dictionary.
image_path = input("image for testing: ").strip().strip('"\'')

In [None]:





# print("Predicted label:", predicted_label)
def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image file and turn it into a one-image float32 batch.

    The image is converted to RGB, resized to ``target_size``, scaled to
    [0, 1] and given a leading batch axis, so the result has shape
    (1, height, width, 3).
    """
    # Force three channels regardless of the file's own mode.
    rgb_image = Image.open(image_path).convert("RGB")
    resized = rgb_image.resize(target_size)

    # Same 0-1 scaling as the training data.
    scaled = np.array(resized).astype("float32") / 255.0

    # Leading axis of length 1: the model expects a batch.
    return scaled[np.newaxis, ...]

def classify_image(image_path, model):
    """Return the index of the class the model scores highest for one image.

    The image at ``image_path`` is prepared with ``preprocess_image`` and
    passed to ``model.predict``.
    """
    batch = preprocess_image(image_path)
    class_scores = model.predict(batch)

    # One argmax per batch row; the batch holds a single image.
    best_per_row = np.argmax(class_scores, axis=1)
    return best_per_row[0]


# Custom metric functions (same as in training); they are handed to
# load_model via custom_objects.
def top_2_accuracy(y_true, y_pred):
    """Return top-k categorical accuracy of y_pred against y_true with k=2."""
    top_k = tf.keras.metrics.top_k_categorical_accuracy
    return top_k(y_true, y_pred, k=2)

def top_3_accuracy(y_true, y_pred):
    """Return top-k categorical accuracy of y_pred against y_true with k=3."""
    top_k = tf.keras.metrics.top_k_categorical_accuracy
    return top_k(y_true, y_pred, k=3)

# Load the trained model. Training in this notebook writes its checkpoint to
# "mobilenet_tl.keras"; no cell produces a "models.h5", so load the file
# that actually exists. The custom metrics must be supplied because the
# model was compiled with them.
model = load_model(
    "mobilenet_tl.keras",
    custom_objects={"top_2_accuracy": top_2_accuracy, "top_3_accuracy": top_3_accuracy}
)

# Example usage (make sure image_path is defined)
predicted_label = classify_image(image_path, model)
print("Predicted label:", predicted_label)


In [None]:
# Build a lookup from encoded label to diagnosis code using the distinct
# (label, dx) pairs of the balanced frame, ordered by label.
label_dx_pairs = skin_df_balanced[['label', 'dx']].drop_duplicates()
label_dx_pairs = label_dx_pairs.sort_values('label')
label_to_dx_mapping = label_dx_pairs.set_index('label')['dx'].to_dict()

# Print the mapping
print(label_to_dx_mapping)