<a href="https://colab.research.google.com/github/Smiah11/CO3519/blob/main/SaeedMiah_CO3519_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import numpy as np
import csv
import zipfile
from google.colab import drive
import PIL
from PIL import Image
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report

#Mount Drive to access Data

In [None]:
#mount google drive
drive.mount('/content/drive',force_remount=True)
#get the path to the data
train_image_path = '/content/drive/My Drive/ISIC_2020_Training_JPEG/train'
train_data_path = '/content/drive/MyDrive/ISIC_2020_Training_GroundTruth_v2.csv'
test_image_path = '/content/drive/My Drive/ISIC_2020_Test_JPEG/ISIC_2020_Test_Input'
test_data_path = '/content/drive/MyDrive/ISIC_2020_Test_Metadata (1).csv'

df = pd.read_csv(train_data_path)

Mounted at /content/drive


In [None]:
# Function to count the number of images in a directory
def count_images_in_directory(data_dir):
    num_images = sum([len(files) for _, _, files in os.walk(data_dir)])
    return num_images

# Count images in the training and test directories
train_image_count = count_images_in_directory(train_image_path)
test_image_count = count_images_in_directory(test_image_path)

print(f"Number of training images: {train_image_count}")
print(f"Number of test images: {test_image_count}")


Number of training images: 66252
Number of test images: 10984


In [None]:
# Create images array, will store 2D array of [image_name, {data}]
images = []

with open(train_data_path) as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        # Skip first row
        if row[0] == "image_name":
            continue

        # Add row to array
        images.append([
            row[0],
            {
                "id": row[1],
                "sex": row[2],
                "age": row[3],
                "anatom_site": row[4],
                "diagnosis": row[5],
                "benign_malignant": row[6],
                "target": row[7]
            }
        ])


#Pre Process Data
Remove Duplicates and unknowns then resize

In [None]:
import pandas as pd

print(f"Initial number of records: {len(df)}")

# Remove duplicate rows based on image_name
df = df.drop_duplicates(subset='image_name')
print(f"Remaining records after removing duplicates: {len(df)}")

# Remove rows with unknown records
df = df[df['diagnosis'] != 'unknown']
print(f"Remaining records after removing unknowns: {len(df)}")


Initial number of records: 33126
Remaining records after removing duplicates: 33126
Remaining records after removing unknowns: 6002


Resize the images and put them in a save

In [None]:
import os
from PIL import Image

# Define  image path to save resized images
image_output = os.path.join(train_image_path, 'resized')

# Make sure output directory exists
if not os.path.exists(image_output):
    os.makedirs(image_output)

# Define image size
image_size = (224,224)

# Iterate over  dataframe
for index, row in df.iterrows():
    # Construct file paths
    file_name = row['image_name'] + '.jpg'
    input_path = os.path.join(train_image_path, file_name)
    output_path = os.path.join(image_output, file_name)

    # If the image has not been resized yet
    if not os.path.exists(output_path):
        # Open, resize, and save image
        with Image.open(input_path) as img:
            img_resized = img.resize(image_size)
            img_resized.save(output_path)
            print(f"Image {file_name} has been resized and saved to {output_path}")

#OverSampling (Class Imbalance)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()

# Apply the label encoder to the benign_malignant column
df['benign_malignant'] = le.fit_transform(df['benign_malignant'])

print(le.classes_)

['benign' 'malignant']


In [None]:
# Print the initial class distribution
print('Initial class distribution:\n', df['benign_malignant'].value_counts())
# 0 = benign
# 1 = malignant

Initial class distribution:
 0    5418
1     584
Name: benign_malignant, dtype: int64


#Using Naive Random Oversampling to balance the class distribution


kept filling the dataframe with empties? commented out

In [None]:
## Separate majority and minority classes
#df_majority = df[df.benign_malignant==0]
#df_minority = df[df.benign_malignant==1]

# Count how many instances are in the majority class
#majority_count = df_majority.shape[0]

# Upsample minority class
#df_minority_upsampled = df_minority.sample(majority_count, replace=True)

# Combine majority class with upsampled minority class
#df_upsampled = pd.concat([df_majority, df_minority_upsampled])

# Print the new class distribution
#print('\nNew class distribution:\n', df_upsampled['benign_malignant'].value_counts())

print(df.head())

      image_name  patient_id   lesion_id     sex  age_approx  \
2   ISIC_0052212  IP_2842074  IL_9087444  female        50.0   
12  ISIC_0076995  IP_2235340  IL_7147389  female        55.0   
26  ISIC_0084086  IP_4023055  IL_9753248    male        60.0   
27  ISIC_0084270  IP_2961528  IL_3011969    male        40.0   
28  ISIC_0084395  IP_0175539  IL_8767924  female        45.0   

   anatom_site_general_challenge diagnosis benign_malignant  target  
2                lower extremity     nevus           benign       0  
12                         torso     nevus           benign       0  
26               lower extremity     nevus           benign       0  
27               lower extremity     nevus           benign       0  
28                         torso     nevus           benign       0  


# Model Development

In [None]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout

# Load the pre-trained MobileNetV2 model, excluding its final layer
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add a global spatial average pooling layer
x = base_model.output
x = GlobalAveragePooling2D()(x)

# Add a dropout layer for regularisation
x = Dropout(0.5)(x)

# Add a final dense layer for classification
predictions = Dense(1, activation='sigmoid')(x)

# Define the model
model = Model(inputs=base_model.input, outputs=predictions)

# Freeze the base_model
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "model_8"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 224, 224, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 112, 112, 32  864         ['input_9[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 112, 112, 32  128         ['Conv1[0][0]']                  
                                )                                                           

# Model Training

Had an issue where the image name had .jpg 3 times on it so this code removes the additional .jpg

In [None]:

#print(df.head)
df['image_name'] = df['image_name'].apply(lambda x: x.replace('.jpg', ''))

#  add .jpg back to image name
df['image_name'] = df['image_name'].apply(lambda x: f"{x}.jpg")

print(df['image_name'].head())

2     ISIC_0052212.jpg
12    ISIC_0076995.jpg
26    ISIC_0084086.jpg
27    ISIC_0084270.jpg
28    ISIC_0084395.jpg
Name: image_name, dtype: object


In [65]:
import tensorflow as tf
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Split the dataframe into training and validation sets
df_train, df_valid = train_test_split(df, test_size=0.2, random_state=42)

image_directory = '/content/drive/MyDrive/ISIC_2020_Training_JPEG/train/resized'

# Create an ImageDataGenerator object
train_datagen = ImageDataGenerator(rescale=1./255)
valid_datagen = ImageDataGenerator(rescale=1./255)

# Convert the labels back to string
df['benign_malignant'] = df['benign_malignant'].replace({0: 'benign', 1: 'malignant'})

# Create data flows
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory= image_directory,
    x_col="image_name",
    y_col="benign_malignant",
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)

valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=df_valid,
    directory= image_directory,
    x_col="image_name",
    y_col="benign_malignant",
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary'
)

# Create a MobileNetV2 model
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add a new top layer
x = base_model.output
x = tf.keras.layers.GlobalAveragePooling2D()(x)
predictions = tf.keras.layers.Dense(1, activation='sigmoid')(x)

# This is the model we will train
model = tf.keras.models.Model(inputs=base_model.input, outputs=predictions)

# First: train only the top layers (which were randomly initialized)
for layer in base_model.layers:
    layer.trainable = False

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    train_generator,
    epochs=10,
    validation_data=valid_generator,
)



Found 4801 validated image filenames belonging to 2 classes.
Found 1201 validated image filenames belonging to 2 classes.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Save Model
---



In [54]:
model.save("/content/drive/MyDrive/DataModel")



# Model Evaluation

In [59]:
# Create a test generator
test_generator = train_datagen.flow_from_dataframe(
    dataframe= df_train,
    directory=image_directory,
    x_col="image_name",
    y_col="benign_malignant",
    target_size=(224, 224),
    batch_size=32,
    class_mode='binary',
    shuffle=False
)

# Evaluate the model
loss, accuracy = model.evaluate(test_generator)

# Print the test loss and accuracy
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

Found 4801 validated image filenames belonging to 2 classes.
Test loss: 0.18913358449935913
Test accuracy: 0.9370964169502258


#Model Testing

Model Testing with images found in google

In [63]:
import cv2
import numpy as np
from keras.models import load_model

# Load the pre-trained model
model = load_model('/content/drive/MyDrive/DataModel')

# Set the image path
image_path = '/content/drive/MyDrive/Google_Melanoma/benign.png'

# change the file at end of image_path to any of these:
# benign.png result should be benign
# melanoma_1.png result should be malignant
# Melanoma.jpg result should be malignant


# Load and preprocess the image
image = cv2.imread(image_path)

# Check if the image was loaded successfully
if image is None:
    print(f"Failed to load image: {image_path}")
else:
    image = cv2.resize(image, (224, 224))
    image = image / 255.0  # Normalize pixel values
    image = np.expand_dims(image, axis=0)  # Reshape to match the input shape of the model

    # Make a prediction
    prediction = model.predict(image)

    # Interpret the result
    if prediction[0][0] < 0.5:
        print("Predicted class: Benign")
    else:
        print("Predicted class: Malignant")
    print(f"Probability: {prediction[0][0]:.2f}") # displays probability, close to 0 meaning benign, clost to 1 meaning malignant and if its 0.5 then the model is unsure

Predicted class: Benign
Probability: 0.04


Model Testing with ISIC test data

In [64]:
import cv2
import numpy as np
from keras.models import load_model
import os

# Load the pre-trained model
model = load_model('/content/drive/MyDrive/DataModel')

# Set the directory path
directory_path = test_image_path

# Loop over all files in the directory
for filename in os.listdir(directory_path):
    # Only process .png files
    if filename.endswith(".jpg"):
        # Construct full image path
        image_path = os.path.join(directory_path, filename)

        # Load and preprocess the image
        image = cv2.imread(image_path)

        # Check if the image was loaded successfully
        if image is None:
            print(f"Failed to load image: {image_path}")
        else:
            image = cv2.resize(image, (224, 224))
            image = image / 255.0  # Normalize pixel values
            image = np.expand_dims(image, axis=0)  # Reshape to match the input shape of the model

            # Make a prediction
            prediction = model.predict(image)

            # Interpret the result
            if prediction[0][0] < 0.5:
                print(f"{filename}: Predicted class: Benign")
            else:
                print(f"{filename}: Predicted class: Malignant")
            print(f"Probability: {prediction[0][0]:.2f}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
ISIC_7750466.jpg: Predicted class: Benign
Probability: 0.07
ISIC_0223354.jpg: Predicted class: Benign
Probability: 0.20
ISIC_2531776.jpg: Predicted class: Benign
Probability: 0.34
ISIC_2323273.jpg: Predicted class: Benign
Probability: 0.17
ISIC_0416431.jpg: Predicted class: Benign
Probability: 0.16
ISIC_1789614.jpg: Predicted class: Benign
Probability: 0.05
ISIC_0223760.jpg: Predicted class: Malignant
Probability: 0.60
ISIC_7259357.jpg: Predicted class: Benign
Probability: 0.09
ISIC_3084021.jpg: Predicted class: Benign
Probability: 0.25
ISIC_2731928.jpg: Predicted class: Benign
Probability: 0.30
ISIC_9843071.jpg: Predicted class: Benign
Probability: 0.07
ISIC_6182913.jpg: Predicted class: Benign
Probability: 0.08
ISIC_3075041.jpg: Predicted class: Benign
Probability: 0.07
ISIC_2343290.jpg: Predicted class: Benign
Probability: 0.15
ISIC_1727295.jpg: Predicted class: Benign
Probability: 0.32
ISIC_9539153.jpg: Predicted clas