In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
import keras
from keras import layers
from keras.applications import EfficientNetB0
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Input locations, given relative to the notebook's working directory:
# the metadata table and the directory that holds the image files.
csv_file = "train-metadata.csv"
image_folder = "image"

In [3]:
# #Count the total number of images in the folder
# image_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.jpg','.png','.jpeg','.bmp','.gif'))]
# num_images = len(image_files)

# #For loop to calculate the image dimensions and check the different dimensions of images
# image_dimensions = set()
# for image_file in image_files:
#     with Image.open(os.path.join(image_folder,image_file)) as img:
#         image_dimensions.add(img.size) #add (width, height) as a single tuple
     

In [4]:
# Read the metadata table into a DataFrame and record how many rows it has
# (one row per image id, see the `isic_id` column).
df = pd.read_csv(csv_file)
num_records = df.shape[0]

# if num_images==num_records:
#     print("The number of training images and available labels in the dataframe is equal")
    
# print(f'The dimensions of images in the training data set is:{image_dimensions}')

In [5]:
# Preview the first rows of the metadata table to see its columns.
df.head()

Unnamed: 0.1,Unnamed: 0,isic_id,patient_id,target
0,0,ISIC_0000000,dummy_0,0
1,1,ISIC_0000001,dummy_1,0
2,2,ISIC_0000002,dummy_2,1
3,3,ISIC_0000003,dummy_3,0
4,4,ISIC_0000004,dummy_4,1


In [6]:
# Training constants.
# BATCH_SIZE: number of samples per mini-batch.
# IMG_SIZE: side length (pixels) of the square model input; the value follows
# from the choice of EfficientNet variant.
BATCH_SIZE = 64
IMG_SIZE = 224

In [7]:
# Class labels as a NumPy array, and one extension-less path per image id
# (image_folder joined with the `isic_id` value), both in DataFrame row order.
labels = df["target"].to_numpy()
image_paths = []
for isic_id in df["isic_id"]:
    image_paths.append(os.path.join(image_folder, isic_id))

In [8]:
# Show the first rows again; df has not been modified since the earlier preview.
df.head()

Unnamed: 0.1,Unnamed: 0,isic_id,patient_id,target
0,0,ISIC_0000000,dummy_0,0
1,1,ISIC_0000001,dummy_1,0
2,2,ISIC_0000002,dummy_2,1
3,3,ISIC_0000003,dummy_3,0
4,4,ISIC_0000004,dummy_4,1


In [9]:
# Inspect the last three generated paths (note: they carry no file extension).
image_paths[-3:]

['image\\ISIC_0073249', 'image\\ISIC_0073251', 'image\\ISIC_0073254']

In [10]:
# Spot-check a single label: index 4 corresponds to the fifth metadata row.
labels[4]

1

In [11]:
# Spot-check whether one specific image id is present in image_paths.
# The probe path is built with os.path.join, exactly like the entries of
# image_paths, so the comparison also works on platforms whose path
# separator is not a backslash.
idc = os.path.join(image_folder, 'ISIC_0069696')
if idc in image_paths:
    print("yes")

In [12]:
# Verify that every metadata row has a matching "<isic_id>.jpg" on disk.
#
# The check uses the image_folder configured at the top of the notebook
# instead of overwriting it with a machine-specific absolute path: this keeps
# the notebook portable and keeps image_folder consistent with the folder
# that image_paths was built from. (`os` is already imported in the first cell.)
missing_files = [
    isic_id
    for isic_id in df['isic_id']
    if not os.path.isfile(os.path.join(image_folder, isic_id + ".jpg"))
]

if missing_files:
    # Report the total but echo only a small sample so a large gap cannot
    # flood the cell output.
    print(f"Missing files ({len(missing_files)} total, first 10 shown):", missing_files[:10])
else:
    print("All files exist!")


All files exist!


In [13]:
# Inspect the first generated path (folder + isic_id, no extension).
image_paths[0]

'image\\ISIC_0000000'

In [14]:
# Count the rows per class to see how balanced the target column is.
df['target'].value_counts()

target
0    20808
1     4522
Name: count, dtype: int64

In [15]:
# Hold out 20% of the samples as a test set. Stratifying on the labels keeps
# the class proportions the same in both parts; the fixed random_state makes
# the split reproducible.
train_paths, test_paths, train_labels, test_labels = train_test_split(
    image_paths,
    labels,
    stratify=labels,
    random_state=123,
    test_size=0.2,
)

In [16]:
# Look at the first three training paths after the split.
train_paths[:3]

['image\\ISIC_0066109', 'image\\ISIC_0028851', 'image\\ISIC_0060479']

In [17]:
# Look at the labels that belong to those first three training paths.
train_labels[:3]

array([1, 0, 0], dtype=int64)

In [18]:
# Sanity check: look up the metadata row for the image id of the first training
# path, to compare its `target` with the first entry of train_labels.
df[df['isic_id']=='ISIC_0066109']

Unnamed: 0.1,Unnamed: 0,isic_id,patient_id,target
20803,20803,ISIC_0066109,dummy_20803,1


In [19]:
# Number of distinct values in the target column; used below as the one-hot
# depth and as the size of the model's output layer.
NUM_CLASSES = len(pd.unique(df["target"]))

    # One-hot encode labels
def one_hot_encode(image_path, label):
    """Load one JPEG image and one-hot encode its label.

    Intended to be used with ``tf.data.Dataset.map`` on ``(path, label)`` pairs.

    Args:
        image_path: scalar string tensor with the image path *without* its
            file extension; ``.jpg`` is appended here.
        label: scalar integer class index.

    Returns:
        A tuple ``(image, label)``: ``image`` is a float32 tensor of shape
        ``(IMG_SIZE, IMG_SIZE, 3)`` holding pixel values in ``[0, 255]``;
        ``label`` is a one-hot vector of length ``NUM_CLASSES``.
    """
    image_path = tf.strings.join([image_path, '.jpg'])
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    # Resize to the shared IMG_SIZE constant (instead of a second hard-coded
    # 224) so the pipeline cannot drift from the model's input_shape.
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    # Deliberately no division by 255: the keras.applications EfficientNet
    # models contain their own rescaling layer and expect raw [0, 255] pixel
    # values, so scaling here would normalise the input twice.

    label = tf.one_hot(label, depth=NUM_CLASSES)
    return image, label

# Resolve the paths against the current working directory so that the file
# reads inside the tf.data pipeline use absolute locations.
train_paths = [os.path.abspath(path) for path in train_paths]
test_paths = [os.path.abspath(path) for path in test_paths]

# Build the input pipelines from (path, label) pairs.
# - The training set is shuffled on the cheap path/label pairs (before any
#   image is decoded) and reshuffled every epoch; the buffer covers the whole
#   set so the shuffle is uniform. max(..., 1) keeps shuffle() valid for an
#   empty list.
# - Images are loaded and decoded in parallel; map() stays deterministic by
#   default, so the test set keeps its original order.
ds_train = (
    tf.data.Dataset.from_tensor_slices((train_paths, train_labels))
    .shuffle(max(len(train_paths), 1), seed=123)
    .map(one_hot_encode, num_parallel_calls=tf.data.AUTOTUNE)
)
ds_test = (
    tf.data.Dataset.from_tensor_slices((test_paths, test_labels))
    .map(one_hot_encode, num_parallel_calls=tf.data.AUTOTUNE)
)


In [20]:
# Display the training paths after they were converted to absolute paths.
# NOTE(review): this echoes the entire list; a slice such as train_paths[:5]
# would keep the cell output short.
train_paths

['d:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0066109',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0028851',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0060479',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0060032',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0033604',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0071932',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0014369_downsampled',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0031912',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0055650',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0054251',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0025419',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0029462',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0072353',
 'd:\\Self Study\\3 Cancer Image Classification\\image\\ISIC_0024

In [21]:
# Smoke test: read the raw bytes of one known image file to confirm that
# TensorFlow can open it. The path is assembled with os.path.join so it does
# not depend on a hard-coded backslash separator.
image = tf.io.read_file(os.path.join(image_folder, 'ISIC_0066109.jpg'))

In [22]:
# Group samples into mini-batches of BATCH_SIZE (64). The previous literal of
# 1000 contradicted both this constant and the original comment, and a batch of
# 1000 float32 images at 224x224x3 is roughly 600 MB before any activations.
# NOTE: this cell rebinds ds_train/ds_test, so re-running it without first
# re-running the cell that builds them would batch the already-batched data.
ds_train = ds_train.batch(BATCH_SIZE)
ds_test = ds_test.batch(BATCH_SIZE)
# Prefetch so that the next batch is prepared while the current one is consumed.
ds_train = ds_train.prefetch(tf.data.AUTOTUNE)
ds_test = ds_test.prefetch(tf.data.AUTOTUNE)

In [23]:
# Print the dataset description to check the element_spec (shapes and dtypes
# of the batched images and labels).
print(ds_train)

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 2), dtype=tf.float32, name=None))>


In [24]:
# Build EfficientNetB0 with randomly initialised weights (weights=None) and
# its own classification head sized to NUM_CLASSES, then compile it for
# one-hot targets: categorical cross-entropy loss, Adam optimiser, accuracy
# as the reported metric.
model = EfficientNetB0(
    weights=None,
    include_top=True,
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
    classes=NUM_CLASSES,
)
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)




In [25]:
# size = (IMG_SIZE, IMG_SIZE)
# ds_train = ds_train.map(lambda image, label: (tf.image.resize(image, size), label))
# ds_test = ds_test.map(lambda image, label: (tf.image.resize(image, size), label))

In [None]:
# Re-inspect the training dataset right before fitting.
ds_train

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 2), dtype=tf.float32, name=None))>

: 

In [None]:
# Number of passes over the training data. The slider's lower bound is 1 so
# that the value assigned below lies inside the declared range.
epochs = 1  # @param {type: "slider", min:1, max:100}
# Train the model; `hist` receives the object returned by model.fit.
# NOTE(review): ds_test (the held-out test split) is passed as validation_data,
# so it is not an untouched test set afterwards — confirm this is intended.
hist = model.fit(ds_train, epochs=epochs, validation_data=ds_test)

In [None]:
# Scratch arithmetic: 25,000 divided by 64 (presumably an estimate of the number
# of 64-sample batches for roughly 25,000 images — confirm or remove).
25000/64


390.625