In [2]:
# Standard library
import os
from glob import glob

# Third-party: numerics, data handling, plotting, image I/O
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image

# scikit-learn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# Keras / TensorFlow. The tensorflow.keras imports intentionally come after the
# plain keras ones: they rebind Dense, Dropout and Flatten, as in the original order.
# from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization

from tensorflow.keras.applications import VGG16, ResNet50
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Mount Google Drive at /content/drive so the files referenced by the later
# cells (all paths start with /content/drive/MyDrive/...) are readable.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:


# Side length (pixels) images are resized to; used as the input size of both
# the VGG16 and ResNet50 models
SIZE = 224

# Location of the HAM10000 metadata table on the mounted Drive
metadata_path = '/content/drive/MyDrive/cancer_image_classification/HAM10000_metadata.csv'

# Read the metadata into a DataFrame
skin_df = pd.read_csv(metadata_path)


In [None]:
# Turn the diagnosis codes in `dx` into integer class ids stored in `label`
label_encoder = LabelEncoder()
label_encoder.fit(skin_df['dx'])
skin_df['label'] = label_encoder.transform(skin_df['dx'])

# Show the class names known to the encoder and a random sample of rows
print(f"Classes: {list(label_encoder.classes_)}")
print(skin_df.sample(10))

In [None]:
# Overview of the metadata: three bar charts of category counts plus an age histogram
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
ax1, ax2, ax3, ax4 = axes.flat  # row-major: top-left, top-right, bottom-left, bottom-right

# (axis, column, title) for each categorical panel
bar_panels = [
    (ax1, 'dx', 'Diagnosis Types'),
    (ax2, 'sex', 'Sex Distribution'),
    (ax3, 'localization', 'Localization'),
]
for panel_ax, column, title in bar_panels:
    skin_df[column].value_counts().plot(kind='bar', ax=panel_ax)
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(title)

# Age is numeric: histogram with a KDE overlay, rows with missing age dropped
sns.histplot(skin_df['age'].dropna(), kde=True, color='red', ax=ax4)
ax4.set_title('Age Distribution')

plt.tight_layout()
plt.show()


In [None]:
# Balance the dataset
class_counts = skin_df['label'].value_counts()
print(class_counts)

# Draw exactly n_samples rows from every class. Sampling is WITH replacement,
# so classes with fewer than n_samples rows are oversampled (rows repeat) and
# larger classes are cut down to n_samples draws.
n_samples = 500

# Iterate over the labels that are actually present (in ascending order)
# instead of assuming there are exactly 7 of them.
class_labels = sorted(class_counts.index)
balanced_dfs = [
    resample(
        skin_df[skin_df['label'] == class_label],
        replace=True,
        n_samples=n_samples,
        random_state=42,
    )
    for class_label in class_labels
]
skin_df_balanced = pd.concat(balanced_dfs)

# Check the new distribution: every class should now report n_samples rows
print(skin_df_balanced['label'].value_counts())


In [None]:
def load_image(path):
    """Load one image file as an RGB array resized to SIZE x SIZE.

    Parameters
    ----------
    path : str or None
        Location of the image file. ``None`` (or a NaN placeholder, which is
        how pandas may represent a missing path) means "no file available".

    Returns
    -------
    numpy.ndarray or None
        The resized image with three colour channels, or ``None`` when the
        path is missing or the file cannot be opened/decoded. Failures are
        reported with ``print`` rather than raised, so callers must check for
        ``None`` themselves.
    """
    if path is None or (isinstance(path, float) and np.isnan(path)):
        print("Warning: Path is None, skipping image.")
        return None
    try:
        # The context manager closes the underlying file handle even when
        # decoding fails; without it every call leaks an open file.
        with Image.open(path) as source:
            # Force 3 channels so grayscale/RGBA/CMYK files cannot produce
            # arrays whose shape differs from the rest of the dataset.
            resized = source.convert('RGB').resize((SIZE, SIZE))
        return np.asarray(resized)
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller
        # decide what to do with the missing image.
        print(f"Error loading image: {path}, Error: {str(e)}")
        return None

# Create a dictionary mapping image IDs (file name without extension) to their paths
image_paths = {
    os.path.splitext(os.path.basename(x))[0]: x
    for x in glob(os.path.join('/content/drive/MyDrive/cancer_image_classification/cancer_type/HAM10000/', '*.jpg'))
}

# Attach the file path to every row; IDs without a matching file get a missing value
skin_df_balanced['path'] = skin_df_balanced['image_id'].map(image_paths.get)

n_missing_paths = int(skin_df_balanced['path'].isna().sum())
if n_missing_paths:
    print(f"Warning: {n_missing_paths} rows have no matching image file.")

# The balanced frame was sampled with replacement, so the same file appears on
# many rows. Decode every distinct file once and reuse the resulting array for
# each repeated row instead of re-reading it from Drive every time.
# (Repeated rows therefore share one array object; treat the arrays as read-only.)
images_by_path = {
    path: load_image(path)
    for path in skin_df_balanced['path'].dropna().unique()
}

# Rows whose path is missing (or failed to load) end up with None here
skin_df_balanced['image'] = skin_df_balanced['path'].map(images_by_path.get)


In [None]:
# Ensure there are no None values in the image column
if skin_df_balanced['image'].isnull().any():
    raise ValueError("There are missing images in the dataset!")

# Stack the per-row image arrays into one 4D array (rows, height, width, channels).
# Pixel values are intentionally kept in the raw 0-255 range: the VGG16/ResNet50
# `preprocess_input` functions applied by the data generators expect that range
# and do their own centring, so dividing by 255 here would feed the pretrained
# networks inputs that are ~255x too small. float32 halves the memory of the
# default float64.
X = np.stack(skin_df_balanced['image'].tolist()).astype(np.float32)

# Extract labels and convert to one-hot encoding
Y = skin_df_balanced['label'].astype(int)  # Ensure labels are integers
# num_classes must match the 7-unit softmax output layer of the models
Y_cat = to_categorical(Y, num_classes=7)

# Check the shape of X and Y_cat to ensure correctness
assert len(X) == len(Y_cat), "images and labels are out of sync"
print(f"Shape of X: {X.shape}")
print(f"Shape of Y_cat: {Y_cat.shape}")


In [None]:

# The metadata is already loaded into `skin_df` (and label-encoded) by the
# earlier cells. Re-reading the CSV here would replace `skin_df` with a fresh
# frame that lacks the `label` column, so only the constants are restated.
metadata_path = '/content/drive/MyDrive/cancer_image_classification/HAM10000_metadata.csv'

SIZE = 224  # Size for ResNet50 and VGG16

# Guard against out-of-order execution of the notebook
assert 'label' in skin_df.columns, "Run the label-encoding cell before this one."


In [None]:
# Split the dataset into training and test sets.
# The balanced frame contains repeated rows (it was resampled with replacement),
# so a plain random row split would place copies of the same image on both
# sides and inflate the test score. Splitting by `image_id` group keeps every
# copy of an image on one side only.
# X and Y_cat were built from skin_df_balanced in row order, so positional
# indices from the splitter address all three consistently.
image_groups = skin_df_balanced['image_id'].to_numpy()
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_idx, test_idx = next(group_splitter.split(X, groups=image_groups))

x_train, x_test = X[train_idx], X[test_idx]
y_train, y_test = Y_cat[train_idx], Y_cat[test_idx]

# No image may appear on both sides of the split
assert not set(image_groups[train_idx]) & set(image_groups[test_idx])
print(f"Train rows: {len(x_train)}, test rows: {len(x_test)}")


In [None]:
# Batch generators for both models. No augmentation is configured here: the
# generators only apply each network's own `preprocess_input` function.
#
# x_train/x_test are already separate arrays, so `validation_split`/`subset`
# are not used: combining them with pre-split arrays silently drops part of
# each array (the training subset skips the first 25% of x_train, the
# validation subset keeps only the first 25% of x_test).

def make_generators(preprocess_fn, batch_size=32):
    """Build the (train, validation) data generators and iterators for one model.

    Parameters
    ----------
    preprocess_fn : callable
        Per-image preprocessing function handed to ``ImageDataGenerator``.
    batch_size : int, default 32
        Number of images per batch (32 is also the Keras default for ``flow``).

    Returns
    -------
    tuple
        ``(train_datagen, validation_datagen, train_generator, validation_generator)``.
        The train iterator is shuffled; the validation iterator is not, so its
        batches follow the row order of ``x_test`` / ``y_test``.
    """
    train_datagen = ImageDataGenerator(preprocessing_function=preprocess_fn)
    validation_datagen = ImageDataGenerator(preprocessing_function=preprocess_fn)

    train_generator = train_datagen.flow(
        x_train, y_train,
        batch_size=batch_size,
        shuffle=True,
    )
    validation_generator = validation_datagen.flow(
        x_test, y_test,
        batch_size=batch_size,
        shuffle=False,
    )
    return train_datagen, validation_datagen, train_generator, validation_generator


(train_datagen_vgg,
 validation_datagen_vgg,
 train_generator_vgg,
 validation_generator_vgg) = make_generators(vgg_preprocess)

(train_datagen_resnet,
 validation_datagen_resnet,
 train_generator_resnet,
 validation_generator_resnet) = make_generators(resnet_preprocess)

In [None]:
# Early-stopping callback: watches validation loss, waits 5 epochs without
# improvement before stopping, then restores the best weights seen
early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor='val_loss',
)

In [None]:
# VGG16 transfer-learning classifier: ImageNet-pretrained convolutional base
# (without its original top) followed by a new dense head
base_model_vgg = VGG16(include_top=False, weights='imagenet', input_shape=(SIZE, SIZE, 3))

# Freeze every layer of the pretrained base so only the new head is trained
for layer in base_model_vgg.layers:
    layer.trainable = False

# Head: flatten -> 512-unit ReLU -> 50% dropout -> 7-way softmax
x_vgg = base_model_vgg.output
for head_layer in (Flatten(), Dense(512, activation='relu'), Dropout(0.5)):
    x_vgg = head_layer(x_vgg)
predictions_vgg = Dense(7, activation='softmax')(x_vgg)

model_vgg = Model(inputs=base_model_vgg.input, outputs=predictions_vgg)
model_vgg.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)


In [None]:

# Fit the VGG16 classifier for at most 20 epochs; the early-stopping callback
# may end training sooner
history_vgg = model_vgg.fit(
    x=train_generator_vgg,
    validation_data=validation_generator_vgg,
    epochs=20,
    callbacks=[early_stopping],
)

In [None]:
# ResNet50 transfer-learning classifier: ImageNet-pretrained base (without its
# original top) followed by a new dense head
base_model_resnet = ResNet50(include_top=False, weights='imagenet', input_shape=(SIZE, SIZE, 3))

# Freeze every layer of the pretrained base so only the new head is trained
for layer in base_model_resnet.layers:
    layer.trainable = False

# Head: flatten -> 512-unit ReLU -> 50% dropout -> 7-way softmax
x_resnet = base_model_resnet.output
for head_layer in (Flatten(), Dense(512, activation='relu'), Dropout(0.5)):
    x_resnet = head_layer(x_resnet)
predictions_resnet = Dense(7, activation='softmax')(x_resnet)

model_resnet = Model(inputs=base_model_resnet.input, outputs=predictions_resnet)
model_resnet.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-4),
    metrics=['accuracy'],
)


In [None]:
# Fit the ResNet50 classifier for at most 20 epochs; the early-stopping
# callback may end training sooner
history_resnet = model_resnet.fit(
    x=train_generator_resnet,
    validation_data=validation_generator_resnet,
    epochs=20,
    callbacks=[early_stopping],
)


In [None]:
# Evaluate both models on the generators built from x_test / y_test
loss_vgg, accuracy_vgg = model_vgg.evaluate(validation_generator_vgg)
print(f"VGG16 - Validation Loss: {loss_vgg}")
print(f"VGG16 - Validation Accuracy: {accuracy_vgg}")

# The second model is a ResNet50 (see its definition), so label it as such
loss_resnet, accuracy_resnet = model_resnet.evaluate(validation_generator_resnet)
print(f"ResNet50 - Validation Loss: {loss_resnet}")
print(f"ResNet50 - Validation Accuracy: {accuracy_resnet}")


def predict_preprocessed(model, preprocess_fn, images, batch_size=32):
    """Predict class probabilities for `images` using the training-time preprocessing.

    During training the models only ever saw images that went through
    `preprocess_fn` (via the data generators), so calling ``model.predict`` on
    the raw array would feed them inputs on a different scale. The images are
    streamed through an unshuffled generator so that row ``i`` of the result
    corresponds to ``images[i]``.

    Parameters
    ----------
    model : keras Model
        Trained model to run.
    preprocess_fn : callable
        The same per-image preprocessing function used for that model's generators.
    images : numpy.ndarray
        4D image array in the same pixel scale that was given to the generators.
    batch_size : int, default 32
        Number of images per prediction batch.

    Returns
    -------
    numpy.ndarray
        Model outputs, one row per input image, in input order.
    """
    ordered_batches = ImageDataGenerator(preprocessing_function=preprocess_fn).flow(
        images,
        batch_size=batch_size,
        shuffle=False,
    )
    return model.predict(ordered_batches)


# Make predictions (rows aligned with x_test / y_test)
predictions_vgg = predict_preprocessed(model_vgg, vgg_preprocess, x_test)
predictions_resnet = predict_preprocessed(model_resnet, resnet_preprocess, x_test)