<a href="https://colab.research.google.com/github/SirReinz/ENGGSCAI/blob/Rudra/ENGG2112SCAI_Rudra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from PIL import Image
import kagglehub
import os
from google.colab import userdata

# ISIC Data

In [None]:
# import datasets
ISIC = kagglehub.dataset_download("nodoubttome/skin-cancer9-classesisic")
print(ISIC)


Using Colab cache for faster access to the 'skin-cancer9-classesisic' dataset.
/kaggle/input/skin-cancer9-classesisic


In [None]:
# Load dataset into pandas DataFrame
# This skin cancer dataset typically contains CSV files with metadata and image paths

# Look for CSV files
csv_files = []
for root, dirs, files in os.walk(ISIC):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")

# Load the main dataset CSV (usually contains metadata)
if csv_files:
    # Load the first CSV file found
    df = pd.read_csv(csv_files[0])
    print(f"\nLoaded dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())
else:
    print("No CSV files found. This might be an image-only dataset.")
    # In this case, you might need to create a DataFrame from image filenames
    image_files = []
    for root, dirs, files in os.walk(ISIC):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_files.append(os.path.join(root, file))

    if image_files:
        df = pd.DataFrame({'image_path': image_files[:100]})  # Sample first 100 images
        print(f"Created DataFrame from image files: {df.shape}")
        print(df.head())

Found CSV files:
No CSV files found. This might be an image-only dataset.
Created DataFrame from image files: (100, 1)
                                          image_path
0  /kaggle/input/skin-cancer9-classesisic/Skin ca...
1  /kaggle/input/skin-cancer9-classesisic/Skin ca...
2  /kaggle/input/skin-cancer9-classesisic/Skin ca...
3  /kaggle/input/skin-cancer9-classesisic/Skin ca...
4  /kaggle/input/skin-cancer9-classesisic/Skin ca...


In [None]:
# If you have a specific CSV file you want to load, you can do:
# df = pd.read_csv(os.path.join(ISIC, 'specific_file.csv'))

# For image classification datasets, you might also want to:
# 1. Extract class labels from folder names or file paths
# 2. Create image loading functions
# 3. Prepare data for machine learning models

# Example: If images are organized in class folders
def create_image_dataframe(dataset_path):
    """Create a DataFrame with image paths and labels from folder structure"""
    data = []

    for root, dirs, files in os.walk(dataset_path):
        # Skip the root directory
        if root == dataset_path:
            continue

        # Get class name from folder name
        class_name = os.path.basename(root)

        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                full_path = os.path.join(root, file)
                data.append({
                    'image_path': full_path,
                    'class': class_name,
                    'filename': file
                })

    return pd.DataFrame(data)

# Try to create DataFrame from folder structure
try:
    image_df = create_image_dataframe(ISIC)
    if not image_df.empty:
        pass
        # print(f"Created image DataFrame: {image_df.shape}")
        # print(f"Classes found: {image_df['class'].unique()}")
        # print("\nSample data:")
        # print(image_df.head())

        # # Show class distribution
        # print("\nClass distribution:")
        # print(image_df['class'].value_counts())
except Exception as e:
    print(f"Could not create image DataFrame: {e}")

In [None]:
X = image_df['image_path']
y = image_df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

print("Training set size:", len(X_train))
print("Testing set size:", len(X_test))
"""
num_random_train_samples = 300  # Adjust this number as needed
X_train = X_train.sample(n=num_random_train_samples, random_state=42)
# Get the corresponding labels for the sampled images
y_train = y_train.loc[X_train.index]

num_random_test_samples = 100  # Adjust this number as needed
X_test = X_test.sample(n=num_random_test_samples, random_state=42)
# Get the corresponding labels for the sampled images
y_test = y_test.loc[X_test.index]
"""


Training set size: 1649
Testing set size: 708


'\nnum_random_train_samples = 300  # Adjust this number as needed\nX_train = X_train.sample(n=num_random_train_samples, random_state=42)\n# Get the corresponding labels for the sampled images\ny_train = y_train.loc[X_train.index]\n\nnum_random_test_samples = 100  # Adjust this number as needed\nX_test = X_test.sample(n=num_random_test_samples, random_state=42)\n# Get the corresponding labels for the sampled images\ny_test = y_test.loc[X_test.index]\n'

In [None]:
# Define the target size for resizing images
target_size = (128, 128)

def load_and_flatten_image(image_path, target_size):
    """Loads an image, resizes it, and flattens it into a 1D array."""
    try:
        img = Image.open(image_path).convert('RGB') # Ensure image is in RGB format
        img = img.resize(target_size)
        img_array = np.array(img)
        # Flatten the array: (height, width, channels) -> (height * width * channels,)
        flattened_array = img_array.flatten()
        return flattened_array
    except Exception as e:
        print(f"Error loading or processing image {image_path}: {e}")
        return None # Return None if there's an error

# Apply the function to your training and testing image paths
# This might take some time depending on the number of images
print("Processing training images...")
X_train_processed = np.array([load_and_flatten_image(path, target_size) for path in X_train if load_and_flatten_image(path, target_size) is not None])

print("Processing testing images...")
X_test_processed = np.array([load_and_flatten_image(path, target_size) for path in X_test if load_and_flatten_image(path, target_size) is not None])


Processing training images...
Processing testing images...


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
pipeline = Pipeline([
('scaler', StandardScaler()),
('mlp', MLPClassifier(hidden_layer_sizes=(100,), activation='relu',
solver='adam', max_iter=20000))
])
#Fit
pipeline.fit(X_train_processed, y_train)
#Get predictions
y_pred = pipeline.predict(X_test_processed)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming y_test contains the true labels for the test set
# and y_pred contains the predictions from your model on the test set

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Generate a classification report for more detailed metrics (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.4308

Classification Report:
                            precision    recall  f1-score   support

         actinic keratosis       0.03      0.03      0.03        39
      basal cell carcinoma       0.48      0.61      0.54       118
            dermatofibroma       0.31      0.33      0.32        33
                  melanoma       0.48      0.41      0.44       136
                     nevus       0.50      0.37      0.42       112
pigmented benign keratosis       0.54      0.57      0.55       144
      seborrheic keratosis       0.10      0.12      0.11        24
   squamous cell carcinoma       0.37      0.32      0.35        59
           vascular lesion       0.35      0.47      0.40        43

                  accuracy                           0.43       708
                 macro avg       0.35      0.36      0.35       708
              weighted avg       0.43      0.43      0.43       708



# SIIM ISIC Data

In [2]:
# Download the ISIC 2020 resized dataset (256x256)
ISIC_2020 = kagglehub.dataset_download("nischaydnk/isic-2020-jpg-256x256-resized")
print("ISIC 2020 (256x256) dataset downloaded to:", ISIC_2020)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nischaydnk/isic-2020-jpg-256x256-resized?dataset_version_number=1...


100%|██████████| 595M/595M [00:04<00:00, 140MB/s]

Extracting files...





ISIC 2020 (256x256) dataset downloaded to: /root/.cache/kagglehub/datasets/nischaydnk/isic-2020-jpg-256x256-resized/versions/1


In [3]:
# Explore the downloaded dataset structure
print("Dataset location:", ISIC_2020)
print("\nFiles and folders in the dataset:")
for root, dirs, files in os.walk(ISIC_2020):
    level = root.replace(ISIC_2020, '').count(os.sep)
    indent = ' ' * 2 * level
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 2 * (level + 1)
    for file in files[:5]:  # Show first 5 files in each directory
        print(f"{subindent}{file}")
    if len(files) > 5:
        print(f"{subindent}... and {len(files)-5} more files")

Dataset location: /root/.cache/kagglehub/datasets/nischaydnk/isic-2020-jpg-256x256-resized/versions/1

Files and folders in the dataset:
1/
  train-metadata.csv
  .ipynb_checkpoints/
    dataset-metadata-checkpoint.json
  train-image/
    image/
      ISIC_9023880.jpg
      ISIC_4610204.jpg
      ISIC_7544537.jpg
      ISIC_3114066.jpg
      ISIC_2811348.jpg
      ... and 33121 more files


In [4]:
csv_files = []
for root, dirs, files in os.walk(ISIC_2020):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))

print("Found CSV files:")
for csv_file in csv_files:
    print(f"- {csv_file}")

# Load the main dataset CSV (usually contains metadata)
if csv_files:
    # Load the first CSV file found
    df = pd.read_csv(csv_files[0])
    print(f"\nLoaded dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst few rows:")
    print(df.head())
else:
    print("No CSV files found. This might be an image-only dataset.")
    # In this case, you might need to create a DataFrame from image filenames
    image_files = []
    for root, dirs, files in os.walk(ISIC):
        for file in files:
            if file.lower().endswith(('.jpg', '.jpeg', '.png')):
                image_files.append(os.path.join(root, file))

    if image_files:
        df = pd.DataFrame({'image_path': image_files[:100]})  # Sample first 100 images
        print(f"Created DataFrame from image files: {df.shape}")
        print(df.head())

Found CSV files:
- /root/.cache/kagglehub/datasets/nischaydnk/isic-2020-jpg-256x256-resized/versions/1/train-metadata.csv

Loaded dataset shape: (33126, 4)
Columns: ['Unnamed: 0', 'isic_id', 'patient_id', 'target']

First few rows:
   Unnamed: 0       isic_id  patient_id  target
0           0  ISIC_2637011  IP_7279968       0
1           1  ISIC_0015719  IP_3075186       0
2           2  ISIC_0052212  IP_2842074       0
3           3  ISIC_0068279  IP_6890425       0
4           4  ISIC_0074268  IP_8723313       0


EfficientNetB0 CNN

In [10]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, Model
from tensorflow.keras.optimizers import Adam

# Set random seed for reproducibility
tf.random.set_seed(42)

# Create image data generators with augmentation for training
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=0.2 # Added validation_split here
)

# Create a DataFrame with image paths and labels
def create_image_dataframe_siim(dataset_path):
    data = []
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            if file.lower().endswith('.jpg'):
                full_path = os.path.join(root, file)
                # Extract label from the CSV if available, or use binary classification
                # For this example, we'll use binary classification (melanoma vs. non-melanoma)
                data.append({
                    'image_path': full_path,
                    'filename': file
                })
    return pd.DataFrame(data)

# Create the DataFrame
siim_df = create_image_dataframe_siim(ISIC_2020)
print(f"Total images found: {len(siim_df)}")

Total images found: 33126


In [6]:
# Create EfficientNet model
def create_model(num_classes=1):  # Using 1 for binary classification
    # Load EfficientNetB0 with pre-trained weights
    base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

    # Freeze the base model
    base_model.trainable = False

    # Create new model on top
    inputs = layers.Input(shape=(256, 256, 3))
    x = base_model(inputs, training=False)
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(num_classes, activation='sigmoid')(x)

    model = Model(inputs, outputs)

    return model

# Create the model
model = create_model()

# Compile the model
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC()]
)

# Print model summary
model.summary()

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [None]:
# Set up batch size and image dimensions
BATCH_SIZE = 32
IMG_SIZE = (256, 256)

# Merge the target column from df into siim_df
# Assuming 'isic_id' in df corresponds to the filename (without extension) in siim_df
# and 'target' is the column with labels in df
siim_df['isic_id'] = siim_df['filename'].apply(lambda x: os.path.splitext(x)[0])
siim_df = pd.merge(siim_df, df[['isic_id', 'target']], on='isic_id', how='left')

# Drop the temporary 'isic_id' column from siim_df if not needed later
siim_df = siim_df.drop(columns=['isic_id'])

# Convert the target column to string type for flow_from_dataframe with class_mode='binary'
siim_df['target'] = siim_df['target'].astype(str)

# Split the data into training and validation sets
train_df = siim_df.sample(frac=0.8, random_state=42)
val_df = siim_df.drop(train_df.index)

# Create data generators
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,
    x_col='image_path',
    y_col='target',  # Use the 'target' column as the label
    class_mode='binary',  # For binary classification
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    subset='training'
)

validation_generator = train_datagen.flow_from_dataframe(
    dataframe=val_df,
    directory=None,
    x_col='image_path',
    y_col='target', # Use the 'target' column as the label
    class_mode='binary',
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    subset='validation'
)

# Train the model
history = model.fit(
    train_generator,
    epochs=10,  # Adjust number of epochs as needed
    validation_data=validation_generator,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        )
    ]
)

Found 21201 validated image filenames belonging to 2 classes.
Found 1325 validated image filenames belonging to 2 classes.


  self._warn_if_super_not_called()


Epoch 1/10
[1m 37/663[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m40:33[0m 4s/step - accuracy: 0.9082 - auc: 0.4132 - loss: 0.2244

In [None]:
# Plot training history
plt.figure(figsize=(12, 4))

# Plot training & validation accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot training & validation loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(history.history['auc'])
plt.plot(history.history['val_auc'])
plt.title('Model AUC')
plt.ylabel('AUC')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='lower right')
plt.show()