In [2]:
!pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (19 kB)
Downloading opencv_python-4.12.0.88-cp37-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.12.0.88


In [3]:
# Standard library imports
import os
import sys
import json
import pickle
import random
import re
from glob import glob
from pathlib import Path

# Third-party library imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import cv2
from tqdm import tqdm
import scipy

# PyTorch imports
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# torchvision imports
import torchvision
from torchvision import transforms
from torchvision.transforms import v2
from torchvision.datasets import VisionDataset
import torchvision.transforms as T

# PIL imports
from PIL import Image

import torch.nn as nn
import torch.optim as optim
from torchvision import models
from torch.utils.data import Subset, DataLoader
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score
import scipy.stats as stats

In [5]:
# Colab-only: mount Google Drive at /content/drive. Later cells read the project
# files from /content/drive/MyDrive/dermoscopic_artifacts.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Project folder on the mounted Drive, used both as working directory and import root.
PROJECT_DIR = '/content/drive/MyDrive/dermoscopic_artifacts'

# Work from the project folder so relative paths resolve against it.
os.chdir(PROJECT_DIR)

# Register the folder for imports only once: re-running this cell must not keep
# appending duplicate entries to sys.path.
if PROJECT_DIR not in sys.path:
    sys.path.append(PROJECT_DIR)

# NOTE(review): the folder is appended LAST on sys.path, so an installed package that
# is also called `datasets` would win over the project module -- confirm that
# `datasets.__file__` points into PROJECT_DIR.
import importlib
import datasets

# Reload so that edits to the local `datasets` module are picked up without
# restarting the kernel.
importlib.reload(datasets)
from datasets import ISICDataset

In [7]:
# Read isic_bias.csv from the project folder: fields are ';'-separated and the
# first column becomes the DataFrame index.
df = pd.read_csv(
    "/content/drive/MyDrive/dermoscopic_artifacts/isic_bias.csv",
    sep=';',
    index_col=0,
)

In [31]:
# Fetch the ISIC 2018 Task 1 segmentation dataset through kagglehub.
# `path` receives the local folder that dataset_download returns.
import kagglehub
path = kagglehub.dataset_download("tschandl/isic2018-challenge-task1-data-segmentation")

# Show where the files ended up so the location is visible in the notebook output.
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'isic2018-challenge-task1-data-segmentation' dataset.
Path to dataset files: /kaggle/input/isic2018-challenge-task1-data-segmentation


In [33]:
# Dataset folders, derived from the location kagglehub reported (`path`, set in the
# download cell) instead of a hard-coded absolute path. On the Colab cache this is
# /kaggle/input/isic2018-challenge-task1-data-segmentation, so the resulting
# strings are unchanged there, and the cell keeps working when kagglehub stores
# the download somewhere else.
image_dir = os.path.join(path, "ISIC2018_Task1-2_Training_Input")
mask_dir = os.path.join(path, "ISIC2018_Task1_Training_GroundTruth")

# Names of the dataset modes. This cell only lists them; each name is presumably a
# valid `mode=` argument for ISICDataset -- verify against datasets.py.
dataset_modes = ["whole", "lesion", "background", "bbox", "bbox70", "bbox90",
                 "high_whole", "low_whole", "high_lesion", "low_lesion", "high_background", "low_background"]


In [24]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Mini-batch size shared by the loaders that use this setting.
batch_size = 32

In [19]:
from sklearn.model_selection import KFold

# Number of cross-validation folds
num_splits = 5
# Fixed random_state so the same rows land in the same fold on every run.
kf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

# Add one column per fold ("split_1" ... "split_<num_splits>") marking each row as
# "train" or "test" for that fold.
# kf.split() yields POSITIONAL row numbers, while df was read with index_col=0 and
# may carry arbitrary index labels. The labels are therefore written positionally
# into an array instead of through label-based df.loc[test_idx, ...].
for i, (_, test_idx) in enumerate(kf.split(df), 1):
    fold_assignment = np.full(len(df), "train", dtype=object)
    fold_assignment[test_idx] = "test"
    df[f"split_{i}"] = fold_assignment

In [20]:
# "low_background" dataset built with return_pil=True and no transform
# (presumably yields PIL images for visual inspection -- confirm in datasets.py).
dataset = ISICDataset(
    df,
    image_dir,
    mask_dir,
    mode="low_background",
    return_pil=True,
)

In [21]:
# Preprocessing pipeline applied to every image: resize to 224x224, convert to a
# tensor, then normalise each channel. The mean/std values are the per-channel
# statistics conventionally used with ImageNet-pretrained torchvision models.
resize_step = transforms.Resize((224, 224))
to_tensor_step = transforms.ToTensor()
normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
transform = transforms.Compose([resize_step, to_tensor_step, normalize_step])

In [27]:
# Shuffled loader over the whole "low_background" dataset, 16 samples per batch.
# No transform is passed here, so samples come out however ISICDataset produces
# them by default.
low_background_dataset = ISICDataset(df, image_dir, mask_dir, mode="low_background")
dataloader = DataLoader(
    low_background_dataset,
    shuffle=True,
    batch_size=16,
)

In [39]:
dataset_modes = ["low_background", "high_lesion", "whole"]  # Change this as needed

# For every dataset mode: evaluate the saved ResNet-50 classifier of each
# cross-validation split on that split's held-out rows, print per-split and mean
# metrics, and pickle metrics / predictions / labels next to the checkpoints.
for dataset_mode in dataset_modes:
    # Per-split metric values for this mode, appended in split order
    all_metrics = {
        "AUROC": [],
        "Accuracy": [],
        "Recall": [],
        "Precision": []
    }

    # Directory containing saved models; the result pickles are written here too
    save_dir = f"/content/drive/MyDrive/dermoscopic_artifacts/classifiers/{dataset_mode}"

    # Load dataset (tensor output with the shared preprocessing `transform`)
    full_dataset = ISICDataset(df, image_dir, mask_dir, transform=transform, mode=dataset_mode, return_pil=False)

    # Raw predictions and ground-truth labels, keyed by split number
    store_preds = {}
    store_labels = {}

    # Loop through each split (columns split_1 ... split_<num_splits> on df)
    for split in range(1, num_splits + 1):
        print(f"Evaluating {dataset_mode} - Split {split}")

        # Subset hands these values straight to full_dataset[...], which is
        # presumably positional (the same convention a plain DataLoader uses), so
        # take row POSITIONS of the test rows rather than df index labels.
        # The two coincide only when df has a 0..n-1 index.
        is_test_row = (df[f"split_{split}"] == "test").to_numpy()
        test_indices = np.flatnonzero(is_test_row).tolist()

        # Create test dataset and DataLoader (no shuffling: order is irrelevant
        # for the metrics and keeps stored predictions reproducible)
        test_dataset = Subset(full_dataset, test_indices)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

        # Rebuild the architecture the checkpoint was saved from: ResNet-50 with a
        # single-unit sigmoid head. `weights=None` replaces the deprecated
        # `pretrained=False`; the weights come from the checkpoint below.
        model = models.resnet50(weights=None)
        num_features = model.fc.in_features
        model.fc = nn.Sequential(
            nn.Linear(num_features, 1),
            nn.Sigmoid()
        )

        # map_location lets a checkpoint saved from a GPU load on a CPU-only runtime.
        state_dict = torch.load(f"{save_dir}/resnet50_split_{split}.pth", map_location=device)
        model.load_state_dict(state_dict)
        model = model.to(device)
        model.eval()

        # Lists to store predictions and labels
        all_preds = []
        all_labels = []

        # Evaluation loop (no gradients needed for inference)
        with torch.no_grad():
            for images, labels in tqdm(test_loader, desc=f"Evaluating Split {split}"):
                images = images.to(device)

                # Sigmoid outputs, flattened to one score per sample
                outputs = model(images).cpu().numpy()
                all_preds.extend(outputs.flatten())
                all_labels.extend(labels.cpu().numpy())

        # Convert lists to NumPy arrays
        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)

        # AUROC uses the raw scores; the other metrics use scores thresholded at 0.5
        pred_classes = all_preds >= 0.5
        auroc = roc_auc_score(all_labels, all_preds)
        acc = accuracy_score(all_labels, pred_classes)
        recall = recall_score(all_labels, pred_classes)
        precision = precision_score(all_labels, pred_classes)

        # Store metrics
        all_metrics["AUROC"].append(auroc)
        all_metrics["Accuracy"].append(acc)
        all_metrics["Recall"].append(recall)
        all_metrics["Precision"].append(precision)

        store_preds[split] = all_preds
        store_labels[split] = all_labels

        print(f"Split {split} - AUROC: {auroc:.4f}, Accuracy: {acc:.4f}, Recall: {recall:.4f}, Precision: {precision:.4f}")

    # Compute mean metrics across splits
    mean_metrics = {metric: np.mean(values) for metric, values in all_metrics.items()}

    # Print final results
    print(f"\n===== Final ISIC Evaluation Results for {dataset_mode} =====")
    for metric, mean_value in mean_metrics.items():
        print(f"Mean {metric}: {mean_value:.4f}")
    print("\n\n")

    # Persist results next to the checkpoints; save_dir is already mode-specific,
    # so the un-suffixed prediction/label files do not collide across modes.
    with open(f"{save_dir}/all_metrics_isic_{dataset_mode}.pkl", "wb") as f:
        pickle.dump(all_metrics, f)
    with open(f"{save_dir}/store_preds_isic.pkl", "wb") as f:
        pickle.dump(store_preds, f)
    with open(f"{save_dir}/store_labels_isic.pkl", "wb") as f:
        pickle.dump(store_labels, f)

Evaluating low_background - Split 1


Evaluating Split 1: 100%|██████████| 17/17 [06:34<00:00, 23.23s/it]


Split 1 - AUROC: 0.7937, Accuracy: 0.7765, Recall: 0.5905, Precision: 0.4593
Evaluating low_background - Split 2


Evaluating Split 2: 100%|██████████| 17/17 [06:14<00:00, 22.02s/it]


Split 2 - AUROC: 0.7025, Accuracy: 0.8054, Recall: 0.2475, Precision: 0.5000
Evaluating low_background - Split 3


Evaluating Split 3: 100%|██████████| 17/17 [06:21<00:00, 22.42s/it]


Split 3 - AUROC: 0.6863, Accuracy: 0.8227, Recall: 0.2200, Precision: 0.6111
Evaluating low_background - Split 4


Evaluating Split 4: 100%|██████████| 17/17 [06:15<00:00, 22.11s/it]


Split 4 - AUROC: 0.6963, Accuracy: 0.7649, Recall: 0.2437, Precision: 0.4754
Evaluating low_background - Split 5


Evaluating Split 5: 100%|██████████| 17/17 [06:17<00:00, 22.23s/it]


Split 5 - AUROC: 0.7866, Accuracy: 0.8166, Recall: 0.4043, Precision: 0.4935

===== Final ISIC Evaluation Results for low_background =====
Mean AUROC: 0.7331
Mean Accuracy: 0.7972
Mean Recall: 0.3412
Mean Precision: 0.5079



Evaluating high_lesion - Split 1


Evaluating Split 1: 100%|██████████| 17/17 [04:57<00:00, 17.48s/it]


Split 1 - AUROC: 0.7340, Accuracy: 0.7206, Recall: 0.5905, Precision: 0.3780
Evaluating high_lesion - Split 2


Evaluating Split 2: 100%|██████████| 17/17 [04:42<00:00, 16.64s/it]


Split 2 - AUROC: 0.6335, Accuracy: 0.7803, Recall: 0.2871, Precision: 0.4085
Evaluating high_lesion - Split 3


Evaluating Split 3: 100%|██████████| 17/17 [04:46<00:00, 16.84s/it]


Split 3 - AUROC: 0.6919, Accuracy: 0.7842, Recall: 0.2500, Precision: 0.4032
Evaluating high_lesion - Split 4


Evaluating Split 4: 100%|██████████| 17/17 [04:46<00:00, 16.88s/it]


Split 4 - AUROC: 0.7439, Accuracy: 0.7977, Recall: 0.2941, Precision: 0.6250
Evaluating high_lesion - Split 5


Evaluating Split 5: 100%|██████████| 17/17 [04:46<00:00, 16.86s/it]


Split 5 - AUROC: 0.7233, Accuracy: 0.7954, Recall: 0.4149, Precision: 0.4333

===== Final ISIC Evaluation Results for high_lesion =====
Mean AUROC: 0.7053
Mean Accuracy: 0.7756
Mean Recall: 0.3673
Mean Precision: 0.4496



Evaluating whole - Split 1


Evaluating Split 1: 100%|██████████| 17/17 [01:30<00:00,  5.32s/it]


Split 1 - AUROC: 0.8614, Accuracy: 0.8536, Recall: 0.5238, Precision: 0.6790
Evaluating whole - Split 2


Evaluating Split 2: 100%|██████████| 17/17 [01:26<00:00,  5.06s/it]


Split 2 - AUROC: 0.8160, Accuracy: 0.8362, Recall: 0.5347, Precision: 0.5870
Evaluating whole - Split 3


Evaluating Split 3: 100%|██████████| 17/17 [01:27<00:00,  5.13s/it]


Split 3 - AUROC: 0.7687, Accuracy: 0.8227, Recall: 0.4300, Precision: 0.5513
Evaluating whole - Split 4


Evaluating Split 4: 100%|██████████| 17/17 [01:26<00:00,  5.11s/it]


Split 4 - AUROC: 0.7663, Accuracy: 0.7996, Recall: 0.2437, Precision: 0.6744
Evaluating whole - Split 5


Evaluating Split 5: 100%|██████████| 17/17 [01:27<00:00,  5.17s/it]

Split 5 - AUROC: 0.7961, Accuracy: 0.8301, Recall: 0.2872, Precision: 0.5625

===== Final ISIC Evaluation Results for whole =====
Mean AUROC: 0.8017
Mean Accuracy: 0.8285
Mean Recall: 0.4039
Mean Precision: 0.6108








In [40]:
dataset_modes = ["whole", "high_lesion", "low_background"]

# mode name -> {metric name -> mean over splits}
mean_results = {}

# Reload the per-split metrics pickled for each mode and average every metric
for mode in dataset_modes:
    save_dir = f"/content/drive/MyDrive/dermoscopic_artifacts/classifiers/{mode}"
    with open(f"{save_dir}/all_metrics_isic_{mode}.pkl", "rb") as metrics_file:
        results_dict = pickle.load(metrics_file)

    # One mean value per metric for this mode
    mode_means = {}
    for metric, values in results_dict.items():
        mode_means[metric] = np.mean(values)
    mean_results[mode] = mode_means

# One row per mode, one column per metric
df_mean_results = pd.DataFrame.from_dict(mean_results, orient="index")

In [41]:
# Display the summary table: one row per dataset mode, one column per mean metric.
df_mean_results

Unnamed: 0,AUROC,Accuracy,Recall,Precision
whole,0.801705,0.828451,0.403879,0.610834
high_lesion,0.705306,0.775644,0.367323,0.449612
low_background,0.733076,0.797232,0.341191,0.507857
