#Data download + PNG resize (CPU)

**Load the dataset ISIC(collection 66, representative of the training set of task 3) into the *results* variable** , ~10k entries

In [102]:
import requests

# Base endpoint
base_url = "https://api.isic-archive.com/api/v2/images/search/"
params = {
    "collections": "66,67,73"
}

# Loop until no more pages
all_ids = []
results = []

page = 1

while True:
   # print(f"Requesting page {page}...")
    response = requests.get(base_url, params=params)
    data = response.json()

    # Extract image IDs
    for result in data.get("results", []):
        all_ids.append(result["isic_id"])
        results.append(result)

    # Print first ID for confirmation
    #print("image ID:", all_ids[(page-1)*100])

    # Prepare for next page
    next_cursor = data.get("next")
    if not next_cursor:
        break  # we're done!

    # Update URL for the next request
    base_url = next_cursor
    params = {}  # cursor URL already includes params
    page += 1

print(f"Total images collected: {len(all_ids)}")


Total images collected: 10015


**Analysing the data**

In [62]:
results[0]

{'isic_id': 'ISIC_0034320',
 'copyright_license': 'CC-BY-NC',
 'attribution': 'MILK study team',
 'files': {'full': {'url': 'https://content.isic-archive.com/f059014d-7830-4b6c-abce-bb54a9bec0ff/26909bd2-4c06-4256-9d13-1cfcce053f5a.jpg?Expires=1746662400&Signature=OFvbE0KcA46q3voKplhcpQBWIQVnNV3EX518lgOtSU0R6cPgoWxSiTl4yLuPv99LEQUjsgkOa8YsqyxY5FVbAej7j7gCxNgURTh-bMLaqLRnFUz0UQ~CWb8fTu6av~~cuBv-1QR25QPIreRd9LJ58haIEU9-TDTMa-aPH39JiK9I240vce6h9EgcDmEH0SVCg~c~WBVoszziRKTUYgR3xMkNQ64I4XR5nnzebMWevqSP8VxpEhcjKj9MAA5S~rqSdmumAn4ryQWRRdRKaUJt68q-E8zAsgNVOGBOcv9hvEDxeECjpP8VCreaIYkqfJbv8TiiLStjXQn5tfDr5BTjiw__&Key-Pair-Id=K3KFHCM130RXTL',
   'size': 32464},
  'thumbnail_256': {'url': 'https://content.isic-archive.com/99b7ffcb-fe3d-49f8-adfc-876c1470898a/ISIC_0034320_thumbnail_256.jpg?Expires=1746662400&Signature=enQ7dcdT1EzdlPUelcf4ITiQKEbEupVFUR6dAntn7sAS2gI4qoGCyBOTi1guUyKJK9IWmx~D01iLL8ZfP8RcpKkogtzgoh-DSX0GaR~g7fgd2XjY~01ilotr9xgxe~1OlBCwVC6~Q6y1EXHyW44cfP~1WiwHW1FPtT6gmNj9ebmpmAsKb3~CKBHM

**We will confirm if every image is in the correct size (600x450)**

In [61]:
for result in results:
    try:
        pixels_x = result["metadata"]["acquisition"]["pixels_x"]
        pixels_y = result["metadata"]["acquisition"]["pixels_y"]
        if(pixels_x != 600 and pixels_y != 450):
          print("Incorrect Image ID:", result["isic_id"])
          print("pixel_X:", pixels_x, "pixel_Y:", pixels_y)
    except KeyError:
        print("Missing pixel data in one of the results.")


Since no image ID was printed, we can assume all the images have the correct size

**Download the photos into ISIC_IMAGES_TASK_3 folder**

In [63]:
import os

# Make sure the folder exists
os.makedirs("ISIC_IMAGES_TASK_3", exist_ok=True)

# Loop through all results
for result in results:
    try:
        isic_id = result["isic_id"]
        image_url = result["files"]["full"]["url"]
        file_path = os.path.join("ISIC_IMAGES_TASK_3", f"{isic_id}.jpg")

        print(f"Downloading {isic_id}...")

        # Download and save the image
        response = requests.get(image_url)
        if response.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(response.content)
        else:
            print(f"Failed to download {isic_id}: HTTP {response.status_code}")
    except KeyError as e:
        print(f"Missing key {e} in one of the results.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Downloading ISIC_0029305...
Downloading ISIC_0029304...
Downloading ISIC_0029303...
Downloading ISIC_0029302...
Downloading ISIC_0029301...
Downloading ISIC_0029300...
Downloading ISIC_0029299...
Downloading ISIC_0029298...
Downloading ISIC_0029297...
Downloading ISIC_0029296...
Downloading ISIC_0029295...
Downloading ISIC_0029294...
Downloading ISIC_0029293...
Downloading ISIC_0029292...
Downloading ISIC_0029291...
Downloading ISIC_0029290...
Downloading ISIC_0029289...
Downloading ISIC_0029288...
Downloading ISIC_0029287...
Downloading ISIC_0029286...
Downloading ISIC_0029285...
Downloading ISIC_0029284...
Downloading ISIC_0029283...
Downloading ISIC_0029282...
Downloading ISIC_0029281...
Downloading ISIC_0029280...
Downloading ISIC_0029279...
Downloading ISIC_0029278...
Downloading ISIC_0029277...
Downloading ISIC_0029276...
Downloading ISIC_0029275...
Downloading ISIC_0029274...
Downloading ISIC_0029273...
Downloading

In [69]:
folder = "ISIC_IMAGES_TASK_3"
num_files = len([f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))])

print("Expected 10015")
print(f"Got: {num_files}")


Expected 10015
Got: 10015


Explore how to identify a picture as benign or malignant.

First the field "benign_malignant" seems apropriate to do this testing,so we will see if all the entries of the dataset have this value

In [84]:
for entry in results:
    isic_id = entry.get("isic_id", "UNKNOWN_ID")
    clinical = entry.get("metadata", {}).get("clinical", {})

    missing = []
    if "lesion_id" not in clinical:
        missing.append("lesion_id")
    if "benign_malignant" not in clinical:
        missing.append("benign_malignant")

    if missing:
        print(f"{isic_id} is missing: {', '.join(missing)}")


ISIC_0034318 is missing: benign_malignant
ISIC_0034315 is missing: benign_malignant
ISIC_0034306 is missing: benign_malignant
ISIC_0034303 is missing: benign_malignant
ISIC_0034299 is missing: benign_malignant
ISIC_0034296 is missing: benign_malignant
ISIC_0034291 is missing: benign_malignant
ISIC_0034283 is missing: benign_malignant
ISIC_0034280 is missing: benign_malignant
ISIC_0034276 is missing: benign_malignant
ISIC_0034259 is missing: benign_malignant
ISIC_0034255 is missing: benign_malignant
ISIC_0034252 is missing: benign_malignant
ISIC_0034235 is missing: benign_malignant
ISIC_0034223 is missing: benign_malignant
ISIC_0034221 is missing: benign_malignant
ISIC_0034214 is missing: benign_malignant
ISIC_0034201 is missing: benign_malignant
ISIC_0034197 is missing: benign_malignant
ISIC_0034196 is missing: benign_malignant
ISIC_0034189 is missing: benign_malignant
ISIC_0034186 is missing: benign_malignant
ISIC_0034175 is missing: benign_malignant
ISIC_0034169 is missing: benign_ma

Conclusion:Some of theme do not have this attribute so we need to find another for theme.The attribute diagnosis_1 seems good for this task, we will see wich values can it be.

In [90]:
missing_diagnosis_1_values = set()

for entry in results:
    clinical = entry.get("metadata", {}).get("clinical", {})

    if "benign_malignant" not in clinical:
        diag1 = clinical.get("diagnosis_1")
        if diag1:
            missing_diagnosis_1_values.add(diag1)

print(" Unique 'diagnosis_1' values for entries missing 'benign_malignant':")
for value in sorted(missing_diagnosis_1_values):
    print("-", value)


 Unique 'diagnosis_1' values for entries missing 'benign_malignant':
- Benign
- Indeterminate
- Malignant


We can conclude that if they dont have the benign_malignant atribute, we can use diagnosis_1 since it provides information about the cancer.
diagnosis_1 can take the value "Indeterminate", we will discard these values since they do not identify the disease.

Put into the file **lesions.csv** with:

*   file -> isic_id.jpg
*   patient -> lesion_id
*   label
      - 0 ("benign_malignant": "benign")
      - 1 ("benign_malignant": "malignant")








In [89]:
import pandas as pd

rows = []

for result in results:
    try:
        isic_id = result["isic_id"]
        filename = f"{isic_id}.jpg"
        clinical = result["metadata"]["clinical"]
        patient = clinical.get("lesion_id", "unknown")

        benign_malignant = clinical.get("benign_malignant")
        diagnosis_1 = clinical.get("diagnosis_1", "")

        # Decide label based on available info
        if benign_malignant:
            label = 1 if benign_malignant.lower() == "malignant" else 0
        elif diagnosis_1 == "Benign":
            label = 0
        elif diagnosis_1 == "Malignant":
            label = 1
        else:  # Indeterminate or unknown
            continue  # ❌ Skip

        rows.append({
            "file": filename,
            "patient": patient,
            "label": label
        })

    except Exception as e:
        print(f"Skipped entry {result.get('isic_id', 'UNKNOWN')} due to error: {e}")

# Create DataFrame and write to CSV
df = pd.DataFrame(rows)
df.to_csv("lesions.csv", index=False)

print("Saved lesions.csv with", len(df), "entries.")


Saved lesions.csv with 9885 entries.


**Load dataset kaggle**

In [99]:
import kagglehub
import os
import shutil
from tqdm import tqdm

# Step 1: Download the dataset
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("✅ Dataset downloaded to:", path)

# Step 2: Create destination folder
dst_dir = "KAGGLE_IMAGES_ham10000"
os.makedirs(dst_dir, exist_ok=True)

# Step 3: Walk the directory and copy .jpg files
count = 0
for root, _, files in os.walk(path):
    for file in files:
        if file.lower().endswith(".jpg"):
            src_path = os.path.join(root, file)
            dst_path = os.path.join(dst_dir, file)
            shutil.copy2(src_path, dst_path)
            count += 1

print(f"Copied {count} .jpg files into '{dst_dir}'")
meta_src = os.path.join(path, "HAM10000_metadata.csv")
meta_dst = os.path.join(dst_dir, "HAM10000_metadata.csv")
shutil.copy2(meta_src, meta_dst)
print("Copied metadata file to:", dst_dir)

✅ Dataset downloaded to: /kaggle/input/skin-cancer-mnist-ham10000
Copied 20030 .jpg files into 'KAGGLE_IMAGES_ham10000'
Copied metadata file to: KAGGLE_IMAGES_ham10000


Upon further investigation, we identified an error in our dataset selection. The HAM10000 dataset completely overlaps with the ISIC 2018 Task 3 dataset, as HAM10000 images are included within ISIC. Therefore, we have decided to exclude HAM10000 from our analysis.

The code below demonstrates this overlap by comparing the image IDs previously retrieved from ISIC with those found in the HAM10000 metadata.

In [98]:
metadata_path = "KAGGLE_IMAGES_ham10000/HAM10000_metadata.csv"
df = pd.read_csv(metadata_path)

metadata_ids = set(df["image_id"])

all_ids_set = set(all_ids)

missing_ids = metadata_ids - all_ids_set

print(f"Found {len(missing_ids)} image_ids in HAM10000 metadata that are not in all_ids:")
for mid in sorted(missing_ids):
    print("-", mid)


Found 0 image_ids in HAM10000 metadata that are not in all_ids:


Split into 70/15/15

In [104]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your lesions.csv
df = pd.read_csv("lesions.csv")

# Step 1: Get unique patients
unique_patients = df["patient"].unique()

# Step 2: Split patients → 70% train, 30% temp (val+test)
train_patients, temp_patients = train_test_split(
    unique_patients, test_size=0.30, random_state=42
)

# Step 3: Split temp → 50/50 into val and test (15% each)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.50, random_state=42
)

# Step 4: Create splits by filtering on patient ID
train_df = df[df["patient"].isin(train_patients)].reset_index(drop=True)
val_df = df[df["patient"].isin(val_patients)].reset_index(drop=True)
test_df = df[df["patient"].isin(test_patients)].reset_index(drop=True)

# Step 5: Save them to CSV (optional)
train_df.to_csv("lesions_train.csv", index=False)
val_df.to_csv("lesions_val.csv", index=False)
test_df.to_csv("lesions_test.csv", index=False)

# Summary
print(f"Train: {len(train_df)} samples")
print(f"Val:   {len(val_df)} samples")
print(f"Test:  {len(test_df)} samples")


Train: 6881 samples
Val:   1500 samples
Test:  1504 samples


In [105]:
import os
from PIL import Image
import torch
from torchvision import transforms
from tqdm import tqdm

# Define training transform
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(0.1, 0.1, 0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# Folder where your images are stored
image_dir = "ISIC_IMAGES_TASK_3"

# Apply transform to each image in train_df
transformed_images = []

for _, row in tqdm(train_df.iterrows(), total=len(train_df)):
    file_name = row["file"]
    image_path = os.path.join(image_dir, file_name)

    try:
        # Open image
        image = Image.open(image_path).convert("RGB")

        # Apply transform
        transformed = train_transform(image)

        # Store or use for training
        transformed_images.append(transformed)

    except Exception as e:
        print(f"Error with image {file_name}: {e}")


100%|██████████| 6881/6881 [01:17<00:00, 88.86it/s]


In [106]:
import timm
import torch.nn as nn

# Create EfficientNet-B0 for binary classification
model = timm.create_model(
    "efficientnet_b0",
    pretrained=True,
    num_classes=1  # 1 output neuron
)

# Wrap in sigmoid (if your loss doesn't include it)
model = nn.Sequential(
    model,
    nn.Sigmoid()
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]