# Data download + PNG resize (CPU)

**Load the ISIC dataset (collections 66, 67 and 73, representative of the Task 3 training set) into the *results* variable** (about 11.7k entries)

In [1]:
import requests

# Base endpoint of the ISIC image search API; the collections filter is sent
# only with the first request because every "next" cursor URL already embeds it.
base_url = "https://api.isic-archive.com/api/v2/images/search/"
params = {
    "collections": "66,67,73"
}

# Accumulators filled across all pages:
#   all_ids -> the isic_id of every record
#   results -> the full record dicts returned by the API
all_ids = []
results = []

page = 1

while True:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=60)
    # Fail loudly on HTTP errors. Without this, an error payload (which has
    # neither "results" nor "next") would end the loop silently and leave a
    # truncated dataset that looks complete.
    response.raise_for_status()
    data = response.json()

    # Keep both the ID and the complete record of every image on this page
    for result in data.get("results", []):
        all_ids.append(result["isic_id"])
        results.append(result)

    # The API returns the URL of the next page, or nothing on the last page
    next_cursor = data.get("next")
    if not next_cursor:
        break  # we're done!

    # Follow the cursor for the next request
    base_url = next_cursor
    params = {}  # cursor URL already includes params
    page += 1

print(f"Total images collected: {len(all_ids)}")


Total images collected: 11720


**Analysing the data**

In [2]:
# Display the first record to see which fields the search API returns
results[0]

{'isic_id': 'ISIC_0036064',
 'copyright_license': 'CC-BY-NC',
 'attribution': 'MILK study team',
 'files': {'full': {'url': 'https://content.isic-archive.com/95724593-18b2-4e93-b947-89b2309f3b47/57647d76-d7ba-4054-8119-184e6376d984.jpg?Expires=1746662400&Signature=N~2rVb8V0uB8Nxc4TRlkRNSaxIRqLEAer2VyG00qb0u3Zunq3FKNLVbvJ7pwdo9oLR~awfHJ13P3nL2JYUUiCESJItGFlfb3u1Ym4QGNx~ieyTY57mMgez~B1ADQj1sbLdjmFzBmu5M4uMQlFmalEiq6BZ5lLD3re1xFh7ZC9BPnpBA45o9u49M-7un3wvUvX1R34g3xoBjnFG2IyIdfcmNeZxOX~DN~o14XsdQUtapzfUyHEYgvln7t8lQpTQohB0G~2sA2PqbRs03mWh0trY38r1hacLGXeDwS~OxX6L7LlX9ZdMD-KvWugFbnjZrLRd9jz9z0ciK~kah-Zacmeg__&Key-Pair-Id=K3KFHCM130RXTL',
   'size': 20930},
  'thumbnail_256': {'url': 'https://content.isic-archive.com/3fe9d80d-eb4e-403f-9a6b-6aaffb37c265/ISIC_0036064_thumbnail_256.jpg?Expires=1746662400&Signature=h3h25SmAx-rct0V3WngQTGTr1dyrfB2LSZpF3hditv1gS8avyarwP6VwNorQ7vT2uEbWCzBOn-rjDAiOgob7fwsvCyicaO4s2Bc---o8YkNPwcmgRH1kLklDGZcEMd3hYJM92rycteVlwuGbfxVtdMDVTxO9ZOembGkr7gtr2967KGOgXYr4shwO

**We will confirm that every image has the expected size (600x450)**

In [3]:
# Report every record whose recorded acquisition size is not exactly 600x450.
for result in results:
    try:
        pixels_x = result["metadata"]["acquisition"]["pixels_x"]
        pixels_y = result["metadata"]["acquisition"]["pixels_y"]
        # `or`, not `and`: an image has the wrong size as soon as EITHER
        # dimension deviates. With `and`, an image that is wrong in only one
        # dimension would never be reported.
        if pixels_x != 600 or pixels_y != 450:
            print("Incorrect Image ID:", result["isic_id"])
            print("pixel_X:", pixels_x, "pixel_Y:", pixels_y)
    except KeyError:
        # The record has no metadata/acquisition/pixels_* entry (or no isic_id)
        print("Missing pixel data in one of the results.")


Since no image ID was printed, we can assume all the images have the correct size

**Download the photos into ISIC_IMAGES_TASK_3 folder**

In [None]:
import os

# Make sure the folder exists
os.makedirs("ISIC_IMAGES_TASK_3", exist_ok=True)

# Download the full-resolution file of every record into ISIC_IMAGES_TASK_3.
# NOTE(review): the URLs in `results` carry an `Expires=` query parameter, so
# they are presumably time-limited - re-run the search cell if downloads start
# failing with HTTP errors.
for result in results:
    try:
        isic_id = result["isic_id"]
        image_url = result["files"]["full"]["url"]
    except KeyError as e:
        print(f"Missing key {e} in one of the results.")
        continue

    file_path = os.path.join("ISIC_IMAGES_TASK_3", f"{isic_id}.jpg")

    # Re-running the cell resumes instead of fetching ~10k images again
    if os.path.isfile(file_path) and os.path.getsize(file_path) > 0:
        continue

    print(f"Downloading {isic_id}...")

    # A single network hiccup must not abort the whole batch: report it and
    # move on so the image can be retried by re-running the cell.
    try:
        response = requests.get(image_url, timeout=60)
    except requests.RequestException as e:
        print(f"Failed to download {isic_id}: {e}")
        continue

    if response.status_code == 200:
        with open(file_path, "wb") as f:
            f.write(response.content)
    else:
        print(f"Failed to download {isic_id}: HTTP {response.status_code}")


Downloading ISIC_0036064...
Downloading ISIC_0036063...
Downloading ISIC_0036062...
Downloading ISIC_0036061...
Downloading ISIC_0036060...
Downloading ISIC_0036059...
Downloading ISIC_0036058...
Downloading ISIC_0036057...
Downloading ISIC_0036056...
Downloading ISIC_0036055...
Downloading ISIC_0036054...
Downloading ISIC_0036053...
Downloading ISIC_0036052...
Downloading ISIC_0036051...
Downloading ISIC_0036050...
Downloading ISIC_0036049...
Downloading ISIC_0036048...
Downloading ISIC_0036047...
Downloading ISIC_0036046...
Downloading ISIC_0036045...
Downloading ISIC_0036044...
Downloading ISIC_0036043...
Downloading ISIC_0036042...
Downloading ISIC_0036041...
Downloading ISIC_0036040...
Downloading ISIC_0036039...
Downloading ISIC_0036038...
Downloading ISIC_0036037...
Downloading ISIC_0036036...
Downloading ISIC_0036035...
Downloading ISIC_0036034...
Downloading ISIC_0036033...
Downloading ISIC_0036032...
Downloading ISIC_0036031...
Downloading ISIC_0036030...
Downloading ISIC_003

In [None]:
# Count the regular files (sub-directories are ignored) in the download folder
folder = "ISIC_IMAGES_TASK_3"
num_files = sum(
    1
    for name in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, name))
)

print("Expected 10015")
print(f"Got: {num_files}")


Explore how to identify a picture as benign or malignant.

First, the field "benign_malignant" seems appropriate for this purpose, so we will check whether every entry of the dataset has this value.

In [None]:
# For each record, list which of the two clinical fields we rely on are absent.
for entry in results:
    isic_id = entry.get("isic_id", "UNKNOWN_ID")
    clinical = entry.get("metadata", {}).get("clinical", {})

    # Same check order as the report order: lesion_id first, then benign_malignant
    missing = [
        field
        for field in ("lesion_id", "benign_malignant")
        if field not in clinical
    ]

    if missing:
        print(f"{isic_id} is missing: {', '.join(missing)}")


Conclusion: some of them do not have this attribute, so we need to find another one for them. The attribute diagnosis_1 seems suitable for this task; we will check which values it can take.

In [None]:
# Gather the distinct non-empty diagnosis_1 values found on records that
# have no benign_malignant field.
missing_diagnosis_1_values = set()

for entry in results:
    clinical = entry.get("metadata", {}).get("clinical", {})

    # Records that already carry benign_malignant are not of interest here
    if "benign_malignant" in clinical:
        continue

    diag1 = clinical.get("diagnosis_1")
    if diag1:
        missing_diagnosis_1_values.add(diag1)

print(" Unique 'diagnosis_1' values for entries missing 'benign_malignant':")
for value in sorted(missing_diagnosis_1_values):
    print(f"- {value}")


We can conclude that if an entry does not have the benign_malignant attribute, we can use diagnosis_1, since it provides information about the cancer.
diagnosis_1 can take the value "Indeterminate"; we will discard these entries since they do not identify the disease.

Write the file **lesions.csv** with the following columns:

*   file -> isic_id.jpg
*   patient -> lesion_id
*   label
      - 0 ("benign_malignant": "benign")
      - 1 ("benign_malignant": "malignant")








In [None]:
import pandas as pd

def _binary_label(clinical):
    """Map a record's clinical metadata to 0 (benign), 1 (malignant) or None.

    `benign_malignant` is used when it is exactly "benign" or "malignant"
    (case-insensitive). Any other value there (for example an indeterminate
    verdict) is NOT treated as benign; the function falls back to
    `diagnosis_1`, which is accepted only when it is exactly "Benign" or
    "Malignant". None means the record cannot be labelled and must be skipped.
    """
    verdict = clinical.get("benign_malignant")
    if isinstance(verdict, str):
        verdict = verdict.strip().lower()
        if verdict == "benign":
            return 0
        if verdict == "malignant":
            return 1

    diagnosis_1 = clinical.get("diagnosis_1", "")
    if diagnosis_1 == "Benign":
        return 0
    if diagnosis_1 == "Malignant":
        return 1
    return None  # Indeterminate or unknown


rows = []

for result in results:
    try:
        isic_id = result["isic_id"]
        clinical = result["metadata"]["clinical"]
    except (KeyError, TypeError) as e:
        # Only the expected "record is malformed" failures are tolerated;
        # anything else is a real bug and should surface.
        print(f"Skipped entry {result.get('isic_id', 'UNKNOWN')} due to error: {e}")
        continue

    label = _binary_label(clinical)
    if label is None:
        continue  # ❌ Skip

    rows.append({
        "file": f"{isic_id}.jpg",
        # Records without a lesion_id all share the "unknown" group
        "patient": clinical.get("lesion_id", "unknown"),
        "label": label
    })

# Create DataFrame and write to CSV
df = pd.DataFrame(rows)
df.to_csv("lesions.csv", index=False)

print("Saved lesions.csv with", len(df), "entries.")


**Load dataset kaggle**

In [None]:
import kagglehub
import os
import shutil
from tqdm import tqdm

# Fetch the HAM10000 dataset through kagglehub
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("✅ Dataset downloaded to:", path)

# All images are flattened into this single folder
dst_dir = "KAGGLE_IMAGES_ham10000"
os.makedirs(dst_dir, exist_ok=True)

# Walk the whole download tree and copy every .jpg (case-insensitive) over
count = 0
for root, _, files in os.walk(path):
    for name in files:
        if not name.lower().endswith(".jpg"):
            continue
        shutil.copy2(os.path.join(root, name), os.path.join(dst_dir, name))
        count += 1

print(f"Copied {count} .jpg files into '{dst_dir}'")

# The metadata CSV is expected directly under the download root
meta_name = "HAM10000_metadata.csv"
meta_src = os.path.join(path, meta_name)
meta_dst = os.path.join(dst_dir, meta_name)
shutil.copy2(meta_src, meta_dst)
print("Copied metadata file to:", dst_dir)

Upon further investigation, we identified an error in our dataset selection. The HAM10000 dataset completely overlaps with the ISIC 2018 Task 3 dataset, as HAM10000 images are included within ISIC. Therefore, we have decided to exclude HAM10000 from our analysis.

The code below demonstrates this overlap by comparing the image IDs previously retrieved from ISIC with those found in the HAM10000 metadata.

In [None]:
# Compare the HAM10000 image IDs with the IDs collected from the ISIC API
metadata_path = "KAGGLE_IMAGES_ham10000/HAM10000_metadata.csv"
df = pd.read_csv(metadata_path)

metadata_ids = set(df["image_id"])
all_ids_set = set(all_ids)

# IDs listed in the HAM10000 metadata that the ISIC query did not return
missing_ids = metadata_ids.difference(all_ids_set)

print(f"Found {len(missing_ids)} image_ids in HAM10000 metadata that are not in all_ids:")
for mid in sorted(missing_ids):
    print(f"- {mid}")


Split into 70/15/15

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read back the labelled table written earlier
df = pd.read_csv("lesions.csv")

# The split is done on patient IDs, not on rows, so that all images sharing a
# `patient` value end up in the same subset.
unique_patients = df["patient"].unique()

# 70% of the patients go to train; the remaining 30% are held out ...
train_patients, temp_patients = train_test_split(
    unique_patients, test_size=0.30, random_state=42
)
# ... and halved into validation and test (15% each)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.50, random_state=42
)

# Materialise each subset and save it as lesions_<name>.csv
subsets = {}
for subset_name, patient_ids in (
    ("train", train_patients),
    ("val", val_patients),
    ("test", test_patients),
):
    subset = df[df["patient"].isin(patient_ids)].reset_index(drop=True)
    subset.to_csv(f"lesions_{subset_name}.csv", index=False)
    subsets[subset_name] = subset

train_df = subsets["train"]
val_df = subsets["val"]
test_df = subsets["test"]

# Summary
print(f"Train: {len(train_df)} samples")
print(f"Val:   {len(val_df)} samples")
print(f"Test:  {len(test_df)} samples")


In [None]:
import os
from PIL import Image
import torch
from torchvision import transforms
from tqdm import tqdm

# Training-time preprocessing: light random augmentation, then conversion to a
# tensor normalised with mean 0.5 / std 0.5.
train_transform = transforms.Compose(
    [
        # Random crop covering 90-100% of the image area, resized to 224x224
        transforms.RandomResizedCrop(size=224, scale=(0.9, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

In [None]:
from torch.utils.data import Dataset

class SkinCancerDataset(Dataset):
    """Dataset yielding (image, label) pairs described by a DataFrame.

    Each row of `dataframe` must provide a "file" column (image file name,
    resolved relative to `image_dir`) and a "label" column (returned as a
    float32 tensor). `transform`, when given, is applied to the RGB PIL image.
    """

    def __init__(self, dataframe, image_dir, transform=None):
        self.df = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the idx-th image (by position) and return it with its label."""
        record = self.df.iloc[idx]
        target = torch.tensor(record["label"], dtype=torch.float32)

        # Always decode to 3-channel RGB, whatever the file's native mode is
        picture = Image.open(os.path.join(self.image_dir, record["file"])).convert("RGB")

        if not self.transform:
            return picture, target
        return self.transform(picture), target

In [None]:
from torch.utils.data import DataLoader

# Training set: rows of train_df, images read from the download folder,
# augmented with train_transform.
train_dataset = SkinCancerDataset(
    dataframe=train_df,
    image_dir="ISIC_IMAGES_TASK_3",
    transform=train_transform,
)

# Batches of 32, reshuffled every epoch, loaded by two worker processes
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=2)

In [None]:
import timm
import torch.nn as nn

# Pretrained EfficientNet-B0 with a single output neuron (one logit per image)
model = timm.create_model(model_name="efficientnet_b0", pretrained=True, num_classes=1)

In [None]:
import torch
import torch.nn as nn

class FocalBCELoss(nn.Module):
    """Binary focal loss computed from raw logits.

    Each element's binary cross-entropy is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the probability the model assigns to the TRUE class
    (``p`` for positive targets, ``1 - p`` for negative targets). Well
    classified examples are therefore down-weighted and misclassified ones
    keep (almost) their full loss, for both classes.

    Args:
        gamma: focusing exponent; ``gamma=0`` reduces to plain mean BCE.
    """

    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma
        # Per-element BCE so every element can get its own focal weight
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, input, target):
        """Return the mean focal loss of logits `input` against 0/1 `target`."""
        bce = self.bce(input, target)
        prob = torch.sigmoid(input)
        # Probability of the true class. Using `1 - prob` for every element
        # (ignoring the target) would give confidently WRONG negatives a
        # weight near zero, which is the opposite of what focal loss intends.
        p_t = prob * target + (1 - prob) * (1 - target)
        focal = (1 - p_t) ** self.gamma
        return (focal * bce).mean()

In [None]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

# Loss: focal BCE on logits
criterion = FocalBCELoss(gamma=2.0)

# Optimiser over all model parameters, with decoupled weight decay
optimizer = AdamW(params=model.parameters(), lr=3e-4, weight_decay=1e-4)

# Cosine learning-rate schedule with a period parameter of 30 scheduler steps
scheduler = CosineAnnealingLR(optimizer=optimizer, T_max=30)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_auc_score, balanced_accuracy_score,
    precision_recall_fscore_support
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Validation must be deterministic: the same image has to produce the same
# tensor every epoch, otherwise the metrics fluctuate with the random
# augmentations instead of with the model. Only resize + normalise here,
# using the same normalisation constants as the training transform.
val_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

val_dataset = SkinCancerDataset(val_df, "ISIC_IMAGES_TASK_3", transform=val_transform)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=2)

# Per-epoch history, consumed by the plotting code below
n_epochs = 10
train_losses = np.zeros(n_epochs)
val_losses = np.zeros(n_epochs)
val_aurocs = np.zeros(n_epochs)
val_f1s = np.zeros(n_epochs)
val_bal_accs = np.zeros(n_epochs)

for epoch in range(n_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        # squeeze(1) drops only the output-neuron axis: (B, 1) -> (B,).
        # A bare squeeze() would also drop the batch axis when the last
        # batch holds a single image.
        logits = model(images).squeeze(1)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch value is a per-sample mean
        running_loss += loss.item() * images.size(0)

    train_losses[epoch] = running_loss / len(train_loader.dataset)
    scheduler.step()

    # ---- validation pass ----
    model.eval()
    val_running_loss = 0.0
    all_probs = []
    all_labels = []
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            logits = model(images).squeeze(1)

            # The criterion expects LOGITS; feeding it sigmoid probabilities
            # would apply the sigmoid twice and report a meaningless loss.
            val_running_loss += criterion(logits, labels).item() * images.size(0)

            all_probs.extend(torch.sigmoid(logits).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    y_true = np.array(all_labels)
    y_prob = np.array(all_probs)
    y_pred = (y_prob > 0.5).astype(int)  # fixed 0.5 decision threshold

    val_losses[epoch] = val_running_loss / len(val_loader.dataset)
    val_aurocs[epoch] = roc_auc_score(y_true, y_prob)
    val_bal_accs[epoch] = balanced_accuracy_score(y_true, y_pred)
    _, _, val_f1s[epoch], _ = precision_recall_fscore_support(y_true, y_pred, average='binary')

    print(f"Epoch {epoch+1:2d}: Train Loss = {train_losses[epoch]:.4f} | "
          f"Val Loss = {val_losses[epoch]:.4f} | AUROC = {val_aurocs[epoch]:.4f} | "
          f"F1 = {val_f1s[epoch]:.4f} | Balanced Acc = {val_bal_accs[epoch]:.4f}")



# Figure 1: training vs. validation loss per epoch
_, loss_ax = plt.subplots(figsize=(10, 4))
loss_ax.plot(train_losses, label="Train Loss")
loss_ax.plot(val_losses, label="Val Loss")
loss_ax.set_title("Loss Over Epochs")
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.legend()
loss_ax.grid(True)
plt.show()

# Figure 2: validation metrics per epoch, on a fixed 0-1 scale
_, metric_ax = plt.subplots(figsize=(10, 4))
for series, series_label in (
    (val_aurocs, "AUROC"),
    (val_f1s, "F1 Score"),
    (val_bal_accs, "Balanced Accuracy"),
):
    metric_ax.plot(series, label=series_label)
metric_ax.set_title("Validation Metrics Over Epochs")
metric_ax.set_xlabel("Epoch")
metric_ax.set_ylabel("Score")
metric_ax.set_ylim(0, 1)
metric_ax.legend()
metric_ax.grid(True)
plt.show()