In [4]:
# Import Drive
from google.colab import output
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Data download + PNG resize (CPU)

**Load the dataset ISIC(collection 66, representative of the training set of task 3) into the *results* variable** , ~10k entries

In [5]:
import requests

# Base endpoint
base_url = "https://api.isic-archive.com/api/v2/images/search/"
params = {
    "collections": "66,67,73"
}

# Loop until no more pages
all_ids = []
results = []

page = 1

while True:
   # print(f"Requesting page {page}...")
    response = requests.get(base_url, params=params)
    data = response.json()

    # Extract image IDs
    for result in data.get("results", []):
        all_ids.append(result["isic_id"])
        results.append(result)

    # Print first ID for confirmation
    #print("image ID:", all_ids[(page-1)*100])

    # Prepare for next page
    next_cursor = data.get("next")
    if not next_cursor:
        break  # we're done!

    # Update URL for the next request
    base_url = next_cursor
    params = {}  # cursor URL already includes params
    page += 1

print(f"Total images collected: {len(all_ids)}")


Total images collected: 11720


**Analysing the data**

In [6]:
results[5]

{'isic_id': 'ISIC_0036059',
 'copyright_license': 'CC-BY-NC',
 'attribution': 'MILK study team',
 'files': {'full': {'url': 'https://content.isic-archive.com/b1da82d9-f96a-4279-a22d-4df086f862e8/ab106229-0d37-440d-82e1-ea0f8b346576.jpg?Expires=1746921600&Signature=KbsaSqjSJ~Hc6VLXEI3jv4n1BRoSU85VSV1cASntK0v4dc2EvMCEKCNBgJxokefHM4KYNR6OJJh6UkY1PSDwPYUBCVTpWBFMygHXBkriPdVf5N5Wx1DNd3NfdZzJq0TSufMMaOS8HdjOs2YvuzD3LjZwsNDe4DqTaSRvOh21rWh8ElJ~ngguas35ZexkJfnuvxG~P-ArvOlFNIItyR5LRzgsZpq4D9Z0nZ3Y6E6RnKlOLmpEn3vlw77WQerR7SVK19s0FLNuPf5cUU6DM50RtgW4ft2hHZYH1Dzxpa4SE1FGgW1rOn85Q663D1lRC5LpBwQy9mOLFYkjrxt1R8BqVw__&Key-Pair-Id=K3KFHCM130RXTL',
   'size': 19565},
  'thumbnail_256': {'url': 'https://content.isic-archive.com/8430b64f-83c0-4f66-98ab-260535ab538c/ISIC_0036059_thumbnail_256.jpg?Expires=1746921600&Signature=IEdlwmoE1yPc0HzrWpTFAwXDF1q0yVhO4mTEGe18x3jc~KnF2h8hE36jPkWpYfrTEipWoz606j~fzZdSkGwM2IHFw7~j755r46QYmLdj6esRgMR2ZR-J50FDw5bweOqMgLMVTQvO9tBcTj2Z~T279JeT9CftWFWmptJS1rhnb1dOWcWFOMw4nNvo

**We will confirm if every image is in the correct size (600x450)**

In [7]:
for result in results:
    try:
        pixels_x = result["metadata"]["acquisition"]["pixels_x"]
        pixels_y = result["metadata"]["acquisition"]["pixels_y"]
        if(pixels_x != 600 and pixels_y != 450):
          print("Incorrect Image ID:", result["isic_id"])
          print("pixel_X:", pixels_x, "pixel_Y:", pixels_y)
    except KeyError:
        print("Missing pixel data in one of the results.")


**Download the photos into ISIC_IMAGES_TASK_3 folder**

In [8]:
PATH = "/content/drive/MyDrive/ISIC_IMAGES_TASK_3"

In [9]:
"""import os
import requests
from tqdm import tqdm

# Make sure the folder exists
PATH = "ISIC_IMAGES_TASK_3"
os.makedirs("ISIC_IMAGES_TASK_3", exist_ok=True)

# Loop through all results with tqdm
for result in tqdm(results, desc="Downloading ISIC images"):
    try:
        isic_id = result["isic_id"]
        image_url = result["files"]["full"]["url"]
        file_path = os.path.join("ISIC_IMAGES_TASK_3", f"{isic_id}.jpg")

        # Download and save the image
        response = requests.get(image_url)
        if response.status_code == 200:
            with open(file_path, "wb") as f:
                f.write(response.content)
        else:
            print(f"Failed to download {isic_id}: HTTP {response.status_code}")
    except KeyError as e:
        print(f"Missing key {e} in one of the results.")"""


'import os\nimport requests\nfrom tqdm import tqdm\n\n# Make sure the folder exists\nPATH = "ISIC_IMAGES_TASK_3"\nos.makedirs("ISIC_IMAGES_TASK_3", exist_ok=True)\n\n# Loop through all results with tqdm\nfor result in tqdm(results, desc="Downloading ISIC images"):\n    try:\n        isic_id = result["isic_id"]\n        image_url = result["files"]["full"]["url"]\n        file_path = os.path.join("ISIC_IMAGES_TASK_3", f"{isic_id}.jpg")\n\n        # Download and save the image\n        response = requests.get(image_url)\n        if response.status_code == 200:\n            with open(file_path, "wb") as f:\n                f.write(response.content)\n        else:\n            print(f"Failed to download {isic_id}: HTTP {response.status_code}")\n    except KeyError as e:\n        print(f"Missing key {e} in one of the results.")'

Explore how to identify a picture as benign or malignant.

First the field "benign_malignant" seems apropriate to do this testing,so we will see if all the entries of the dataset have this value

In [10]:
for entry in results:
    isic_id = entry.get("isic_id", "UNKNOWN_ID")
    clinical = entry.get("metadata", {}).get("clinical", {})

    missing = []
    if "lesion_id" not in clinical:
        missing.append("lesion_id")
    if "benign_malignant" not in clinical:
        missing.append("benign_malignant")

    if missing:
        print(f"{isic_id} is missing: {', '.join(missing)}")


ISIC_0036064 is missing: benign_malignant
ISIC_0036063 is missing: benign_malignant
ISIC_0036062 is missing: benign_malignant
ISIC_0036060 is missing: benign_malignant
ISIC_0036053 is missing: benign_malignant
ISIC_0036050 is missing: benign_malignant
ISIC_0036043 is missing: benign_malignant
ISIC_0036035 is missing: benign_malignant
ISIC_0036032 is missing: benign_malignant
ISIC_0036031 is missing: benign_malignant
ISIC_0036025 is missing: benign_malignant
ISIC_0036023 is missing: benign_malignant
ISIC_0036022 is missing: benign_malignant
ISIC_0036021 is missing: benign_malignant
ISIC_0036019 is missing: benign_malignant
ISIC_0036018 is missing: benign_malignant
ISIC_0036016 is missing: benign_malignant
ISIC_0036015 is missing: benign_malignant
ISIC_0036011 is missing: benign_malignant
ISIC_0036009 is missing: benign_malignant
ISIC_0036008 is missing: benign_malignant
ISIC_0036007 is missing: benign_malignant
ISIC_0036006 is missing: benign_malignant
ISIC_0036004 is missing: benign_ma

Conclusion:Some of theme do not have this attribute so we need to find another for theme.The attribute diagnosis_1 seems good for this task, we will see wich values can it be.

In [11]:
missing_diagnosis_1_values = set()

for entry in results:
    clinical = entry.get("metadata", {}).get("clinical", {})

    if "benign_malignant" not in clinical:
        diag1 = clinical.get("diagnosis_1")
        if diag1:
            missing_diagnosis_1_values.add(diag1)

print(" Unique 'diagnosis_1' values for entries missing 'benign_malignant':")
for value in sorted(missing_diagnosis_1_values):
    print("-", value)


 Unique 'diagnosis_1' values for entries missing 'benign_malignant':
- Benign
- Indeterminate
- Malignant


We can conclude that if they dont have the benign_malignant atribute, we can use diagnosis_1 since it provides information about the cancer.
diagnosis_1 can take the value "Indeterminate", we will discard these values since they do not identify the disease.

Put into the file **lesions.csv** with:

*   file -> isic_id.jpg
*   patient -> lesion_id
*   label
      - 0 ("benign_malignant": "benign")
      - 1 ("benign_malignant": "malignant")








In [12]:
import pandas as pd

rows = []

for result in results:
    try:
        isic_id = result["isic_id"]
        filename = f"{isic_id}.jpg"
        clinical = result["metadata"]["clinical"]
        patient = clinical.get("lesion_id", "unknown")

        benign_malignant = clinical.get("benign_malignant")
        diagnosis_1 = clinical.get("diagnosis_1", "")

        # Decide label based on available info
        if benign_malignant:
            label = 1 if benign_malignant.lower() == "malignant" else 0
        elif diagnosis_1 == "Benign":
            label = 0
        elif diagnosis_1 == "Malignant":
            label = 1
        else:  # Indeterminate or unknown
            continue  # ❌ Skip

        rows.append({
            "file": filename,
            "patient": patient,
            "label": label
        })

    except Exception as e:
        print(f"Skipped entry {result.get('isic_id', 'UNKNOWN')} due to error: {e}")

# Create DataFrame and write to CSV
df = pd.DataFrame(rows)
df.to_csv("lesions.csv", index=False)

print("Saved lesions.csv with", len(df), "entries.")


Saved lesions.csv with 11571 entries.


**Load dataset kaggle**

In [13]:
"""import kagglehub
import os
import shutil
from tqdm import tqdm

# Step 1: Download the dataset
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("✅ Dataset downloaded to:", path)

# Step 2: Create destination folder
dst_dir = "KAGGLE_IMAGES_ham10000"
os.makedirs(dst_dir, exist_ok=True)

# Step 3: Walk the directory and copy .jpg files
count = 0
for root, _, files in os.walk(path):
    for file in files:
        if file.lower().endswith(".jpg"):
            src_path = os.path.join(root, file)
            dst_path = os.path.join(dst_dir, file)
            shutil.copy2(src_path, dst_path)
            count += 1

print(f"Copied {count} .jpg files into '{dst_dir}'")
meta_src = os.path.join(path, "HAM10000_metadata.csv")
meta_dst = os.path.join(dst_dir, "HAM10000_metadata.csv")
shutil.copy2(meta_src, meta_dst)
print("Copied metadata file to:", dst_dir)"""

'import kagglehub\nimport os\nimport shutil\nfrom tqdm import tqdm\n\n# Step 1: Download the dataset\npath = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")\nprint("✅ Dataset downloaded to:", path)\n\n# Step 2: Create destination folder\ndst_dir = "KAGGLE_IMAGES_ham10000"\nos.makedirs(dst_dir, exist_ok=True)\n\n# Step 3: Walk the directory and copy .jpg files\ncount = 0\nfor root, _, files in os.walk(path):\n    for file in files:\n        if file.lower().endswith(".jpg"):\n            src_path = os.path.join(root, file)\n            dst_path = os.path.join(dst_dir, file)\n            shutil.copy2(src_path, dst_path)\n            count += 1\n\nprint(f"Copied {count} .jpg files into \'{dst_dir}\'")\nmeta_src = os.path.join(path, "HAM10000_metadata.csv")\nmeta_dst = os.path.join(dst_dir, "HAM10000_metadata.csv")\nshutil.copy2(meta_src, meta_dst)\nprint("Copied metadata file to:", dst_dir)'

Upon further investigation, we identified an error in our dataset selection. The HAM10000 dataset completely overlaps with the ISIC 2018 Task 3 dataset, as HAM10000 images are included within ISIC. Therefore, we have decided to exclude HAM10000 from our analysis.

The code below demonstrates this overlap by comparing the image IDs previously retrieved from ISIC with those found in the HAM10000 metadata.

In [14]:
"""metadata_path = "KAGGLE_IMAGES_ham10000/HAM10000_metadata.csv"
df = pd.read_csv(metadata_path)

metadata_ids = set(df["image_id"])

all_ids_set = set(all_ids)

missing_ids = metadata_ids - all_ids_set

print(f"Found {len(missing_ids)} image_ids in HAM10000 metadata that are not in all_ids:")
for mid in sorted(missing_ids):
    print("-", mid)"""


'metadata_path = "KAGGLE_IMAGES_ham10000/HAM10000_metadata.csv"\ndf = pd.read_csv(metadata_path)\n\nmetadata_ids = set(df["image_id"])\n\nall_ids_set = set(all_ids)\n\nmissing_ids = metadata_ids - all_ids_set\n\nprint(f"Found {len(missing_ids)} image_ids in HAM10000 metadata that are not in all_ids:")\nfor mid in sorted(missing_ids):\n    print("-", mid)'

Split into 70/15/15

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load your lesions.csv
df = pd.read_csv("lesions.csv")

# Step 1: Get unique patients
unique_patients = df["patient"].unique()

# Step 2: Split patients → 70% train, 30% temp (val+test)
train_patients, temp_patients = train_test_split(
    unique_patients, test_size=0.30, random_state=42
)

# Step 3: Split temp → 50/50 into val and test (15% each)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.50, random_state=42
)

# Step 4: Create splits by filtering on patient ID
train_df = df[df["patient"].isin(train_patients)].reset_index(drop=True)
val_df = df[df["patient"].isin(val_patients)].reset_index(drop=True)
test_df = df[df["patient"].isin(test_patients)].reset_index(drop=True)

# Step 5: Save them to CSV (optional)
train_df.to_csv("lesions_train.csv", index=False)
val_df.to_csv("lesions_val.csv", index=False)
test_df.to_csv("lesions_test.csv", index=False)

# Summary
print(f"Train: {len(train_df)} samples")
print(f"Val:   {len(val_df)} samples")
print(f"Test:  {len(test_df)} samples")


Train: 8134 samples
Val:   1694 samples
Test:  1743 samples


pre-prcessing

In [16]:
import os
from PIL import Image
import torch
from torchvision import transforms
from tqdm import tqdm

train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(0.1, 0.1, 0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5], [0.5,0.5,0.5])

])

val_transform = transforms.Compose([
    transforms.CenterCrop(224),
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize([0.5,0.5,0.5],
                         [0.5,0.5,0.5])
])


Cria a class dataset que é utilziada para criar um obejto dtaset com a colecao de todoas as imagens em formato tesor e o seu respetivo label, pode ser indexado por um index que vai desde 0 a size-1

In [17]:
from torch.utils.data import Dataset

class SkinCancerDataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None, return_filename=False):
        self.df = dataframe
        self.image_dir = image_dir
        self.transform = transform
        self.return_filename = return_filename

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_dir, row["file"])
        label = torch.tensor(row["label"], dtype=torch.float32)
        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        if self.return_filename:
            return image, label, row["file"]
        else:
            return image, label

In [18]:
from torch.utils.data import DataLoader

train_dataset = SkinCancerDataset(train_df, PATH, transform=train_transform)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2
)

In [19]:
len([f for f in os.listdir('/content/drive/MyDrive/ISIC_IMAGES_TASK_3') if f.lower().endswith(('.png', '.jpg', '.jpeg'))])

11730

In [20]:
len(train_dataset)

8134

Visualize samples image and tensor values:

In [21]:

def show_tensor_image(tensor, mean=0.5, std=0.5):
    """
    Display a single image given a tensor of shape (C, H, W).
    Assumes the tensor was normalized with transforms.Normalize([mean]*3, [std]*3).
    """
    img = tensor.cpu().clone()          # clone to avoid modifying original
    img = img * std + mean             # unnormalize
    img = img.permute(1, 2, 0).numpy()  # C×H×W -> H×W×C
    plt.figure(figsize=(4,4))
    plt.imshow(img)
    plt.axis('off')
    plt.show()

index = 8111
sample = train_dataset[index][0]
label = train_dataset[index][1]
show_tensor_image(sample)
print(label)

NameError: name 'plt' is not defined

In [22]:
train_dataset[index][0]

tensor([[[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]],

        [[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]],

        [[-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         ...,
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.],
         [-1., -1., -1.,  ..., -1., -1., -1.]]])

Create model fast and simple

In [23]:
import timm
import torch.nn as nn

model = timm.create_model(
    "efficientnet_b0",
    pretrained=True,
    num_classes=1  # output neuron
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/21.4M [00:00<?, ?B/s]

In [24]:
model.eval()
with torch.inference_mode():
  sample = train_dataset[index][0].unsqueeze(dim=0)
  output = model(sample)

print(sample.shape)
print(output)

torch.Size([1, 3, 224, 224])
tensor([[8.7334]])


Funcao de perda boa para datasets debalanceados, as funcoes implementadas no pytorch nao sao fixes para o nosso caso, tem bce mas nao tem focal bce :(

In [25]:
import torch
import torch.nn as nn

class FocalBCELoss(nn.Module):
    def __init__(self, gamma=2):
        super().__init__()
        self.gamma = gamma
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, input, target):
        bce = self.bce(input, target)
        prob = torch.sigmoid(input)
        focal = (1 - prob) ** self.gamma
        return (focal * bce).mean()

AdamW lr 3e-4,
weight-decay 1e-4,
cosine schedule 30
epochs.

In [26]:
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR

criterion = FocalBCELoss(gamma=2.0)
optimizer = AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = CosineAnnealingLR(optimizer, T_max=30)

Treino do modelo e visualização dos resultados:

In [None]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics import (
    roc_auc_score, balanced_accuracy_score,
    precision_recall_fscore_support, classification_report
)
from torch.utils.data import DataLoader


# --- 2) Instantiate loaders --------------------------------------

#--------APAGAR----------- (mini dataset)
#small = train_df.groupby('label').sample(10, random_state=42)
#train_dataset = SkinCancerDataset(small, PATH, transform=train_transform)
#train_loader  = DataLoader(train_dataset, batch_size=4, shuffle=True)
# train on just small_loader and see if the model can learn to predict both classes
#--------APAGAR-----------



# training loader (no filenames)
train_dataset = SkinCancerDataset(train_df, PATH, transform=train_transform, return_filename=False)
train_loader  = DataLoader(train_dataset, batch_size=32, shuffle=True,  num_workers=2)

# validation loader (with filenames)
val_dataset = SkinCancerDataset(val_df,   PATH, transform=val_transform, return_filename=True)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, num_workers=2)

# --- 3) Prepare model & optimizer & scheduler ------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# (Assume criterion, optimizer, scheduler already defined)
# criterion = FocalBCELoss(...)
# optimizer = AdamW(...)
# scheduler = CosineAnnealingLR(...)

# --- 4) Set up metric arrays ------------------------------------
n_epochs       = 3
train_losses   = np.zeros(n_epochs)
train_bal_accs = np.zeros(n_epochs)

val_losses     = np.zeros(n_epochs)
val_aurocs     = np.zeros(n_epochs)
val_f1s        = np.zeros(n_epochs)
val_bal_accs   = np.zeros(n_epochs)

# --- 5) Training + Eval loop with tracking ----------------------

for epoch in range(n_epochs):
    # ---- TRAINING ----
    model.train()
    running_loss = 0.0

    # accumulate train preds for balanced accuracy
    train_labels, train_probs = [], []

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{n_epochs} [Train]", leave=False)
    for images, labels in pbar:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        logits = model(images).squeeze()
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

        # store for training balanced accuracy
        probs = torch.sigmoid(logits)
        train_probs.extend(probs.detach().cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

        pbar.set_postfix(loss=loss.item(),
                         lr=optimizer.param_groups[0]['lr'])

    # epoch wise stats
    train_losses[epoch]   = running_loss / len(train_loader.dataset)
    train_bal_accs[epoch] = balanced_accuracy_score(
        train_labels,
        (np.array(train_probs) > 0.5).astype(int)
    )
    scheduler.step()

    # ---- VALIDATION ----
    model.eval()
    val_logits, val_labels, val_files = [], [], []

    with torch.no_grad():
        vbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{n_epochs} [Val]  ", leave=False)
        for images, labels, fnames in vbar:
            images = images.to(device)
            logits = model(images).squeeze()

            val_logits.extend(logits.cpu().numpy())
            val_labels.extend(labels.numpy())
            val_files.extend(fnames)

    y_true = np.array(val_labels)
    y_prob = torch.sigmoid(torch.tensor(val_logits)).numpy()
    y_pred = (y_prob > 0.5).astype(int)

    val_losses[epoch]   = criterion(
        torch.tensor(val_logits),
        torch.tensor(val_labels)
    ).item()
    val_aurocs[epoch]   = roc_auc_score(y_true, y_prob)
    val_bal_accs[epoch] = balanced_accuracy_score(y_true, y_pred)
    val_f1s[epoch]      = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )[2]

    # Print epoch summary + classification report
    print(f"Epoch {epoch+1:2d}: "
          f"Train Loss={train_losses[epoch]:.4f}, "
          f"Train BalAcc={train_bal_accs[epoch]:.4f} | "
          f"Val Loss={val_losses[epoch]:.4f}, "
          f"AUROC={val_aurocs[epoch]:.4f}, "
          f"F1={val_f1s[epoch]:.4f}, "
          f"Val BalAcc={val_bal_accs[epoch]:.4f}")
    print(classification_report(y_true, y_pred, target_names=["Benign","Malignant"]))

    # --- 6) Save predictions to CSV for later review ---------
    df_preds = pd.DataFrame({
        "filename": val_files,
        "true":     y_true,
        "pred":     y_pred,
        "prob":     y_prob
    })
    df_preds.to_csv(f"val_preds_epoch_{epoch+1}.csv", index=False)
    df_preds[df_preds["true"] != df_preds["pred"]] \
        .to_csv(f"wrong_preds_epoch_{epoch+1}.csv", index=False)

# --- 7) Final plots -------------------------------------------

# Loss curves
plt.figure(figsize=(8,3))
plt.plot(train_losses,   label="Train Loss")
plt.plot(val_losses,     label="Val Loss")
plt.title("Loss Over Epochs")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

# Accuracy & AUROC
plt.figure(figsize=(8,3))
plt.plot(train_bal_accs, label="Train Balanced Acc")
plt.plot(val_bal_accs,   label="Val Balanced Acc")
plt.plot(val_aurocs,     label="Val AUROC")
plt.title("Balanced Acc & AUROC Over Epochs")
plt.xlabel("Epoch")
plt.ylim(0,1)
plt.legend()
plt.grid(True)
plt.show()


Epoch 1/3 [Train]:  72%|███████▏  | 184/255 [17:10<05:42,  4.83s/it, loss=0.00338, lr=0.0003]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import os
import random

def visualize_predictions(csv_file, image_dir, n=5):
    """
    Displays n correctly classified and n misclassified images side by side.

    Args:
        csv_file (str): Path to the CSV with columns ['filename', 'true', 'pred', 'prob'].
        image_dir (str): Directory containing the image files.
        n (int): Number of examples from each group to display.
    """
    # Load predictions
    df = pd.read_csv(csv_file)

    # Split correct and wrong
    correct = df[df['true'] == df['pred']]
    wrong   = df[df['true'] != df['pred']]

    # Sample up to n examples
    n_corr = min(n, len(correct))
    n_wrong = min(n, len(wrong))
    sample_correct = correct.sample(n_corr, random_state=42)
    sample_wrong = wrong.sample(n_wrong, random_state=42)

    # Create grid: 2 rows (correct, wrong), n columns
    fig, axes = plt.subplots(2, n, figsize=(n * 3, 6))

    # Plot correct predictions
    for i, row in enumerate(sample_correct.itertuples()):
        img = Image.open(os.path.join(image_dir, row.filename)).convert('RGB')
        axes[0, i].imshow(img)
        axes[0, i].set_title(f"True={row.true}")
        axes[0, i].axis('off')

    # Plot wrong predictions
    for i, row in enumerate(sample_wrong.itertuples()):
        img = Image.open(os.path.join(image_dir, row.filename)).convert('RGB')
        axes[1, i].imshow(img)
        axes[1, i].set_title(f"T={row.true}, P={row.pred}")
        axes[1, i].axis('off')

    # Label rows
    axes[0, 0].set_ylabel('Correct', size=14)
    axes[1, 0].set_ylabel('Wrong', size=14)

    plt.tight_layout()
    plt.show()

# Example usage:
visualize_predictions('val_preds_epoch_3.csv', PATH, n=5)


In [None]:
df = pd.read_csv('val_preds_epoch_3.csv')
correct_pred_df = df[df['true'] == df['pred']]
wrong_pred_df = df[df['true'] != df['pred']]
cancer_pred_df = df[df['pred'] == 1]
noncancer_pred_df = df[df['pred'] == 0]
print(f'Number of correct predictions: {len(correct_pred_df)}')
print(f'Number of wrong predictions: {len(wrong_pred_df)}')
print(f'Number of cancer predictions: {len(cancer_pred_df)}')
print(f'Number of non cancer predictions: {len(noncancer_pred_df)}')

In [None]:
# Choose an example index
index = 6  # change this to test a different image
test_dataset = SkinCancerDataset(test_df, PATH, transform=train_transform)
# Get the image tensor and true label from your dataset
image_tensor, true_label = train_dataset[index]

# Prepare the input batch and move to device
input_tensor = image_tensor.unsqueeze(0).to(device)

# Run inference
model.eval()
with torch.inference_mode():
    logit = model(input_tensor).squeeze()
    prob = torch.sigmoid(logit).item()         # probability of class “1” (malignant)
    pred = 1 if prob > 0.5 else 0              # threshold at 0.5

# Map numeric labels to strings
label_map = {0: "Benign", 1: "Malignant"}

# Print results
print(f"True label:      {label_map[int(true_label.item())]}")
print(f"Predicted label: {label_map[pred]}  (probability = {prob:.4f})")


In [None]:
print("Val split counts:\n", val_df["label"].value_counts())


In [None]:
try:
  import torchinfo
except:
  !pip install torchinfo
  import torchinfo

In [None]:
torchinfo.summary(model=model,
        input_size=(16, 3, 224, 224),
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
)