In [1]:
import torch
# Report whether a CUDA device is usable. The device name is only queried when
# CUDA is available, because get_device_name(0) raises on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; running on CPU.")


True
NVIDIA GeForce RTX 3050 6GB Laptop GPU


In [2]:
import os

# Root of the project checkout. Set the SKIN_CANCER_PROJECT_DIR environment
# variable to run on another machine without editing this cell; the default is
# the original local path.
PROJECT_DIR = os.environ.get(
    "SKIN_CANCER_PROJECT_DIR",
    r"C:\Users\vamsi\Desktop\4-1 mini project\skin-cancer-detection",
)

# Build the dataset paths with os.path.join so the separator matches the host OS.
_ham_dir = os.path.join(PROJECT_DIR, "data", "ham10000")
csv_file = os.path.join(_ham_dir, "HAM10000_metadata.csv")  # metadata table read with pandas
img_dir  = os.path.join(_ham_dir, "images")                 # directory holding the image files

In [None]:
# Data pipeline setup: keep the imports and all of the code below together in a single cell.
import os, random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import GroupShuffleSplit
import albumentations as A
from albumentations.pytorch import ToTensorV2
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Seed every random number generator used in this notebook (PyTorch, NumPy,
# Python's random module) with the same value.
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)


# Load the metadata table referenced by csv_file.
df = pd.read_csv(csv_file)
# Show the available columns so the grouping choice below can be checked by eye.
print(df.columns.tolist())

# Choose the grouping key for the split. When no patient_id column exists,
# prefer lesion_id: the HAM10000 metadata ships a lesion_id column and one
# lesion may be photographed several times, so grouping by image_id alone
# (i.e. a plain random split) can place views of the same lesion in both the
# training and the validation set.
if 'patient_id' not in df.columns:
    fallback_col = 'lesion_id' if 'lesion_id' in df.columns else 'image_id'
    print(f"'patient_id' column not found; grouping by '{fallback_col}' instead.")
    df['patient_id'] = df[fallback_col]

# Integer class id for each diagnosis code in the dx column.
label_map = {'nv':0,'mel':1,'bkl':2,'bcc':3,'akiec':4,'vasc':5,'df':6}
df['label'] = df['dx'].map(label_map)
# .map() yields NaN for any code missing from label_map; fail fast in that case.
assert df['label'].isnull().sum()==0, "Unmapped dx found."

# Group-level split: every row sharing a group id lands on the same side.
# test_size is the fraction of groups (not rows) assigned to validation.
gss = GroupShuffleSplit(n_splits=1, test_size=0.10, random_state=seed)
train_idx, val_idx = next(gss.split(df, groups=df['patient_id']))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df   = df.iloc[val_idx].reset_index(drop=True)

# Guard against leakage: no group id may appear in both splits.
assert set(train_df['patient_id']).isdisjoint(val_df['patient_id']), \
    "Group leakage between train and validation splits."
print("train/val sizes:", len(train_df), len(val_df))

# Preprocessing constants shared by both pipelines (ImageNet mean/std).
IMG_SIZE = 224
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Training pipeline: random crop/flip/affine/colour augmentation, then
# normalisation and conversion to a tensor.
# NOTE(review): passing the crop size as a single tuple matches newer
# albumentations signatures; older releases expect separate height/width
# arguments - confirm against the installed version.
train_transform = A.Compose([
    A.RandomResizedCrop((IMG_SIZE, IMG_SIZE), scale=(0.8, 1.0), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.2),
    A.ShiftScaleRotate(shift_limit=0.0625, scale_limit=0.1, rotate_limit=20, p=0.5),
    A.ColorJitter(p=0.2),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2(),
])

# Validation pipeline: fixed resize only, with the same normalisation as training.
val_transform = A.Compose([
    A.Resize(IMG_SIZE, IMG_SIZE),
    A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ToTensorV2(),
])

class HAM10000Dataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from a metadata frame.

    Args:
        df: DataFrame with an ``image_id`` column (image file stem) and a
            ``label`` column (value convertible to ``int``). Rows are
            addressed by position.
        img_dir: Directory containing the ``<image_id>.jpg`` files.
        transform: Optional callable invoked as ``transform(image=array)``
            that returns a mapping with an ``'image'`` entry. When ``None``,
            the raw RGB array is returned.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image for row ``idx`` and return ``(image, label)``."""
        row = self.df.iloc[idx]
        img_path = os.path.join(self.img_dir, row['image_id'] + '.jpg')
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(img_path) as img:
            image = np.array(img.convert("RGB"))
        label = int(row['label'])
        # Compare against None explicitly so a transform object that happens
        # to be falsy is still applied.
        if self.transform is not None:
            image = self.transform(image=image)['image']
        return image, label

# Wrap each split in a dataset, pairing it with its own transform pipeline.
train_ds = HAM10000Dataset(train_df, img_dir, transform=train_transform)
val_ds   = HAM10000Dataset(val_df, img_dir, transform=val_transform)

# Weighted sampler to address class imbalance: each training sample gets a
# weight equal to the inverse of its class frequency.
train_labels = train_df['label'].to_numpy(dtype=np.int64)

# np.bincount indexes the counts by class id, so the weights stay aligned with
# the labels even if some class id has no samples in the training split
# (value_counts() would drop that class and shift every later position).
counts = np.bincount(train_labels)
class_weights = np.zeros(len(counts), dtype=np.float64)
present = counts > 0
class_weights[present] = 1.0 / counts[present]  # absent classes keep weight 0

sample_weights = class_weights[train_labels]
# Draw with replacement so rare classes can be repeated within one epoch.
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=len(sample_weights), replacement=True)

BATCH_SIZE = 32
# Pinned host memory only helps when batches are copied to a CUDA device;
# requesting it on a CPU-only machine just produces a warning.
use_pinned_memory = torch.cuda.is_available()

# The training loader takes its ordering from the weighted sampler, so no
# shuffle flag is passed; the validation loader iterates in fixed order.
# NOTE(review): num_workers=0 loads data in the main process - presumably
# chosen to avoid worker-process issues in notebooks; confirm before raising.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler, num_workers=0, pin_memory=use_pinned_memory)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=use_pinned_memory)

# Quick sanity check: pull a single batch from the training loader and inspect it.
batch = next(iter(train_loader))
images, labels = batch

# The image batch should be 4-D and carry exactly one label per image.
assert images.ndim == 4 and images.shape[0] == labels.shape[0], "Unexpected batch layout."

print('images.shape:', images.shape)   # [B, C, H, W]
print('labels.shape:', labels.shape)
print('example labels:', labels[:10])


In [None]:
# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)