In [1]:
import kagglehub

In [2]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision import models
from PIL import Image
import os

In [3]:
path = kagglehub.dataset_download("paultimothymooney/breast-histopathology-images")

print("Path to dataset files:", path)
print("Contents of downloaded directory: ", os.listdir(path))

Downloading from https://www.kaggle.com/api/v1/datasets/download/paultimothymooney/breast-histopathology-images?dataset_version_number=1...


100%|██████████| 3.10G/3.10G [00:41<00:00, 80.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1
Contents of downloaded directory:  ['10293', '12890', '9347', '16550', '12751', '15510', '9174', '8913', '13404', '12876', '12900', '12911', '16532', '10274', '13693', '13459', '16896', '14154', '15472', '9228', '9181', '9324', '10292', '12242', '10262', '12820', '10256', '14079', '9022', '12909', '10307', '12867', '16568', '14305', '12821', '13400', '9029', '12906', '8955', '13692', '8956', '12933', '9044', '15840', '16551', '9325', '12824', '12929', '9036', '10288', '8984', '12907', '12951', '9125', '12930', '13022', '12750', '9346', '9258', '12947', '9037', '8975', '13023', '10269', '9043', '12894', '15633', '14153', '9322', '16014', '13613', '12878', '10255', '10282', '9256', '15473', '13461', '9083', '14211', '12826', '10290', '8951', '9173', '16555', '8950', '12868', '14210', '13687', '9262', '9383', '12884', '10275', '12875', '9290', '14188', '9075', '16570', '9261', 

In [4]:
class HistopathologyDataset(Dataset):
  def __init__(self, image_dir, transform=None):
    self.transform = transform

    self.images = []
    self.labels = []

    for patient_id in os.listdir(image_dir):
      patient_path = os.path.join(image_dir, patient_id)
      if os.path.isdir(patient_path):
        for class_id in ["0", "1"]:
          class_path = os.path.join(patient_path, class_id)
          if os.path.exists(class_path):
            print(f"Processing {class_path}")
            for img_name in os.listdir(class_path):
              if img_name.endswith(".png"):
                self.images.append(os.path.join(class_path, img_name))
                self.labels.append(int(class_id))

    print("Found", len(self.images), "images")
    print(f"Class Distribution - Benign: {self.labels.count(0)}, Malignant: {self.labels.count(1)}")

  def __len__(self):
    return len(self.images)

  def __getitem__(self, idx):
    image_path = self.images[idx]
    image = Image.open(image_path).convert("RGB")
    label = self.labels[idx]

    if self.transform:
      image = self.transform(image)

    return image, label

In [5]:
def get_transforms():
  train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225])
  ])

  val_transforms = transforms.Compose([
      transforms.Resize((224, 224)),
      transforms.ToTensor(),
      transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
  ])

  return train_transforms, val_transforms

In [6]:
class HistopathologyResNet(nn.Module):
  def __init__(self, num_classes=2):
    super(HistopathologyResNet, self).__init__()
    self.resnet = models.resnet50(pretrained=True)

    for param in self.resnet.parameters():
      param.requires_grad = False

    num_ftrs = self.resnet.fc.in_features
    self.resnet.fc = nn.Sequential(
        nn.Linear(num_ftrs, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, num_classes)
    )

  def forward(self, x):
      return self.resnet(x)

In [7]:
from pdb import run
def train_epoch(model, train_loader, criterion, optimizer, device):
  model.train()
  running_loss = 0.0
  correct = 0
  total = 0

  for inputs, labels in train_loader:
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()

    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    _, predicted = outputs.max(1)
    total += labels.size(0)
    correct += predicted.eq(labels).sum().item()

  accuracy = 100. * correct / total
  return running_loss / len(train_loader), accuracy

def validate(model, val_loader, criterion, device):
  model.eval()
  running_loss = 0.0
  correct = 0
  total = 0

  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      outputs = model(inputs)
      loss = criterion(outputs, labels)

      running_loss += loss.item()
      _, predicted = outputs.max(1)
      total += labels.size(0)
      correct += predicted.eq(labels).sum().item()

  accuracy = 100. * correct / total
  return running_loss / len(val_loader), accuracy

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

train_transforms, val_transforms = get_transforms()

train_dataset = HistopathologyDataset(
    image_dir=path,
    transform=train_transforms
)

train_size = int(0.8 * len(train_dataset))
val_size = len(train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size])

Using device: cuda
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/10293/0
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/10293/1
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/12890/0
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/12890/1
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/9347/0
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/9347/1
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/16550/0
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/16550/1
Processing /root/.cache/kagglehub/datasets/paultimothymooney/breast-histopathology-images/versions/1/12751/0
Pr

In [9]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

model = HistopathologyResNet().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5, verbose=True
)

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 113MB/s]


In [None]:
num_epochs = 50
best_val_loss = float('inf')

for epoch in range(num_epochs):
  train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
  val_loss, val_acc = validate(model, val_loader, criterion, device)

  scheduler.step(val_loss)

  if val_loss < best_val_loss:
    best_val_loss = val_loss
    torch.save(model.state_dict(), "best_model.pth")

  print(f"Epoch [{epoch+1}/{num_epochs}]")
  print(f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%")
  print(f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%")

Epoch [1/50]
Train Loss: 0.3692, Train Acc: 84.05%
Val Loss: 0.3323, Val Acc: 85.63%
Epoch [2/50]
Train Loss: 0.3538, Train Acc: 84.83%
Val Loss: 0.3211, Val Acc: 86.08%
Epoch [3/50]
Train Loss: 0.3505, Train Acc: 84.99%
Val Loss: 0.3214, Val Acc: 86.24%
Epoch [4/50]
Train Loss: 0.3482, Train Acc: 85.10%
Val Loss: 0.3173, Val Acc: 85.94%
Epoch [5/50]
Train Loss: 0.3450, Train Acc: 85.32%
Val Loss: 0.3262, Val Acc: 86.31%
