In [None]:

import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from torch.nn.parallel import DistributedDataParallel as DDP
from torchvision import transforms
from PIL import Image
import torch.multiprocessing as mp
import time
import sys
import matplotlib.pyplot as plt


class PneumoniaDataset(Dataset):
    def __init__(self, Image_dir, Mask_dir, transform=None):
        self.image_paths = sorted([os.path.join(Image_dir, f) for f in os.listdir(Image_dir)
                                   if f.endswith(('.jpeg', '.jpg', '.png'))])
        self.mask_paths = sorted([os.path.join(Mask_dir, f) for f in os.listdir(Mask_dir)
                                  if f.endswith(('.jpeg', '.jpg', '.png'))])
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx]).convert('L')
        mask = Image.open(self.mask_paths[idx]).convert('L')

        label = 1 if mask.getextrema()[1] > 0 else 0  

        if self.transform:
            img = self.transform(img)

        return img, label


class CNNClassifier(nn.Module):
    def __init__(self):
        super(CNNClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, 3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.dropout1 = nn.Dropout2d(0.25)

        self._to_linear = None
        self._get_flattened_size()
        self.fc1 = nn.Linear(self._to_linear, 128)
        self.fc2 = nn.Linear(128, 2)

    def _get_flattened_size(self):
        with torch.no_grad():
            x = torch.zeros(1, 1, 256, 256)
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = self.dropout1(x)
            self._to_linear = x.view(1, -1).shape[1]

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.dropout1(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


def train_model(rank, world_size, args):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"

    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    torch.manual_seed(0)

    transform = transforms.Compose([
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ])

    dataset = PneumoniaDataset(args.image_dir, args.mask_dir, transform)

    sampler = torch.utils.data.distributed.DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=4, pin_memory=True)

    device = torch.device(f"cpu")  
    model = CNNClassifier().to(device)
    model = DDP(model, device_ids=None)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    
    epoch_train_losses = []
    epoch_accuracies = []

    for epoch in range(1, 6):
        model.train()
        sampler.set_epoch(epoch)
        running_loss = 0.0
        correct = 0
        total = 0
        start = time.time()
        
        for batch_idx, (data, target) in enumerate(dataloader):
            data = data.to(device)
            target = torch.tensor(target).to(device)

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(output, 1)
            correct += (predicted == target).sum().item()
            total += target.size(0)
            
            if batch_idx % 10 == 0:
                print(f"[Rank {rank}] Epoch {epoch} | Batch {batch_idx} | Loss: {loss.item():.4f}")


        accuracy = 100 * correct / total
        epoch_train_losses.append(running_loss / len(dataloader))
        epoch_accuracies.append(accuracy)

        end = time.time()
        print(f"⏱️ [Rank {rank}] Epoch {epoch} completed in {end - start:.2f} seconds")
        print(f"⏳ [Rank {rank}] Epoch {epoch} | Training Loss: {epoch_train_losses[-1]:.4f} | Accuracy: {accuracy:.2f}%")


    if rank == 0:
        torch.save(model.state_dict(), "cnn_ddp_model.pth")
        print("✅ Model saved as cnn_ddp_model.pth", flush=True)

    dist.destroy_process_group()

    if rank == 0:
        plot_graphs(epoch_train_losses, epoch_accuracies)


def plot_graphs(losses, accuracies):
    epochs = range(1, len(losses) + 1)

    
    plt.figure(figsize=(10, 5))
    plt.subplot(1, 2, 1)
    plt.plot(epochs, losses, label='Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Loss over Epochs')
    plt.legend()

    
    plt.subplot(1, 2, 2)
    plt.plot(epochs, accuracies, label='Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.title('Accuracy over Epochs')
    plt.legend()

    
    plt.tight_layout()
    plt.savefig('training_loss_accuracy_plots.png')
    plt.show()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--image-dir', type=str, required=True, help='Path to training images')
    parser.add_argument('--mask-dir', type=str, required=True, help='Path to training masks')
    parser.add_argument('--save-dir', type=str, default="/scratch/mohammed.moi/yolo_workspace", help='Where to save model')
    parser.add_argument('--world-size', type=int, default=4, help='Number of parallel processes')
    args = parser.parse_args()

    world_size = args.world_size
    mp.spawn(train_model, args=(world_size, args), nprocs=world_size, join=True)

if __name__ == "__main__":
    main()


Overwriting ddp_train_cpu.py


In [77]:
!python ddp_train_cpu.py \
  --image-dir "/scratch/mohammed.moi/yolo_workspace/pneumonia_dataset/Pneumonia Dataset/Training/Images" \
  --mask-dir "/scratch/mohammed.moi/yolo_workspace/pneumonia_dataset/Pneumonia Dataset/Training/Masks" \
  --save-dir "/scratch/mohammed.moi/yolo_workspace" \
  --world-size 4

  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
[Rank 3] Epoch 1 | Batch 0 | Loss: 0.7236
[Rank 2] Epoch 1 | Batch 0 | Loss: 0.7149
[Rank 1] Epoch 1 | Batch 0 | Loss: 0.7297
[Rank 0] Epoch 1 | Batch 0 | Loss: 0.7302
[Rank 3] Epoch 1 | Batch 10 | Loss: 0.7267
[Rank 0] Epoch 1 | Batch 10 | Loss: 0.7136
[Rank 2] Epoch 1 | Batch 10 | Loss: 0.7308
[Rank 1] Epoch 1 | Batch 10 | Loss: 0.7185
[Rank 0] Epoch 1 | Batch 20 | Loss: 0.5279
[Rank 1] Epoch 1 | Batch 20 | Loss: 0.5257
[Rank 2] Epoch 1 | Batch 20 | Loss: 0.5599
[Rank 3] Epoch 1 | Batch 20 | Loss: 0.4490
[Rank 1] Epoch 1 | Batch 30 | Loss: 0.5314
[Rank 2] Epoch 1 | Batch 30 | Loss: 0.5509
[Rank 3] Epoch 1 | Batch 30 | Loss: 0.4579
[Rank 0] Epoch 1 | Batch 30 | Loss: 0.5079
[Rank 2] Epoch 1 | Batch 40 | Loss: 0.5575
[Rank 3] Epoch 1 | Batch 40 | Loss: 0.5562
[Rank 0] Epoch 1 | Batch 40 | Loss: 0.5383
[Rank 1] Epoch 

In [83]:
!python ddp_train_cpu.py \
  --image-dir "/scratch/mohammed.moi/yolo_workspace/pneumonia_dataset/Pneumonia Dataset/Training/Images" \
  --mask-dir "/scratch/mohammed.moi/yolo_workspace/pneumonia_dataset/Pneumonia Dataset/Training/Masks" \
  --save-dir "/scratch/mohammed.moi/yolo_workspace" \
  --world-size 4

  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
  target = torch.tensor(target).to(device)
[Rank 1] Epoch 1 | Batch 0 | Loss: 0.7297
[Rank 0] Epoch 1 | Batch 0 | Loss: 0.7302
[Rank 2] Epoch 1 | Batch 0 | Loss: 0.7149
[Rank 3] Epoch 1 | Batch 0 | Loss: 0.7236
[Rank 0] Epoch 1 | Batch 10 | Loss: 0.7136
[Rank 3] Epoch 1 | Batch 10 | Loss: 0.7267
[Rank 1] Epoch 1 | Batch 10 | Loss: 0.7185
[Rank 2] Epoch 1 | Batch 10 | Loss: 0.7308
[Rank 3] Epoch 1 | Batch 20 | Loss: 0.4490
[Rank 0] Epoch 1 | Batch 20 | Loss: 0.5279
[Rank 2] Epoch 1 | Batch 20 | Loss: 0.5599
[Rank 1] Epoch 1 | Batch 20 | Loss: 0.5257
[Rank 3] Epoch 1 | Batch 30 | Loss: 0.4579
[Rank 1] Epoch 1 | Batch 30 | Loss: 0.5314
[Rank 0] Epoch 1 | Batch 30 | Loss: 0.5079
[Rank 2] Epoch 1 | Batch 30 | Loss: 0.5509
[Rank 2] Epoch 1 | Batch 40 | Loss: 0.5575
[Rank 0] Epoch 1 | Batch 40 | Loss: 0.5383
[Rank 3] Epoch 1 | Batch 40 | Loss: 0.5562
[Rank 1] Epoch 