In [1]:
# Snapshot the GPUs visible on this node (model, memory use, utilisation) before training starts.
!nvidia-smi

Sun Apr 13 18:42:27 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla P100-PCIE-12GB           On  | 00000000:03:00.0 Off |                    0 |
| N/A   34C    P0              25W / 250W |      0MiB / 12288MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE-12GB           On  | 00000000:04:00.0 Off |  

In [2]:
import os
# Set before numpy/torch are imported below, so the value is already in the
# environment when those libraries load.
# NOTE(review): presumably a workaround for the MKL vs. GNU OpenMP
# threading-layer conflict — confirm it is still required on this cluster.
os.environ['MKL_THREADING_LAYER'] = 'GNU'

import numpy as np  # NOTE(review): `np` is not referenced in the visible cells

import time
import torch
from ultralytics import YOLO

In [3]:
# Ask PyTorch how many CUDA devices this process can see and report the count.
num_gpus = torch.cuda.device_count()
print("Number of GPUs available: {}".format(num_gpus))

# The notebook's warning threshold is four GPUs; this only warns, it does not
# stop execution or change any later setting.
if not num_gpus >= 4:
    print("Warning: Fewer than 4 GPUs are available. Adjust 'device' parameter accordingly.")


Number of GPUs available: 4


In [4]:
# Build the YOLOv8-nano detector from its architecture definition file.
# NOTE(review): a .yaml (rather than a .pt checkpoint) is passed here, which per
# the Ultralytics docs builds a new, untrained model — confirm that training
# from scratch is intended.
model = YOLO("yolov8n.yaml")

# Dataset description (train/val locations, class names) consumed by
# `model.train`. The path can be overridden with the PNEUMONIA_DATA_CONFIG
# environment variable so the notebook is not tied to one user's home
# directory; when the variable is unset the original path is used unchanged.
# NOTE(review): the recorded run's log shows the `val` split scanning
# labels/train.cache (26,680 images, same as train), so the YAML's `val:` entry
# appears to point at the training set — verify, since the reported mAP would
# then be measured on training data.
data_config_path = os.environ.get(
    "PNEUMONIA_DATA_CONFIG", "/home/temgar.s/pneumonia_yolo.yaml"
)
print(f"Using data configuration from: {data_config_path}")

Using data configuration from: /home/temgar.s/pneumonia_yolo.yaml


In [5]:
# Hyperparameters handed to `model.train` in the training cell.
# NOTE(review): the recorded run's settings show close_mosaic=10; with only 10
# epochs the log prints "Closing dataloader mosaic" before epoch 1, so mosaic
# augmentation appears to be off for the whole run — confirm this is intended.
epochs = 10
# Square input resolution in pixels (the log reports "Image sizes 256 train, 256 val").
imgsz = 256
# NOTE(review): Ultralytics appears to treat this as the total batch across all
# devices (the recorded run shows 1668 iterations for 26,680 images), which
# leaves ~4 images per GPU per step — confirm, and consider scaling it with the
# device count when measuring parallel efficiency.
batch_size = 16
# Comma-separated CUDA device indices to train on.
device = "0,1,2,3"

# Echo the configuration so it is recorded alongside the training log.
print(
    "Starting training with parameters:",
    f"  Epochs: {epochs}",
    f"  Image Size: {imgsz}",
    f"  Batch Size: {batch_size}",
    f"  Device(s): {device}",
    sep="\n",
)

Starting training with parameters:
  Epochs: 10
  Image Size: 256
  Batch Size: 16
  Device(s): 0,1,2,3


In [6]:
# Measure the wall-clock duration of the whole `model.train` call.
# `time.perf_counter()` is used instead of `time.time()` because it is
# monotonic: the measured interval cannot be distorted by system clock
# adjustments during a run that lasts over an hour. `start_time`/`end_time`
# are therefore only meaningful as a difference, not as timestamps.
#
# Note that this interval covers everything `train` does. In the recorded run
# that includes a validation pass after every epoch (~3 min of each ~8 min
# epoch) and a final validation of best.pt, not just the training iterations.
start_time = time.perf_counter()

try:
    results = model.train(
        data=data_config_path,
        epochs=epochs,
        imgsz=imgsz,
        batch=batch_size,
        device=device,
    )
except Exception as e:
    # Print the message next to the cell output, then re-raise so the full
    # traceback is kept and the timing code below never runs for a failed
    # training call.
    print("An error occurred during training:")
    print(e)
    raise

end_time = time.perf_counter()
total_training_time = end_time - start_time
print(f"\nTotal Training Time: {total_training_time:.2f} seconds")

Ultralytics 8.3.107 🚀 Python-3.12.4 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:1 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:2 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:3 (Tesla P100-PCIE-12GB, 12194MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8n.yaml, data=/home/temgar.s/pneumonia_yolo.yaml, epochs=10, time=None, patience=100, batch=16, imgsz=256, save=True, save_period=-1, cache=False, device=0,1,2,3, workers=8, project=None, name=train6, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, conf=None, iou=0.7, max_det=300

[34m[1mtrain: [0mScanning /home/temgar.s/yolo_dataset/labels/train.cache... 26680 images, 20670 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 26680/26680 [00:00<?, ?it/s]
[34m[1mval: [0mScanning /home/temgar.s/yolo_dataset/labels/train.cache... 26680 images, 20670 backgrounds, 0 corrupt: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 26680/26680 [00:00<?, ?it/s]


Plotting labels to runs/detect/train6/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0005), 63 bias(decay=0.0)
Image sizes 256 train, 256 val
Using 32 dataloader workers
Logging results to [1mruns/detect/train6[0m
Starting training for 10 epochs...
Closing dataloader mosaic





      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10      0.35G      2.057      3.103      1.719          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [05:07<00:00,  5.42it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [03:02<00:00, 18.28it/s]


                   all      26680       9541    0.00727       0.82     0.0475     0.0142

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10     0.422G      1.646      2.172      1.344          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:49<00:00,  5.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:58<00:00, 18.65it/s]


                   all      26680       9541     0.0153      0.495     0.0386     0.0111

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10     0.422G      1.582      2.061      1.259          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:47<00:00,  5.80it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [03:01<00:00, 18.37it/s]


                   all      26680       9541      0.119      0.534      0.186      0.064

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10     0.422G      1.522      2.008      1.223          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:47<00:00,  5.79it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [03:01<00:00, 18.42it/s]


                   all      26680       9541      0.314      0.276      0.211     0.0746

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10     0.422G      1.501      1.974      1.198          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:46<00:00,  5.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [03:02<00:00, 18.32it/s]


                   all      26680       9541      0.246       0.33      0.204     0.0779

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/10     0.422G       1.49      1.961      1.182          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:46<00:00,  5.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:59<00:00, 18.59it/s]


                   all      26680       9541      0.295      0.289      0.212     0.0829

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10     0.422G      1.399      1.831       1.09          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:46<00:00,  5.83it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:59<00:00, 18.55it/s]


                   all      26680       9541       0.27      0.283      0.193     0.0752

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10     0.422G      1.352      1.839      1.068          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:46<00:00,  5.82it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:59<00:00, 18.54it/s]


                   all      26680       9541      0.366      0.236      0.227     0.0887

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10     0.422G        1.4      1.825      1.091          1        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:45<00:00,  5.84it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:58<00:00, 18.68it/s]


                   all      26680       9541      0.427      0.254      0.268      0.105

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10     0.422G      1.355      1.796      1.078          0        256: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1668/1668 [04:44<00:00,  5.86it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:58<00:00, 18.67it/s]


                   all      26680       9541      0.379      0.273      0.261      0.106

10 epochs completed in 1.307 hours.
Optimizer stripped from runs/detect/train6/weights/last.pt, 6.2MB
Optimizer stripped from runs/detect/train6/weights/best.pt, 6.2MB

Validating runs/detect/train6/weights/best.pt...
Ultralytics 8.3.107 🚀 Python-3.12.4 torch-2.6.0+cu124 CUDA:0 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:1 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:2 (Tesla P100-PCIE-12GB, 12194MiB)
                                                       CUDA:3 (Tesla P100-PCIE-12GB, 12194MiB)
YOLOv8n summary (fused): 72 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3335/3335 [02:46<00:00, 20.03it/s]


                   all      26680       9541      0.427      0.254      0.268      0.105
Speed: 0.1ms preprocess, 2.2ms inference, 0.0ms loss, 1.4ms postprocess per image
Results saved to [1mruns/detect/train6[0m

Total Training Time: 5084.88 seconds


In [7]:
# Speedup and parallel efficiency of the multi-GPU run relative to a
# single-GPU baseline:
#   speedup    = T_baseline / T_parallel
#   efficiency = speedup / number_of_devices
#
# FIXME: 500.0 is a placeholder, not a measurement. Replace it with the
# wall-clock time of the same training cell run with a single device (same
# epochs, imgsz, batch and dataset); until then the printed numbers are not
# meaningful.
baseline_time = 500.0  # example value in seconds

# Divide by the number of devices actually requested for the run (parsed from
# the comma-separated `device` string) rather than a hard-coded 4, so the
# efficiency stays correct if `device` is changed. Falls back to 1 when no
# index is listed, to avoid dividing by zero.
n_devices = max(1, len([d for d in str(device).split(",") if d.strip()]))

speedup = baseline_time / total_training_time
efficiency = speedup / n_devices
print(f"Speedup: {speedup:.2f}")
print(f"Parallel Efficiency: {efficiency:.2f}")

Speedup: 0.10
Parallel Efficiency: 0.02
