In [1]:
MODEL_VERSION = "001"

In [2]:
import torch
torch.set_float32_matmul_precision('medium')
from torchvision import transforms, models
from torchvision.transforms import Resize, ToTensor, Normalize

import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint

from code.classifier import PneumoniaClassifier
from code.dataloader import PneumoniaDataset
from code.custom_checkpoint import CustomModelCheckpoint
from code.project_globals import TEST_DIR, TRAIN_DIR, VAL_DIR
import pathlib


In [3]:
#transform = transforms.Compose([transforms.Resize((60,60)),transforms.ToTensor()])
# Define transformations for the dataset
transform = transforms.Compose([
    Resize((224, 224)),
    ToTensor(),
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
#print(TRAIN_DIR.as_posix())
# Create dataset instances with proper arguments
train = PneumoniaDataset(root_dir=TRAIN_DIR.as_posix(), transform=transform)
test = PneumoniaDataset(root_dir=TEST_DIR.as_posix(), transform=transform)
val = PneumoniaDataset(root_dir=VAL_DIR.as_posix(), transform=transform)

train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val, batch_size=32, shuffle=False)

# TensorBoard logger setup
logger = TensorBoardLogger("tb_logs", name="pneumonia_classifier")

checkpoint_callback = CustomModelCheckpoint(
    monitor='val_loss',         # Metric to monitor
    dirpath='../checkpoints',    # Directory to save the checkpoints
    filename='best_model-{epoch:02d}-{val_loss:.2f}',  # Save format
    save_top_k=3,               # Only keep the 3 best models
    mode='min'                  # Minimize val_loss
)

# Model setup
model = PneumoniaClassifier(
    backbone=models.resnet50(weights='ResNet50_Weights.DEFAULT'),
    transfer_learning=True
)

# Trainer with TensorBoard logger
trainer = pl.Trainer(
    max_epochs=10,
    accelerator="gpu",
    devices=1,
    logger=logger,  # Attach TensorBoard logger
    log_every_n_steps=1,  # Log metrics after every step
    callbacks=[checkpoint_callback]
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [4]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Nov 15 18:02:18 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.90                 Driver Version: 565.90         CUDA Version: 12.7     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti   WDDM  |   00000000:02:00.0  On |                  N/A |
|  0%   41C    P8             11W /  285W |    3662MiB /  12282MiB |     10%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
# Train the model
trainer.fit(model, train_loader, val_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type           | Params | Mode 
-------------------------------------------------------------
0 | accuracy          | BinaryAccuracy | 0      | train
1 | feature_extractor | Sequential     | 23.5 M | train
2 | classifier        | Linear         | 4.1 K  | train
-------------------------------------------------------------
4.1 K     Trainable params
23.5 M    Non-trainable params
23.5 M    Total params
94.049    Total estimated model params size (MB)
152       Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

C:\Users\aszab\miniconda3\envs\pneumonia_detection\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


                                                                           

C:\Users\aszab\miniconda3\envs\pneumonia_detection\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 163/163 [02:30<00:00,  1.08it/s, v_num=20]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 20.41it/s][A
Epoch 0: 100%|██████████| 163/163 [02:31<00:00,  1.08it/s, v_num=20, val_acc_epoch=0.914]Learning Rate after epoch 0: 0.001




Epoch 1: 100%|██████████| 163/163 [02:35<00:00,  1.05it/s, v_num=20, val_acc_epoch=0.914, train_acc_epoch=0.000]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 25.64it/s][A
Epoch 1: 100%|██████████| 163/163 [02:36<00:00,  1.04it/s, v_num=20, val_acc_epoch=0.957, train_acc_epoch=0.000]Learning Rate after epoch 1: 0.001
Epoch 2: 100%|██████████| 163/163 [02:36<00:00,  1.04it/s, v_num=20, val_acc_epoch=0.957, train_acc_epoch=0.000]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/1 [00:00<?, ?it/s][A
Validation DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 17.24it/s][A
Epoch 2: 100%|██████████| 163/163 [02:36<00:00,  1.04it/s, v_num=20, val_acc_epoch=0.963, train_acc_epoch=0.000]Learning Rate after epoch 2: 0.

`Trainer.fit` stopped: `max_epochs=10` reached.


Learning Rate after epoch 9: 0.00025
Epoch 9: 100%|██████████| 163/163 [02:36<00:00,  1.04it/s, v_num=20, val_acc_epoch=0.977, train_acc_epoch=0.000]


In [7]:
checkpoint_path = '../checkpoints/best_model-epoch=07-val_loss=0.12.ckpt'
checkpoint = torch.load(checkpoint_path)

# Access the metadata
metadata = checkpoint.get("metadata", {})
print(f"Checkpoint metadata: {metadata}")

# Restore the model state
model.load_state_dict(checkpoint["state_dict"])

  checkpoint = torch.load(checkpoint_path)


FileNotFoundError: [Errno 2] No such file or directory: '../checkpoints/best_model-epoch_10-val_loss_0.25.ckpt'