In [1]:
import torch
import hydra
import pandas as pd
from tqdm import tqdm 
from transformers import ViTForImageClassification
from utils import get_dataloaders, set_all_seeds
from dataclasses import dataclass, field
from typing import Dict
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, cohen_kappa_score
import matplotlib.pyplot as plt
from dataclasses import dataclass, field
from typing import Dict
from IPython.core.display import HTML, display
from collections import defaultdict
import numpy as np

2025-03-26 01:51:49.097030: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-26 01:51:49.148473: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-26 01:51:49.148515: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-26 01:51:49.149858: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-26 01:51:49.158281: I tensorflow/core/platform/cpu_feature_guar

In [2]:
set_all_seeds(42)

In [3]:
@dataclass
class DataConfig:
    path: str
    batch_size: int
    num_workers: int
    ratio : int = 1
@dataclass
class Config:
    train: DataConfig
    val: DataConfig
    test: DataConfig

# Creating the config object from the provided data
config = Config(
    train=DataConfig(
        path='datasets/DDR/train',
        batch_size=64,
        num_workers=8
    ),
    val=DataConfig(
        path='datasets/DDR/valid',
        batch_size=64,
        num_workers=8
    ),
    test=DataConfig(
        path='datasets/DDR/test',
        batch_size=64,
        num_workers=8
    )
)

# Printing the config to verify
print(config)

Config(train=DataConfig(path='datasets/DDR/train', batch_size=64, num_workers=8, ratio=1), val=DataConfig(path='datasets/DDR/valid', batch_size=64, num_workers=8, ratio=1), test=DataConfig(path='datasets/DDR/test', batch_size=64, num_workers=8, ratio=1))


In [4]:
ds_names = ['APTOS', 'DDR', 'FGADR', 'IDRiD', 'Messidor']
loaders_lst = []
ds_names_lst = [] 

for ds in ds_names: 
    
    config.train.path = f'datasets/{ds}/train'
    config.val.path = f'datasets/{ds}/valid'
    config.test.path = f'datasets/{ds}/test'
    
    print(config)
    
    train_loader, valid_loader, test_loader = get_dataloaders(config, 'eval')
    
    loaders_lst.append(train_loader)
    loaders_lst.append(valid_loader)
    loaders_lst.append(test_loader)
    
    ds_names_lst.append(f'{ds} Train')
    ds_names_lst.append(f'{ds} Validation')
    ds_names_lst.append(f'{ds} Test')

Config(train=DataConfig(path='datasets/APTOS/train', batch_size=64, num_workers=8, ratio=1), val=DataConfig(path='datasets/APTOS/valid', batch_size=64, num_workers=8, ratio=1), test=DataConfig(path='datasets/APTOS/test', batch_size=64, num_workers=8, ratio=1))
Config(train=DataConfig(path='datasets/DDR/train', batch_size=64, num_workers=8, ratio=1), val=DataConfig(path='datasets/DDR/valid', batch_size=64, num_workers=8, ratio=1), test=DataConfig(path='datasets/DDR/test', batch_size=64, num_workers=8, ratio=1))
Config(train=DataConfig(path='datasets/FGADR/train', batch_size=64, num_workers=8, ratio=1), val=DataConfig(path='datasets/FGADR/valid', batch_size=64, num_workers=8, ratio=1), test=DataConfig(path='datasets/FGADR/test', batch_size=64, num_workers=8, ratio=1))
Config(train=DataConfig(path='datasets/IDRiD/train', batch_size=64, num_workers=8, ratio=1), val=DataConfig(path='datasets/IDRiD/valid', batch_size=64, num_workers=8, ratio=1), test=DataConfig(path='datasets/IDRiD/test', ba

In [5]:
!ls checkpoints/checkpoints_fntn_messidor_only_100_ep

model_epoch_88_kappa_0.421.pth


In [6]:
checkpoints_to_eval = [
    'checkpoints/checkpoints_DDR_tune/model_epoch_2_acc_0.815.pth', 
    'checkpoints/checkpoints_adv/model_epoch_793_acc_0.822.pth', 
    'checkpoints/checkpoints_fntn_messidor_only_100_ep/model_epoch_88_kappa_0.421.pth', 
]

model_names = [
 'DDR pretrained',  
 'Adversarial Finetune', 
 'Simple finetune'
]

In [None]:
metr_df_dct = defaultdict(dict)


for model_nm, ckpt_path in zip(model_names, checkpoints_to_eval):
    
    print(f'BENCHMARKING {model_nm}')
    
    model = ViTForImageClassification.from_pretrained('google/vit-large-patch16-224')
    model = model.to('cuda')
    model.load_state_dict(torch.load(ckpt_path)['model_state_dict'])
    model.eval()
    
    train_acc = []
    train_loss = []
    train_f1 = []
    train_kc = []


    valid_acc = []
    valid_loss = []
    valid_f1 = []
    valid_kc = []

    test_acc = []
    test_loss = []
    test_f1 = []
    test_kc = []
    
    for i , (loader, ds_name) in enumerate(zip(loaders_lst, ds_names_lst)): 
        with torch.inference_mode():

            acc = 0
            avg_loss = 0

            y_pred = []  
            y_true = []


            for batch_idx, batch in tqdm(enumerate(loader), total=len(loader)): 

                X, y = batch
                X = X.to(model.device)
                y = y.to(model.device)

                pred = model(pixel_values = X, labels = y)
                class_pred = pred.logits.argmax(axis = 1)
                acc += (class_pred == y).type(torch.float).mean()
                avg_loss += pred.loss.item()

                y_pred.extend([item.item() for item in class_pred])
                y_true.extend([item.item() for item in y])

            print(f'Results of model inference on {ds_name}')
            print(f'Loss : {avg_loss / len(loader):.3f}')
            print(f'Accuracy : {acc / len(loader):.3f}')   
            print(f"F1-macro : {f1_score(y_true, y_pred, average = 'macro')}")   
            print(f'Cohen-Kappa score : {cohen_kappa_score(y_true, y_pred)}')   
            print()
            print()

#            disp = ConfusionMatrixDisplay( confusion_matrix(y_true, y_pred) )
#            show and disp.plot()
#            show and plt.show()

            if (i % 3) == 0 :
                train_acc.append(acc / len(loader))
                train_loss.append(avg_loss / len(loader))
                train_f1.append(f1_score(y_true, y_pred, average = 'macro'))
                train_kc.append(cohen_kappa_score(y_true, y_pred))

            if (i % 3) == 1 :
                valid_acc.append(acc / len(loader))
                valid_loss.append(avg_loss / len(loader))  
                valid_f1.append(f1_score(y_true, y_pred, average = 'macro'))
                valid_kc.append(cohen_kappa_score(y_true, y_pred))

            if (i % 3) == 2 :
                test_acc.append(acc / len(loader))
                test_loss.append(avg_loss / len(loader)) 
                test_f1.append(f1_score(y_true, y_pred, average = 'macro'))
                test_kc.append(cohen_kappa_score(y_true, y_pred))

        
    metr_df_dct[model_nm]['kappa_df'] = pd.DataFrame(
        {
        'Dataset' : ds_names,
        'Train split' : [ np.round(item.item(), 3) for item in train_kc],
        'Val split' : [ np.round(item.item(), 3) for item in valid_kc],
        'Test split' : [ np.round(item.item(), 3) for item in test_kc]
        }
        )
    
    metr_df_dct[model_nm]['f1_df']  = pd.DataFrame(
        {
        'Dataset' : ds_names,
        'Train split' : [ np.round(item.item(), 3) for item in train_f1],
        'Val split' : [ np.round(item.item(), 3) for item in valid_f1],
        'Test split' : [ np.round(item.item(), 3) for item in test_f1]
        }
        )
    
    metr_df_dct[model_nm]['acc_df'] = pd.DataFrame(
        {
        'Dataset' : ds_names,
        'Train split' : [ np.round(item.item(), 3) for item in train_acc],
        'Val split' : [ np.round(item.item(), 3) for item in valid_acc],
        'Test split' : [ np.round(item.item(), 3) for item in test_acc]
        }
        )
    
    metr_df_dct[model_nm]['loss_df'] = pd.DataFrame(
        {
        'Dataset' : ds_names,
        'Train split' : [ np.round(item, 3) for item in train_loss],
        'Val split' : [ np.round(item, 3) for item in valid_loss],
        'Test split' : [ np.round(item, 3) for item in test_loss]
        }
        )

In [None]:
model_names = [
 'DDR pretrained',  
 'Adversarial Finetune', 
 'Simple finetune'
]

In [8]:
HTML(metr_df_dct['DDR pretrained']['kappa_df'].to_html(index=False))

Dataset,Train split,Val split,Test split
APTOS,0.386,0.376,0.0
DDR,0.548,0.696,0.467
FGADR,0.074,0.083,0.024
IDRiD,0.35,0.287,0.146
Messidor,0.165,0.179,0.137


In [20]:
metr_df_dct['DDR pretrained']['loss_df'].to_markdown()

'|    | Dataset   |   Train split |   Val split |   Test split |\n|---:|:----------|--------------:|------------:|-------------:|\n|  0 | APTOS     |         1.095 |       1.09  |        3.52  |\n|  1 | DDR       |         0.71  |       0.615 |        0.812 |\n|  2 | FGADR     |         1.636 |       1.581 |        1.704 |\n|  3 | IDRiD     |         1.093 |       1.083 |        1.505 |\n|  4 | Messidor  |         1.203 |       1.188 |        1.152 |'

In [9]:
HTML(metr_df_dct['Adversarial Finetune']['kappa_df'].to_html(index=False))

Dataset,Train split,Val split,Test split
APTOS,0.443,0.429,0.0
DDR,0.692,0.717,0.478
FGADR,0.187,0.211,0.157
IDRiD,0.387,0.397,0.16
Messidor,0.344,0.296,0.313


In [10]:
HTML(metr_df_dct['Simple finetune']['kappa_df'].to_html(index=False))

Dataset,Train split,Val split,Test split
APTOS,0.335,0.294,0.0
DDR,0.259,0.349,0.235
FGADR,0.151,0.159,0.076
IDRiD,0.355,0.254,0.234
Messidor,0.562,0.421,0.358


In [21]:
metr_df_dct['Simple finetune']['loss_df'].to_markdown()

'|    | Dataset   |   Train split |   Val split |   Test split |\n|---:|:----------|--------------:|------------:|-------------:|\n|  0 | APTOS     |         1.849 |       1.857 |        3.711 |\n|  1 | DDR       |         1.854 |       1.683 |        2.067 |\n|  2 | FGADR     |         2.836 |       2.815 |        3.069 |\n|  3 | IDRiD     |         1.996 |       2.022 |        2.408 |\n|  4 | Messidor  |         1.298 |       1.724 |        1.739 |'

In [22]:
metr_df_dct['Adversarial Finetune']['loss_df'].to_markdown()

'|    | Dataset   |   Train split |   Val split |   Test split |\n|---:|:----------|--------------:|------------:|-------------:|\n|  0 | APTOS     |         1.207 |       1.189 |        4.924 |\n|  1 | DDR       |         0.549 |       0.569 |        0.855 |\n|  2 | FGADR     |         1.559 |       1.516 |        1.67  |\n|  3 | IDRiD     |         1.054 |       1.101 |        1.653 |\n|  4 | Messidor  |         0.964 |       1.054 |        0.93  |'