In [1]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torchvision.models import resnet18, ResNet18_Weights
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import sys

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# model_checkpoint = "google/vit-base-patch16-224-in21k"
model_checkpoint = "google/vit-base-patch16-224-in21k"

In [3]:
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [4]:
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
train_transforms = Compose(
    [
        RandomResizedCrop(image_processor.size["height"]),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)

val_transforms = Compose(
    [
        Resize(image_processor.size["height"]),
        CenterCrop(image_processor.size["height"]),
        ToTensor(),
        normalize,
    ]
)

In [5]:
# Define Custom Dataset
class CustomDataset(Dataset):
    def __init__(self, root_dir, sub_folder, transform=None, data_type='original'):
        self.root_dir = root_dir
        self.sub_folder = sub_folder
        self.transform = transform
        self.data_type = data_type
        self.image_paths = []
        self.labels = []

        if self.data_type == 'original':
            self.load_original_data()
        elif self.data_type == 'augmentation':
            self.load_augmented_data()

    def load_original_data(self):
        label_file = os.path.join(self.root_dir, 'shortaxis_binary v2.xlsx')
        for video_folder in os.listdir(os.path.join(self.root_dir, self.sub_folder)):
            video_path = os.path.join(self.root_dir, self.sub_folder, video_folder)
            if os.path.isdir(video_path):
                try:
                    labels_df = pd.read_excel(label_file, sheet_name=video_folder)
                except ValueError:
                    continue

                for img_filename in os.listdir(video_path):
                    if img_filename.endswith(".jpg"):
                        img_path = os.path.join(video_path, img_filename)
                        frame_idx = int(os.path.splitext(img_filename)[0].split('_')[-1])
                        labels = labels_df.loc[frame_idx, ['BAD QUALITY', 'CORD', 'FLUID']].values.astype('float32')
                        self.image_paths.append(img_path)
                        self.labels.append(labels)

    def load_augmented_data(self):
        for video_folder in os.listdir(os.path.join(self.root_dir, self.sub_folder)):
            video_path = os.path.join(self.root_dir, self.sub_folder, video_folder)
            if os.path.isdir(video_path):
                label_file = os.path.join(self.root_dir, 'Label', f'{video_folder}.xlsx')
                labels_df = pd.read_excel(label_file)

                for img_filename in os.listdir(video_path):
                    if img_filename.endswith(".jpg"):
                        img_path = os.path.join(video_path, img_filename)
                        labels = labels_df.loc[labels_df['FILENAME'] == img_filename,
                                               ['BAD QUALITY', 'CORD', 'FLUID']].values.astype('float32')
                        self.image_paths.append(img_path)
                        self.labels.append(labels)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')
        labels = self.labels[idx]

        if self.transform:
            image = self.transform(image)

        return image, labels


from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

def initialize_data(root_dir, sub_folder, data_type='original'):
    # Define separate transforms for training and validation
    # train_transform = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.RandomHorizontalFlip(),
    #     transforms.ToTensor(),
    # ])

    # val_transform = transforms.Compose([
    #     transforms.Resize((224, 224)),
    #     transforms.ToTensor(),
    # ])

    # Load the full dataset
    full_dataset = CustomDataset(root_dir, sub_folder=sub_folder, data_type=data_type)

    # Use sklearn's train_test_split to split into train and temp (validation + test)
    indices = list(range(len(full_dataset)))
    train_indices, temp_indices = train_test_split(indices, test_size=0.3, random_state=42)

    # Further split temp into validation and test sets
    val_indices, test_indices = train_test_split(temp_indices, test_size=0.5, random_state=42)

    # Create subsets for train, val, and test
    train_dataset = Subset(full_dataset, train_indices)
    val_dataset = Subset(full_dataset, val_indices)
    test_dataset = Subset(full_dataset, test_indices)

    # Apply the appropriate transforms to each subset
    train_dataset.dataset.transform = train_transforms
    val_dataset.dataset.transform = val_transforms
    test_dataset.dataset.transform = val_transforms  # Optionally reuse val_transform for test set

    print(f'Train Size: {len(train_dataset)}, Val Size: {len(val_dataset)}, Test Size: {len(test_dataset)}')

    return train_dataset, val_dataset, test_dataset

In [6]:
# Initialize Data
root_dir = '/content/drive/MyDrive'    # CHANGE BASED ON FOLDER LOCATION
sub_folder = 'short axis frames'
num_classes=3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
org_train_data, org_val_data, org_test_data = initialize_data(root_dir,
                                                                    sub_folder,
                                                                    data_type='original')

Train Size: 2373, Val Size: 508, Test Size: 509


In [7]:
device

device(type='cuda')

In [8]:
# Assuming your labels are: ['BAD QUALITY', 'CORD', 'FLUID']
labels = ['BAD QUALITY', 'CORD', 'FLUID']

# Creating label-to-ID and ID-to-label mappings
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for idx, label in enumerate(labels)}

# Checking the value for ID 2
print(id2label[2])  # Output: 'FLUID'

FLUID


In [9]:
!pip install transformers accelerate evaluate datasets peft -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/472.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/320.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [11]:
import transformers
import accelerate
import peft

print(f"Transformers version: {transformers.__version__}")
print(f"Accelerate version: {accelerate.__version__}")
print(f"PEFT version: {peft.__version__}")

Transformers version: 4.44.2
Accelerate version: 0.34.2
PEFT version: 0.13.2


In [12]:
model_checkpoint = "google/vit-base-patch16-224-in21k"

In [13]:
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


In [14]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [15]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
)

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
print_trainable_parameters(model)

trainable params: 85800963 || all params: 85800963 || trainable%: 100.00


In [17]:
for name, module in model.named_modules():
    print(name, ":", module)

 : ViTForImageClassification(
  (vit): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in

In [18]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.1,
    bias="none",
    modules_to_save=["classifier"],
)
lora_model = get_peft_model(model, config)
print_trainable_parameters(lora_model)

trainable params: 2656515 || all params: 88457478 || trainable%: 3.00


In [19]:
# import numpy as np
# from sklearn.metrics import accuracy_score, f1_score

# def compute_metrics(eval_pred):
#     predictions, references = eval_pred

#     # Convert multi-label references (e.g., [1, 0, 0]) to class indices (e.g., 0)
#     references = np.argmax(references, axis=1)

#     # If predictions are probabilities, convert them to class indices
#     if predictions.ndim > 1:
#         predictions = np.argmax(predictions, axis=1)

#     # Compute accuracy and F1-score
#     accuracy = accuracy_score(references, predictions)
#     f1 = f1_score(references, predictions, average='weighted')

#     return {
#         "accuracy": accuracy,
#         "f1": f1,
#     }

from sklearn.metrics import accuracy_score, f1_score, hamming_loss, jaccard_score

def compute_metrics(eval_pred, threshold=0.5):
    predictions, references = eval_pred

    # Apply sigmoid if necessary
    predictions = 1 / (1 + np.exp(-predictions)) if predictions.ndim > 1 else predictions

    # Binarize predictions
    predictions = (predictions >= threshold).astype(int)
    references = (references >= 0.5).astype(int)  # Assuming references are probabilities

    # Compute metrics
    accuracy = accuracy_score(references, predictions)
    f1_macro = f1_score(references, predictions, average='macro')
    f1_micro = f1_score(references, predictions, average='micro')
    f1_samples = f1_score(references, predictions, average='samples')
    hamming = hamming_loss(references, predictions)
    jaccard = jaccard_score(references, predictions, average='macro')

    return {
        "accuracy": accuracy,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "f1_samples": f1_samples,
        "hamming_loss": hamming,
        "jaccard": jaccard,
    }

In [20]:
# import torch


# def collate_fn(examples):
#     # Unpack the tuples into separate lists
#     pixel_values = torch.stack([example[0] for example in examples])
#     labels = torch.tensor(np.array([example[1] for example in examples]))
#     return {"pixel_values": pixel_values, "labels": labels}

import torch

def collate_fn(examples):
    # Unpack the tuples into separate lists
    pixel_values = torch.stack([example[0] for example in examples])

    # Directly convert labels to a tensor with specified dtype
    labels = torch.stack([torch.tensor(example[1], dtype=torch.float32) for example in examples])

    return {"pixel_values": pixel_values, "labels": labels}

In [21]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [23]:
import os
import logging
from transformers import TrainingArguments, Trainer

# Enable logging and tqdm
logging.basicConfig(level=logging.INFO)
from transformers.utils import logging as hf_logging
hf_logging.set_verbosity_info()

# Ensure the model is loaded on the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

model_name = model_checkpoint.split("/")[-1]
batch_size = 128

args = TrainingArguments(
    f"{model_name}-finetuned-lora-usg",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-3,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=batch_size,
    fp16=torch.cuda.is_available(),
    num_train_epochs=10,
    logging_steps=5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    label_names=["labels"],
    disable_tqdm=False,  # Ensure tqdm is enabled
)

trainer = Trainer(
    lora_model,
    args,
    train_dataset=org_train_data,
    eval_dataset=org_val_data,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Using auto half precision backend


In [24]:
# Start training
train_results = trainer.train()

***** Running training *****
  Num examples = 2,373
  Num Epochs = 10
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 4
  Total optimization steps = 40
  Number of trainable parameters = 2,656,515


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,F1 Micro,F1 Samples,Hamming Loss,Jaccard,Runtime,Samples Per Second,Steps Per Second
0,No log,0.516533,0.511811,0.275145,0.666045,0.639108,0.234908,0.234252,440.3689,1.154,0.009
1,0.560000,0.453428,0.527559,0.32933,0.683521,0.654856,0.221785,0.26713,4.01,126.682,0.997
2,0.453200,0.328997,0.649606,0.687252,0.787766,0.733596,0.150262,0.550591,3.985,127.479,1.004
4,0.252300,0.170475,0.885827,0.912638,0.935829,0.862205,0.047244,0.84217,4.1465,122.514,0.965
5,0.172400,0.131322,0.917323,0.929893,0.948109,0.870079,0.038714,0.870331,3.961,128.25,1.01
6,0.120700,0.10971,0.937008,0.949364,0.959649,0.885827,0.030184,0.904107,4.049,125.462,0.988
8,0.074100,0.09452,0.948819,0.962718,0.969325,0.892388,0.022966,0.928403,4.5,112.89,0.889



***** Running Evaluation *****
  Num examples = 508
  Batch size = 128
Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-4
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-4/preprocessor_config.json

***** Running Evaluation *****
  Num examples = 508
  Batch size = 128
Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-9
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-9/preprocessor_config.json

***** Running Evaluation *****
  Num examples = 508
  Batch size = 128
Saving model checkpoint to vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-14
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
Image processor saved in vit-base-patch16-224-in21k-finetuned-lora-usg/checkpoint-14/preprocessor

In [25]:
trainer.evaluate(org_test_data)


***** Running Evaluation *****
  Num examples = 509
  Batch size = 128


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.10113943368196487,
 'eval_accuracy': 0.9430255402750491,
 'eval_f1_macro': 0.9499800598464132,
 'eval_f1_micro': 0.9653333333333334,
 'eval_f1_samples': 0.8795022920759659,
 'eval_hamming_loss': 0.025540275049115914,
 'eval_jaccard': 0.9055484153244552,
 'eval_runtime': 450.4169,
 'eval_samples_per_second': 1.13,
 'eval_steps_per_second': 0.009,
 'epoch': 8.421052631578947}

In [26]:
!cp -r  /content/vit-base-patch16-224-in21k-finetuned-lora-usg /content/drive/MyDrive/lora_weights_vit_base_patch_16/

In [27]:
!nvidia-smi

Tue Oct 22 18:06:56 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   30C    P0              50W / 400W |  13449MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    