In [3]:

pip install kaggle




In [4]:
mkdir ~/.kaggle


mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [5]:
! kaggle datasets download virajbagal/roco-dataset


Dataset URL: https://www.kaggle.com/datasets/virajbagal/roco-dataset
License(s): CC0-1.0
roco-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
! unzip roco-dataset.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: all_data/validation/radiology/images/PMC3870636_CRIM.OTOLARYNGOLOGY2013-650428.002.jpg  
  inflating: all_data/validation/radiology/images/PMC3870648_CRIM.DENTISTRY2013-378062.012.jpg  
  inflating: all_data/validation/radiology/images/PMC3871037_enm-28-326-g001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872022_CRIM.MEDICINE2013-653925.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872161_CRIM.SURGERY2013-209494.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872390_CRIM.OBGYN2013-906351.001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872571_OJO-6-193-g005.jpg  
  inflating: all_data/validation/radiology/images/PMC3872571_OJO-6-193-g007.jpg  
  inflating: all_data/validation/radiology/images/PMC3872649_SNI-4-150-g001.jpg  
  inflating: all_data/validation/radiology/images/PMC3872649_SNI-4-150-g014.jpg  
  inflating: all_data/validation/radiolog

In [7]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate
!pip install rouge_score
!pip install evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=ca6f9adaa093e974f8d6ad4d2b540010163901bd75677f024a0ca2962700094e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.2


In [8]:

import os
import re
import json
import torch
import numpy as np
import pandas as pd
from PIL import Image

Parameters

In [9]:
test_valid_percentage = 30 # (test - 15, valid - 15)
train_data_percentage = 60
valid_data_percentage = 60
test_data_percentage = 60
max_target_length = 256

In [10]:
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor, AutoTokenizer
os.environ["WANDB_DISABLED"] = "true"



In [11]:

import nltk
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)



Initialize VisionEncoderDecoderModelPermalink

In [12]:
image_encoder_model = "google/vit-base-patch16-224-in21k"
text_decode_model = "gpt2"
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(image_encoder_model, text_decode_model)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight',

In [13]:
#image feature extractor
feature_extractor = AutoFeatureExtractor.from_pretrained(image_encoder_model)
# text tokenizer
tokenizer = AutoTokenizer.from_pretrained(text_decode_model)



In [14]:
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
tokenizer.pad_token = tokenizer.eos_token

# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [15]:
output_dir = "vit-gpt-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('vit-gpt-model/tokenizer_config.json',
 'vit-gpt-model/special_tokens_map.json',
 'vit-gpt-model/vocab.json',
 'vit-gpt-model/merges.txt',
 'vit-gpt-model/added_tokens.json',
 'vit-gpt-model/tokenizer.json')

Data Loading and Preparation

In [16]:
# file paths data
image_dir = './all_data/train/radiology/images/'
data_file = './all_data/train/radiology/traindata.csv'

In [17]:
data = pd.read_csv(data_file)
data

Unnamed: 0,id,name,caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...
...,...,...,...
65445,ROCO_81819,PMC3517833_CRIM.HEMATOLOGY2012-490438.001.jpg,Initial CT abdomen with contrast showing a di...
65446,ROCO_81820,PMC5487234_rb-50-03-0190-g13.jpg,44-year-old male patient after surgical amput...
65447,ROCO_81821,PMC2974222_kjr-11-612-g001.jpg,Primary pulmonary tuberculosis in 18-year-old...
65448,ROCO_81822,PMC3532764_AJNS-7-151-g002.jpg,"MRI brain with gadolinium, coronal view, show..."


In [18]:
# Replace column name 'name' with 'image_path'
data['image_path'] = data.pop('name')

In [19]:
# Prepend 'image_dir' to all entries in 'image_path' column
data['image_path'] = image_dir + data['image_path']

data

Unnamed: 0,id,caption,image_path
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...
3,ROCO_00005,Panoramic radiograph after immediate loading.\n,./all_data/train/radiology/images/PMC3745845_I...
4,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...
...,...,...,...
65445,ROCO_81819,Initial CT abdomen with contrast showing a di...,./all_data/train/radiology/images/PMC3517833_C...
65446,ROCO_81820,44-year-old male patient after surgical amput...,./all_data/train/radiology/images/PMC5487234_r...
65447,ROCO_81821,Primary pulmonary tuberculosis in 18-year-old...,./all_data/train/radiology/images/PMC2974222_k...
65448,ROCO_81822,"MRI brain with gadolinium, coronal view, show...",./all_data/train/radiology/images/PMC3532764_A...


In [20]:
for index, row in data.iterrows():
    image_path = row['image_path']
    if not os.path.exists(image_path):
        data.drop(index, inplace=True)
    else:
        try:
            image = Image.open(image_path)
        except Exception:
            data.drop(index, inplace=True)

In [21]:
# Reset the index after dropping rows
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,id,caption,image_path
0,ROCO_00002,Computed tomography scan in axial view showin...,./all_data/train/radiology/images/PMC4083729_A...
1,ROCO_00003,Bacterial contamination occurred after comple...,./all_data/train/radiology/images/PMC2837471_I...
2,ROCO_00004,The patient had residual paralysis of the han...,./all_data/train/radiology/images/PMC2505281_1...
3,ROCO_00005,Panoramic radiograph after immediate loading.\n,./all_data/train/radiology/images/PMC3745845_I...
4,ROCO_00007,Plain abdomen x-ray: Multiple air levels at t...,./all_data/train/radiology/images/PMC4917066_a...
...,...,...,...
65414,ROCO_81819,Initial CT abdomen with contrast showing a di...,./all_data/train/radiology/images/PMC3517833_C...
65415,ROCO_81820,44-year-old male patient after surgical amput...,./all_data/train/radiology/images/PMC5487234_r...
65416,ROCO_81821,Primary pulmonary tuberculosis in 18-year-old...,./all_data/train/radiology/images/PMC2974222_k...
65417,ROCO_81822,"MRI brain with gadolinium, coronal view, show...",./all_data/train/radiology/images/PMC3532764_A...


## Split and Sample Data

In [22]:
from sklearn.model_selection import train_test_split
# Split data into train, test, and valid datasets
train_data, valid_test_data = train_test_split(data, test_size=test_valid_percentage/100, random_state=42)
valid_data, test_data = train_test_split(valid_test_data, test_size=0.5, random_state=42)


In [23]:
# Reset index
train_data = train_data.reset_index(drop=True)
valid_data = valid_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

In [24]:
print("Train data shape: ", train_data.shape)
print("Valid data shape: ", valid_data.shape)
print("Test data shape: ", test_data.shape)

Train data shape:  (45793, 3)
Valid data shape:  (9813, 3)
Test data shape:  (9813, 3)


In [25]:
# Select n% of data
train_data = train_data.sample(frac=train_data_percentage/100, random_state=42)
valid_data = valid_data.sample(frac=valid_data_percentage/100, random_state=42)
test_data = test_data.sample(frac=test_data_percentage/100, random_state=42)

In [26]:
#!pip install datasets

In [27]:
from datasets import Dataset, DatasetDict

# Convert DataFrame to Hugging Face dataset dictionary format
train_data_dict = Dataset.from_pandas(train_data)
valid_data_dict = Dataset.from_pandas(valid_data)
test_data_dict = Dataset.from_pandas(test_data)

dataset_dict = DatasetDict({
    'train': train_data_dict,
    'validation': valid_data_dict,
    'test': test_data_dict
})

print(dataset_dict)

DatasetDict({
    train: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 27476
    })
    validation: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 5888
    })
    test: Dataset({
        features: ['id', 'caption', 'image_path', '__index_level_0__'],
        num_rows: 5888
    })
})


Define Custom Dataset Class

In [28]:
class ImageCaptioningDataset(torch.utils.data.Dataset):
    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        image_path = self.ds[self.ds_type]['image_path'][idx]
        caption = self.ds[self.ds_type]['caption'][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(caption, self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(image_path)
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])

    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Run tokenization on caption."""
        labels = tokenizer(caption,
                          padding="max_length",
                          max_length=max_target_length,
                          truncation=True).input_ids

        return labels

    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]



In [29]:
train_ds = ImageCaptioningDataset(dataset_dict, 'train', max_target_length)
eval_ds = ImageCaptioningDataset(dataset_dict, 'validation', max_target_length)
test_ds = ImageCaptioningDataset(dataset_dict, 'test', max_target_length)


Define Training Arguments and Metric Calculation

In [30]:
#!pip install accelerate

In [31]:
#!pip install transformers[torch]


In [32]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Define metric

In [33]:
#!pip install evaluate

In [34]:
#!pip install rouge_score


In [35]:
import evaluate
metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [36]:
import numpy as np

ignore_pad_token_for_loss = True

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    if ignore_pad_token_for_loss:
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds,
                            references=decoded_labels,
                            use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    return result

## Training

In [37]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)

In [80]:
# trainer.train()

AttributeError: `AcceleratorState` object has no attribute `distributed_type`. This happens if `AcceleratorState._reset_state()` was called and an `Accelerator` or `PartialState` was not reinitialized.

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,    # Increase batch size if GPU memory allows
    per_device_eval_batch_size=8,     # Increase batch size if GPU memory allows
    gradient_accumulation_steps=2,    # Accumulate gradients over 2 steps
    num_train_epochs=1,               # Train for 3 epochs initially, adjust as needed
    output_dir="./image-captioning-output",
)

# Instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)

# Train the model
trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

In [83]:
trainer.save_model("./image-captioning-output")
tokenizer.save_pretrained("./image-captioning-output")

Evaluation and Predictions


In [84]:
# Get predictions from the model
predictions = trainer.predict(test_ds)

# Process and evaluate the predictions
preds = predictions.predictions
labels = predictions.label_ids

# Post-process the predictions and labels
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# Calculate evaluation metrics
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu
bleu_scores = []

# Print the actual captions and predicted captions
i = 0
for actual_caption, predicted_caption in zip(decoded_labels, decoded_preds):
    i += 1
    bleu_score = sentence_bleu([actual_caption.split()], predicted_caption.split())
    bleu_scores.append(bleu_score)
    if i % 20 == 0:
        print("Actual Caption:", actual_caption)
        print("Predicted Caption:", predicted_caption)
        print("BLEU Score:", bleu_score)
        print()

avg_bleu_score = np.mean(bleu_scores)
print("Average BLEU Score:", avg_bleu_score)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

Actual Caption:  Vascular anatomic variants. The coronal multiplanar reformatted image demonstrates the origin of the right hepatic artery (black arrow) from the superior mesenteric artery (white arrow) and diffuse telangiectasias (black arrowheads) in the peripheral parenchyma.

Predicted Caption:  A CT scan of the abdomen showing a large mass in the right kidney with a small amount of
BLEU Score: 3.099734087990692e-155

Actual Caption:  Identification of anastomosis site by duplex scan. The extent of calcification is easily estimated by the thickness and intensity of echo density. The figure shows the calcified tibial artery, which is not suitable as a distal anastomosis site.

Predicted Caption:  A transverse view of the right kidney showing a large mass in the right kidney with a small
BLEU Score: 4.117710129809639e-232

Actual Caption:  White line: posterior condylar line. Red line: anatomical transepicondylar axis.

Predicted Caption:  A coronal T2-weighted image of the right kne