1. Importing Libraries

In [34]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch.utils.data import Dataset

# HuggingFace Transformers
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    BertModel,
    BertTokenizer,
    GPT2Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    VisionEncoderDecoderModel,
    ViTFeatureExtractor,
    default_data_collator,
)

# Evaluation
import evaluate
from bert_score import BERTScorer


2. Loading and Preprocessing Data

In [None]:
df2 = pd.read_csv('archive/indiana_projections.csv')
df1 = pd.read_csv('archive/indiana_reports.csv')


df2 = df2[df2['view'] == 'PA']  # 'view' column is used to determine "Frontal"

images_captions_df = pd.DataFrame(columns=['imgs', 'captions'])

for _, row in df2.iterrows():
    uid = row['uid']
    image = row['filename']

    matching_report = df1[df1['uid'] == uid]
    if not matching_report.empty:
        caption = matching_report.iloc[0]['findings']
        if isinstance(caption, str) and caption.strip():
            images_captions_df = pd.concat([
                images_captions_df, 
                pd.DataFrame([{'imgs': image, 'captions': caption}])
            ], ignore_index=True)

# print(images_captions_df.head())

3. Preprocessing and Tokenizer Setup

In [None]:
encoder_checkpoint = "google/vit-base-patch16-224-in21k" # visual encoder
decoder_checkpoint = "gpt2" # text decorder

feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(decoder_checkpoint)
tokenizer.pad_token = tokenizer.eos_token

base_path = Path('archive/images/images_normalized')


images_captions_df['imgs'] = images_captions_df['imgs'].apply(
    lambda x: Path(x).name  
)

images_captions_df['imgs'] = images_captions_df['imgs'].apply(
    lambda x: str(base_path / x)
)




4. Sample Encoding Check

In [None]:
max_length = 384 # max length of the caption
sample = images_captions_df.iloc[99]

image = Image.open(sample['imgs']).convert('RGB')
caption = sample['captions']

inputs = feature_extractor(images=image, return_tensors='pt')

# Tokenize the caption
outputs = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )

5. Dataset Class Definition

In [39]:
class LoadDataset(Dataset):
    def __init__(self, df):
        self.images = df['imgs'].values
        self.captions = df['captions'].values
    
    def __getitem__(self, idx):
        inputs = dict()

        image_path = str(self.images[idx])
        image = Image.open(image_path).convert("RGB")
        image = feature_extractor(images=image, return_tensors='pt')

        caption = self.captions[idx]
        labels = tokenizer(
            caption, 
            max_length=max_length, 
            truncation=True, 
            padding='max_length',
            return_tensors='pt',
        )['input_ids'][0]
        
        inputs['pixel_values'] = image['pixel_values'].squeeze()   
        inputs['labels'] = labels
        
        return inputs
    
    def __len__(self):
        return len(self.images)


6. Train/Test Split and Dataset Instantiation

In [None]:
train_df, test_df = train_test_split(images_captions_df, test_size=0.2, shuffle=True, random_state=42)

# Splits the dataset into 80% train, 20% test with a fixed random seed.
train_df = train_df.iloc[:2000]
test_df = test_df.iloc[:400]

train_ds = LoadDataset(train_df)
test_ds = LoadDataset(test_df)


7. Load Pretrained Encoder-Decoder Model

In [None]:
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_checkpoint, 
    decoder_checkpoint
)
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.num_beams = 4 #Sets the padding token to properly mask padded parts of sequences during loss computation

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'transformer.h.10.crossattention.c_proj.bias', 'transformer.h.10.cros

8. Define Training Arguments and Trainer

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="image-caption-generator-debug",
    eval_strategy="epoch",
    per_device_train_batch_size=4,            
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=3,                       
    save_strategy="epoch",        
    report_to="none",
    gradient_accumulation_steps=2,
    predict_with_generate=True,
    generation_max_length=30,
    generation_num_beams=2,
    no_cuda=True
)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    args=training_args,
)


  trainer = Seq2SeqTrainer(


9. Start Training

In [10]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

10. Save the Trained Model and Tools

In [11]:
model.save_pretrained("model/CXR_model")
feature_extractor.save_pretrained("model/CXR_model")
tokenizer.save_pretrained("model/CXR_model")



('model/CXR_model\\tokenizer_config.json',
 'model/CXR_model\\special_tokens_map.json',
 'model/CXR_model\\vocab.json',
 'model/CXR_model\\merges.txt',
 'model/CXR_model\\added_tokens.json',
 'model/CXR_model\\tokenizer.json')

11. Generate Predictions on Test Set

In [None]:
DS=[] # List to store the original captions
GPT=[] # List to store the generated captions
model.eval()
for i in tqdm(range(0,250)): # 250 test samples
    inputs = test_ds[i]['pixel_values']
    
    with torch.no_grad():

        out = model.generate(
            inputs.unsqueeze(0),
            num_beams=2, 
            max_length=max_length
        )

    y_hat=tokenizer.decode(test_ds[i]['labels'],skip_special_tokens=True)
    DS.append(y_hat)

    y_pred=tokenizer.decode(out[0], skip_special_tokens=True)
    GPT.append(y_pred)


  0%|          | 0/250 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  0%|          | 1/250 [01:04<4:28:24, 64.68s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 2/250 [02:09<4:27:37, 64.75s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  1%|          | 3/250 [03:11<4:22:01, 63.65s/it]The attention mask and the pad token id were not set. As a co

KeyboardInterrupt: 

12. Reload the Model for Inference

In [None]:
model = VisionEncoderDecoderModel.from_pretrained("model/CXR_model") # Load the trained model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2") # Load the tokenizer
tokenizer.pad_token = tokenizer.eos_token 
model.config.pad_token_id = tokenizer.pad_token_id 

feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k") # Load the feature extractor
model.eval()



VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (inte

13. BLEU Evaluation on Full Test Set

In [27]:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=GPT, references=DS)
print(results)

{'bleu': 0.0, 'precisions': [0.02618705035971223, 0.002525616972146053, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 9.481582537517053, 'translation_length': 13900, 'reference_length': 1466}


14. BERTScore Evaluation

In [28]:
reference =DS
candidate = GPT
scorer = BERTScorer(model_type='bert-base-uncased')
P, R, F1 = scorer.score(candidate, reference)

15. Qualitative + BLEU Evaluation on 5 Random Samples

In [None]:
# Load BLEU metric using 'evaluate'
bleu = evaluate.load("bleu")

# Pick 5 random samples from your test set
sample_df = images_captions_df.sample(5)

for idx, row in sample_df.iterrows():
    image_path = row['imgs']  # path already includes archive/images/...

    true_caption = row['captions']

    # Load and preprocess image
    image = Image.open(image_path).convert("RGB")
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

    # Generate caption
    output_ids = model.generate(pixel_values, max_length=50, num_beams=4)
    pred_caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Show results
    print(f"\n🔹 Image: {os.path.basename(image_path)}")
    print(f"🧠 Predicted : {pred_caption}")
    print(f"📘 Ground Truth : {true_caption}")

    # Evaluate BLEU
    bleu.add(prediction=pred_caption, reference=[true_caption])

# Final BLEU score
print("\n📊 Average BLEU score on 5 samples:")
print(bleu.compute())


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔹 Image: 280_IM-1232-1001.dcm.png
🧠 Predicted :  cardomedast silhouette pulmonarycul are normal size cont. lungs clear focal of airspace,othax ple eff, pneumor, pneumor, ple eff, focal space,othax oruralusion Visual osous of thor arearkable
📘 Ground Truth : The heart and lungs have XXXX XXXX in the interval. Both lungs are clear and expanded. Heart and mediastinum normal.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔹 Image: 1839_IM-0543-12013.dcm.png
🧠 Predicted :  cardomedast silhouette pulmonarycul are normal size cont. lungs clear focal of airspace,othax oruralusion Noothax ple eff orothax largeuralusion
📘 Ground Truth : The heart is normal in size. The mediastinum is stable. Granulomatous sequela are noted. The previously visualized nodular density in the right upper lobe is not well-seen on today's study. There is no acute infiltrate or pleural effusion.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔹 Image: 2153_IM-0773-1001.dcm.png
🧠 Predicted :  cardomedast silhouette pulmonarycul are normal size cont. lungs clear focal of airspace,othax ple eff, pneumor, pneumor, ple eff, focal space,othax oruralusion Visual osous of thor arearkable
📘 Ground Truth : The heart size and pulmonary vascularity appear within normal limits. The lungs are free of focal airspace disease. No pleural effusion or pneumothorax is seen. Left hemidiaphragm is elevated.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



🔹 Image: 3138_IM-1476-1001.dcm.png
🧠 Predicted :  cardomedast silhouette pulmonarycul are normal size cont. lungs clear focal of airspace,othax oruralusion Noothax ple eff orothax largeuralusion
📘 Ground Truth : The trachea is midline. The cardiomediastinal silhouette is normal. The lungs are clear, without evidence of acute infiltrate or effusion. There is no evidence of tuberculous disease. There is no pneumothorax. The visualized bony structures reveal no acute abnormalities.

🔹 Image: 850_IM-2373-0001-0001.dcm.png
🧠 Predicted :  cardomedast silhouette pulmonarycul are normal size cont. lungs clear focal of airspace,othax ple eff, pneumor, pneumor, ple eff, focal space,othax oruralusion Visual osous of thor arearkable
📘 Ground Truth : Stable appearance of the cardiomediastinal silhouette. There is no pneumothorax, pleural effusion, or focal airspace consolidation.

📊 Average BLEU score on 5 samples:
{'bleu': 0.0, 'precisions': [0.21768707482993196, 0.0, 0.0, 0.0], 'brevity_penalty'