In [2]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load processor and model (BLIP-2)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [3]:
import os
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import BlipProcessor

class CXRDataset(Dataset):
    def __init__(self, root, caption_file, processor, max_length=300):
        """
        Args:
            root (str): Root directory of images.
            caption_file (str): CSV file containing captions and image file paths.
            processor (BlipProcessor): BLIP processor for handling images and text.
            max_length (int): Maximum length for tokenized captions.
        """
        self.root = root
        self.processor = processor
        self.data = pd.read_csv(caption_file)
        self.max_length = max_length
        self.image_size = 224  # BLIP-2 works with standard resolutions

    def __getitem__(self, idx):
        """
        Returns:
            img (torch.Tensor): Processed image.
            caption (str): Text caption.
            encoding (dict): Tokenized caption for BLIP-2.
        """
        caption = self.data.iloc[idx, 1]
        image_path1 = os.path.join(self.root, self.data.iloc[idx, 2])
        image_path2 = os.path.join(self.root, self.data.iloc[idx, 3])

        # Load images (convert to RGB)
        image1 = Image.open(image_path1).resize((self.image_size, self.image_size)).convert('RGB')
        image2 = Image.open(image_path2).resize((self.image_size, self.image_size)).convert('RGB')

        # Merge two grayscale images into a 3-channel image
        img = Image.merge("RGB", (image1.convert("L"), image2.convert("L"), image1.convert("L")))

        # Process the image and caption using BLIP-2 processor
        encoding = self.processor(images=img, text=caption, padding="max_length",
                                  truncation=True, max_length=self.max_length, return_tensors="pt")

        return encoding["pixel_values"].squeeze(0), encoding["input_ids"].squeeze(0)

    def __len__(self):
        return len(self.data)

# Initialize BLIP-2 Processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")



In [4]:
from torch.utils.data import DataLoader

train_captions = '/content/drive/MyDrive/Small_human_extracted/Train_captions.csv'
valid_captions = '/content/drive/MyDrive/Small_human_extracted/Valid_captions.csv'
test_captions = '/content/drive/MyDrive/Small_human_extracted/Test_captions.csv'

# No need for custom transforms - processor handles image normalization
train_dataset = CXRDataset(
    root="/content/drive/MyDrive/Small_human_extracted/Images/Train/",
    caption_file=train_captions,
    processor=processor
)
valid_dataset = CXRDataset(
    root="/content/drive/MyDrive/Small_human_extracted/Images/Valid/",
    caption_file=valid_captions,
    processor=processor
)

# DataLoader remains the same
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Reduce batch size for VRAM
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [None]:

from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os
import pandas as pd
from torch.utils.data import Dataset
from transformers import BlipProcessor
from torch.utils.data import DataLoader
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Load processor and model (BLIP-2)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# Hyperparameters
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
epochs = 20
best_bleu4 = 0

# Training loop
for epoch in range(epochs):
  model.train()
  for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
      pixel_values = batch[0].to(device)
      input_ids = batch[1].to(device)
      optimizer.zero_grad()
      outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=input_ids)
      loss = outputs.loss
      loss.backward()
      optimizer.step()

  # Validation loop
  model.eval()
  bleu4_scores = []
  with torch.no_grad():
      for batch in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
          pixel_values = batch[0].to(device)
          input_ids = batch[1].to(device)
          attention_mask = (input_ids != processor.tokenizer.pad_token_id).to(device)
          generated_ids = model.generate(
              pixel_values=pixel_values,
              max_length=300,
              repetition_penalty=1.2,
              temperature=0.7,
              do_sample=True
          )
          generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
          target_text = processor.batch_decode(input_ids, skip_special_tokens=True)

          for gen, target in zip(generated_text, target_text):
              reference = [target.split()]
              candidate = gen.split()
              smoothing = SmoothingFunction().method4  # Handle cases with zero counts
              score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
              bleu4_scores.append(score)
  avg_bleu4 = sum(bleu4_scores)/len(bleu4_scores)
  print(f"Validation Bleu-4 score: {avg_bleu4}")

  # Save the best model
  if avg_bleu4 > best_bleu4:
      best_bleu4 = avg_bleu4
      print(f"Saving new best model with Bleu-4 score: {best_bleu4}")
      model.save_pretrained(f"/content/drive/MyDrive/best_blip_model_epoch_{epoch+1}")


Epoch 1/20 - Training: 100%|██████████| 335/335 [21:26<00:00,  3.84s/it]
Epoch 1/20 - Validation: 100%|██████████| 48/48 [14:35<00:00, 18.25s/it]


Validation Bleu-4 score: 0.0047424086158075664
Saving new best model with Bleu-4 score: 0.0047424086158075664


Epoch 2/20 - Training: 100%|██████████| 335/335 [09:46<00:00,  1.75s/it]
Epoch 2/20 - Validation: 100%|██████████| 48/48 [12:05<00:00, 15.11s/it]


Validation Bleu-4 score: 0.004529771831993313


Epoch 3/20 - Training: 100%|██████████| 335/335 [09:45<00:00,  1.75s/it]
Epoch 3/20 - Validation: 100%|██████████| 48/48 [12:21<00:00, 15.45s/it]


Validation Bleu-4 score: 0.004367474984247636


Epoch 4/20 - Training: 100%|██████████| 335/335 [09:45<00:00,  1.75s/it]
Epoch 4/20 - Validation: 100%|██████████| 48/48 [12:17<00:00, 15.37s/it]


Validation Bleu-4 score: 0.004357837596955029


Epoch 5/20 - Training: 100%|██████████| 335/335 [09:42<00:00,  1.74s/it]
Epoch 5/20 - Validation: 100%|██████████| 48/48 [12:15<00:00, 15.33s/it]


Validation Bleu-4 score: 0.004501649768139496


Epoch 6/20 - Training: 100%|██████████| 335/335 [09:42<00:00,  1.74s/it]
Epoch 6/20 - Validation: 100%|██████████| 48/48 [12:16<00:00, 15.34s/it]


Validation Bleu-4 score: 0.00399557780333634


Epoch 7/20 - Training: 100%|██████████| 335/335 [09:42<00:00,  1.74s/it]
Epoch 7/20 - Validation: 100%|██████████| 48/48 [12:05<00:00, 15.12s/it]


Validation Bleu-4 score: 0.0038474611937333886


Epoch 8/20 - Training: 100%|██████████| 335/335 [09:46<00:00,  1.75s/it]
Epoch 8/20 - Validation: 100%|██████████| 48/48 [12:17<00:00, 15.37s/it]


Validation Bleu-4 score: 0.005248485717648825
Saving new best model with Bleu-4 score: 0.005248485717648825


Epoch 9/20 - Training: 100%|██████████| 335/335 [09:46<00:00,  1.75s/it]
Epoch 9/20 - Validation: 100%|██████████| 48/48 [12:20<00:00, 15.42s/it]


Validation Bleu-4 score: 0.0032222238945513134


Epoch 10/20 - Training: 100%|██████████| 335/335 [09:44<00:00,  1.74s/it]
Epoch 10/20 - Validation:  10%|█         | 5/48 [01:17<11:01, 15.39s/it]