In [1]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch

# Load processor and model (BLIP-2)
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [2]:
import os
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import BlipProcessor

class CXRDataset(Dataset):
    def __init__(self, root, caption_file, processor, max_length=300):
        """
        Args:
            root (str): Root directory of images.
            caption_file (str): CSV file containing captions and image file paths.
            processor (BlipProcessor): BLIP processor for handling images and text.
            max_length (int): Maximum length for tokenized captions.
        """
        self.root = root
        self.processor = processor
        self.data = pd.read_csv(caption_file)
        self.max_length = max_length
        self.image_size = 224  # BLIP-2 works with standard resolutions

    def __getitem__(self, idx):
        """
        Returns:
            img (torch.Tensor): Processed image.
            caption (str): Text caption.
            encoding (dict): Tokenized caption for BLIP-2.
        """
        caption = self.data.iloc[idx, 1]
        image_path1 = os.path.join(self.root, self.data.iloc[idx, 2])
        image_path2 = os.path.join(self.root, self.data.iloc[idx, 3])

        # Load images (convert to RGB)
        image1 = Image.open(image_path1).resize((self.image_size, self.image_size)).convert('RGB')
        image2 = Image.open(image_path2).resize((self.image_size, self.image_size)).convert('RGB')

        # Merge two grayscale images into a 3-channel image
        img = Image.merge("RGB", (image1.convert("L"), image2.convert("L"), image1.convert("L")))

        # Process the image and caption using BLIP-2 processor
        encoding = self.processor(images=img, text=caption, padding="max_length",
                                  truncation=True, max_length=self.max_length, return_tensors="pt")

        return encoding["pixel_values"].squeeze(0), encoding["input_ids"].squeeze(0)

    def __len__(self):
        return len(self.data)

# Initialize BLIP-2 Processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")



In [None]:
from torch.utils.data import DataLoader
test_captions = '/content/drive/MyDrive/Small_human_extracted/Test_captions.csv'

test_dataset = CXRDataset(
    root="/content/drive/MyDrive/Small_human_extracted/Images/Test/",
    caption_file=test_captions,
    processor=processor
)

test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)  # Reduce batch size for VRAM


In [None]:
import torch
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the best model
best_model_path = "/content/drive/MyDrive/best_blip_model_epoch_7"
model = BlipForConditionalGeneration.from_pretrained(best_model_path)
model.to(device)
model.eval()
bleu4_scores = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Test Inference"):
        pixel_values = batch[0].to(device)
        input_ids = batch[1].to(device)
        generated_ids = model.generate(
            pixel_values=pixel_values,
            max_length=300
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        target_text = processor.batch_decode(input_ids, skip_special_tokens=True)

        for gen, target in zip(generated_text, target_text):
            reference = [target.split()]
            candidate = gen.split()
            smoothing = SmoothingFunction().method4
            score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
            bleu4_scores.append(score)

avg_bleu4 = sum(bleu4_scores) / len(bleu4_scores) if bleu4_scores else 0
print(f"Test Bleu-4 score: {avg_bleu4}")

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
best_model_path = "/content/drive/MyDrive/best_blip_model_epoch_67"
model = BlipForConditionalGeneration.from_pretrained(best_model_path)
model.to(device)
model.eval()
with torch.no_grad():
    for i, batch in enumerate(test_loader):
        if i >= 1 :
          break
        pixel_values = batch[0].to(device)
        input_ids = batch[1].to(device)
        generated_ids = model.generate(
            pixel_values=pixel_values,
            max_length=300
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        target_text = processor.batch_decode(input_ids, skip_special_tokens=True)

        for gen, target in zip(generated_text, target_text):
            print("Prediction:", gen)
            print("Ground Truth:", target)
            print("---")

Prediction: the heart size and pulmonary vascularity appear within normal limits. the lungs are free of focal airspace disease. no pleural effusion or pneumothorax is seen. no evidence of active disease.
Ground Truth: there are diffuse bilateral interstitial and alveolar opacities consistent with chronic obstructive lung disease and bullous emphysema. there are irregular opacities in the left lung apex, that could represent a cavitary lesion in the left lung apex. there are streaky opacities in the right upper lobe, xxxx scarring. the cardiomediastinal silhouette is normal in size and contour. there is no pneumothorax or large pleural effusion. 1. bullous emphysema and interstitial fibrosis. 2. probably scarring in the left apex, although difficult to exclude a cavitary lesion. 3. opacities in the bilateral upper lobes could represent scarring, however the absence of comparison exam, recommend short interval followup radiograph or ct thorax to document resolution.
---
Prediction: the h

In [3]:
from torch.utils.data import DataLoader
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load the best model
best_model_path = "/content/drive/MyDrive/best_blip_model_epoch_27"
model = BlipForConditionalGeneration.from_pretrained(best_model_path)
model.to(device)
model.eval()
train_captions = '/content/drive/MyDrive/Small_human_extracted/Train_captions.csv'
valid_captions = '/content/drive/MyDrive/Small_human_extracted/Valid_captions.csv'
test_captions = '/content/drive/MyDrive/Small_human_extracted/Test_captions.csv'

# No need for custom transforms - processor handles image normalization
train_dataset = CXRDataset(
    root="/content/drive/MyDrive/Small_human_extracted/Images/Train/",
    caption_file=train_captions,
    processor=processor
)
valid_dataset = CXRDataset(
    root="/content/drive/MyDrive/Small_human_extracted/Images/Valid/",
    caption_file=valid_captions,
    processor=processor
)

# DataLoader remains the same
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # Reduce batch size for VRAM
valid_loader = DataLoader(valid_dataset, batch_size=8, shuffle=False)

In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import os
import pandas as pd
from torch.utils.data import Dataset
from transformers import BlipProcessor
from torch.utils.data import DataLoader
from tqdm import tqdm
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


# Hyperparameters
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
epochs = 20
best_bleu4 = 0

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
        pixel_values = batch[0].to(device)
        input_ids = batch[1].to(device)
        attention_mask = (input_ids != processor.tokenizer.pad_token_id).to(device)

        optimizer.zero_grad()
        outputs = model(pixel_values=pixel_values, input_ids=input_ids, labels=input_ids, attention_mask=attention_mask)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} - Training Loss: {total_loss / len(train_loader):.4f}")

    # Validation loop
    model.eval()
    bleu4_scores = []
    with torch.no_grad():
        for batch in tqdm(valid_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
            pixel_values = batch[0].to(device)
            input_ids = batch[1].to(device)
            attention_mask = (input_ids != processor.tokenizer.pad_token_id).to(device)

            generated_ids = model.generate(
                pixel_values=pixel_values,
                max_length=300,
                repetition_penalty=1.2,
                temperature=0.7,
                do_sample=True
            )

            generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
            target_text = processor.batch_decode(input_ids, skip_special_tokens=True)

            for gen, target in zip(generated_text, target_text):
                reference = [target.split()]
                candidate = gen.split()
                smoothing = SmoothingFunction().method4
                score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
                bleu4_scores.append(score)

    avg_bleu4 = sum(bleu4_scores) / len(bleu4_scores)
    print(f"Validation Bleu-4 Score: {avg_bleu4:.4f}")


    if avg_bleu4 > best_bleu4:
        best_bleu4 = avg_bleu4
        print(f"Saving new best model with Bleu-4 score: {best_bleu4:.4f}")
        model.save_pretrained(f"/content/drive/MyDrive/best_blip_model_epoch_{epoch+1}_bleu4_{best_bleu4:.4f}")


Epoch 1/20 - Training: 100%|██████████| 335/335 [12:17<00:00,  2.20s/it]


Epoch 1 - Training Loss: 1.7701


Epoch 1/20 - Validation: 100%|██████████| 48/48 [14:30<00:00, 18.13s/it]


Validation Bleu-4 Score: 0.0054
Saving new best model with Bleu-4 score: 0.0054


Epoch 2/20 - Training: 100%|██████████| 335/335 [12:15<00:00,  2.20s/it]


Epoch 2 - Training Loss: 1.7589


Epoch 2/20 - Validation: 100%|██████████| 48/48 [13:44<00:00, 17.18s/it]


Validation Bleu-4 Score: 0.0034


Epoch 3/20 - Training: 100%|██████████| 335/335 [12:17<00:00,  2.20s/it]


Epoch 3 - Training Loss: 1.7563


Epoch 3/20 - Validation: 100%|██████████| 48/48 [13:50<00:00, 17.31s/it]


Validation Bleu-4 Score: 0.0046


Epoch 4/20 - Training: 100%|██████████| 335/335 [12:19<00:00,  2.21s/it]


Epoch 4 - Training Loss: 1.7552


Epoch 4/20 - Validation: 100%|██████████| 48/48 [13:51<00:00, 17.33s/it]


Validation Bleu-4 Score: 0.0041


Epoch 5/20 - Training: 100%|██████████| 335/335 [12:17<00:00,  2.20s/it]


Epoch 5 - Training Loss: 1.7930


Epoch 5/20 - Validation: 100%|██████████| 48/48 [13:45<00:00, 17.20s/it]


Validation Bleu-4 Score: 0.0041


Epoch 6/20 - Training: 100%|██████████| 335/335 [12:12<00:00,  2.19s/it]


Epoch 6 - Training Loss: 1.7524


Epoch 6/20 - Validation: 100%|██████████| 48/48 [13:44<00:00, 17.19s/it]


Validation Bleu-4 Score: 0.0048


Epoch 7/20 - Training: 100%|██████████| 335/335 [12:15<00:00,  2.19s/it]


Epoch 7 - Training Loss: 1.7513


Epoch 7/20 - Validation: 100%|██████████| 48/48 [13:46<00:00, 17.22s/it]


Validation Bleu-4 Score: 0.0050


Epoch 8/20 - Training: 100%|██████████| 335/335 [12:02<00:00,  2.16s/it]


Epoch 8 - Training Loss: 1.7518


Epoch 8/20 - Validation:   6%|▋         | 3/48 [00:51<12:57, 17.28s/it]