In [1]:
# Define file paths
train_caption_file = "../Datasets/ROCO2/train_captions.csv"
train_image_folder = "../Datasets/ROCO2/train_images/train/"
test_caption_file = "../Datasets/ROCO2/test_captions.csv"
test_image_folder = "../Datasets/ROCO2/test_images/test/"

In [2]:
import os
import pandas as pd
from PIL import Image
import torch
import open_clip
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from torchvision import transforms

# Define Dataset Class
class MedicalImageCaptioningDataset(Dataset):
    def __init__(self, caption_file, image_folder, tokenizer, transform=None):
        self.captions_df = pd.read_csv(caption_file)
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.transform = transform
    
    def __len__(self):
        return len(self.captions_df)
    
    def __getitem__(self, idx):
        image_id = self.captions_df.iloc[idx, 0]
        caption = self.captions_df.iloc[idx, 1]
        
        image_path = os.path.join(self.image_folder, f"{image_id}.jpg")
        image = Image.open(image_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)
        
        caption_tokens = self.tokenizer(caption, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
        
        return {
            'image': image,
            'caption': caption_tokens['input_ids'].squeeze(0),
            'attention_mask': caption_tokens['attention_mask'].squeeze(0)
        }

# Define Model Manager Class
class ModelManager:
    def __init__(self, biomedclip_model_name, gpt2_model_name):
        # Initialize BiomedCLIP model
        self.biomedclip_model = open_clip.create_model(biomedclip_model_name)
        # For transformation, use open_clip's recommended preprocess
        self.preprocess = open_clip.get_preprocess(biomedclip_model_name)
        self.gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
        self.optimizer = AdamW(list(self.biomedclip_model.parameters()) + list(self.gpt2_model.parameters()), lr=1e-4)
    
    def extract_image_embeddings(self, images):
        self.biomedclip_model.eval()
        embeddings = []
        with torch.no_grad():
            for image in images:
                image = self.preprocess(image).unsqueeze(0)  # Add batch dimension
                embedding = self.biomedclip_model.encode_image(image)
                embeddings.append(embedding)
            return torch.cat(embeddings)
    
    def generate_captions(self, image_embeddings, captions):
        self.gpt2_model.eval()
        outputs = self.gpt2_model.generate(
            input_ids=captions,
            attention_mask=None,
            max_length=128
        )
        return outputs

def train_model_mixed_precision(model, dataloader, epochs=5, lr=1e-4):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    # Loss function and optimizer
    criterion = nn.CrossEntropyLoss(ignore_index=model.gpt2_tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    
    # GradScaler for mixed precision
    scaler = GradScaler()
    
    model = model.to(device)
    model.train()
    
    for epoch in range(epochs):
        running_loss = 0.0
        
        for image_tensor, caption_tensor in dataloader:
            image_tensor = image_tensor.to(device)
            caption_tensor = caption_tensor.to(device)
            
            optimizer.zero_grad()

            # Mixed precision forward pass
            with autocast():
                outputs = model(image_tensor, caption_tensor[:, :-1])  # Exclude last token for target
                logits = outputs.logits
                
                # Compute loss
                loss = criterion(logits.view(-1, logits.size(-1)), caption_tensor[:, 1:].reshape(-1))

            # Backward pass with scaled gradients
            scaler.scale(loss).backward()
            
            # Step optimizer with scaled gradients
            scaler.step(optimizer)
            scaler.update()
            
            running_loss += loss.item()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(dataloader):.4f}')

    print("Training Finished with Mixed Precision!")

def main():
    # Paths
    train_caption_file = "../Datasets/ROCO2/train_captions.csv"
    train_image_folder = "../Datasets/ROCO2/train_images/train/"
    
    # Initialize tokenizer and transformations
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    
    # Create dataset and dataloader
    dataset = MedicalImageCaptioningDataset(train_caption_file, train_image_folder, tokenizer, transform)
    dataloader = DataLoader(dataset, batch_size=8, shuffle=True, num_workers=4)
    
    # Initialize model manager
    biomedclip_model_name = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'
    gpt2_model_name = 'gpt2'
    model_manager = ModelManager(biomedclip_model_name, gpt2_model_name)
    
    # Device setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_manager.biomedclip_model.to(device)
    model_manager.gpt2_model.to(device)
    
    # Train the model
    train_model(dataloader, model_manager, device)

if __name__ == "__main__":
    main()


  checkpoint = torch.load(checkpoint_path, map_location=map_location)


AttributeError: module 'open_clip' has no attribute 'get_preprocess'