In [1]:
import os
import pandas as pd
import open_clip
from open_clip import create_model_from_pretrained, get_tokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from PIL import Image
from transformers import BertTokenizer
from torchvision import transforms
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate BiomedCLIP together with its train/validation image transforms and
# its tokenizer. Both calls take the same hub identifier, so it is defined once.
model_name = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'
(
    model,
    preprocess_train,
    preprocess_val,
) = open_clip.create_model_and_transforms(model_name)
tokenizer = get_tokenizer(model_name)

In [3]:
# Caption files of the pre-split dataset (one CSV per split)
train_caption_file = "../Datasets/ROCO2/train_captions.csv"
val_caption_file = "../Datasets/ROCO2/valid_captions.csv"
test_caption_file = "../Datasets/ROCO2/test_captions.csv"

# Folders holding the image files of each split
train_image_folder = "../Datasets/ROCO2/train_images/train/"
val_image_folder = "../Datasets/ROCO2/valid_images/valid/"
test_image_folder = "../Datasets/ROCO2/test_images/test/"


def _load_caption_frame(caption_file):
    """Read one caption CSV and return a frame with columns ``['Image', 'Caption']``.

    Args:
        caption_file: Path of a comma-separated file whose two columns are an
            image ID and its caption.

    Returns:
        A new DataFrame in which ``Image`` is the ID with ``".jpg"`` appended and
        ``Caption`` is the caption text. ``Image`` comes first because
        ``CaptionImageDataset`` reads column 0 as the file name and column 1 as
        the caption.
    """
    # header=0 replaces the file's own header line with `names`. The previous
    # header=1 treated the *second* line as the header, which silently dropped
    # the first caption of every split.
    # NOTE(review): assumes each CSV starts with exactly one header line - confirm.
    captions = pd.read_csv(caption_file, sep=',', header=0, names=['ID', 'Caption'])
    return (
        captions
        .assign(Image=captions['ID'] + ".jpg")
        .loc[:, ['Image', 'Caption']]
    )


# One frame per split, each with the image file name and its caption
train_df = _load_caption_frame(train_caption_file)
val_df = _load_caption_frame(val_caption_file)
test_df = _load_caption_frame(test_caption_file)

In [4]:
# Stand-alone torchvision pipeline: resize to 256, center-crop to 224, convert to
# a tensor, then normalize each channel with the mean/std below.
# NOTE(review): the datasets built in this notebook are given preprocess_train /
# preprocess_val rather than this pipeline - confirm whether it is still needed.
preprocess = transforms.Compose(
    [
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Token length handed to the tokenizer for every caption
context_length = 256

In [5]:
class CaptionImageDataset(Dataset):
    """Dataset yielding ``(preprocessed image, tokenized caption)`` pairs.

    Args:
        captions_df: DataFrame with one row per sample. The image file name and
            the caption are read from the ``'Image'`` and ``'Caption'`` columns
            when both exist; otherwise column 0 is taken as the file name and
            column 1 as the caption.
        image_folder: Directory that contains the image files.
        tokenizer: Callable invoked as ``tokenizer(caption, context_length=...)``.
        preprocess: Callable applied to the RGB ``PIL`` image.
        context_length: Token length forwarded to ``tokenizer``.
    """

    def __init__(self, captions_df, image_folder, tokenizer, preprocess, context_length):
        self.captions_df = captions_df
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.preprocess = preprocess
        self.context_length = context_length

    def __len__(self):
        """Return the number of caption rows."""
        return len(self.captions_df)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, text)``."""
        row = self.captions_df.iloc[idx]
        # Look the fields up by name: the positional order of the frame is easy
        # to get wrong (a frame ordered ['Caption', 'Image'] would otherwise use
        # the caption text as a file name).
        if 'Image' in row.index and 'Caption' in row.index:
            file_name, caption = row['Image'], row['Caption']
        else:
            file_name, caption = row.iloc[0], row.iloc[1]

        # Close the file handle as soon as the pixels have been converted.
        with Image.open(Path(self.image_folder) / file_name) as raw_image:
            image = self.preprocess(raw_image.convert('RGB'))

        text = self.tokenizer(caption, context_length=self.context_length)
        # A tokenizer called with a single string may return shape (1, ctx);
        # drop that leading dimension so batches collate to (batch, ctx)
        # instead of (batch, 1, ctx).
        if text.dim() == 2 and text.size(0) == 1:
            text = text.squeeze(0)

        return image, text

In [6]:
# One dataset per split: training uses the train-time transform, validation and
# test share the validation transform.
train_dataset = CaptionImageDataset(
    captions_df=train_df,
    image_folder=train_image_folder,
    tokenizer=tokenizer,
    preprocess=preprocess_train,
    context_length=context_length,
)
val_dataset = CaptionImageDataset(
    captions_df=val_df,
    image_folder=val_image_folder,
    tokenizer=tokenizer,
    preprocess=preprocess_val,
    context_length=context_length,
)
test_dataset = CaptionImageDataset(
    captions_df=test_df,
    image_folder=test_image_folder,
    tokenizer=tokenizer,
    preprocess=preprocess_val,
    context_length=context_length,
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False, num_workers=4)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, num_workers=4)

In [7]:
import torch
import torch.optim as optim

def train_one_epoch(model, data_loader, optimizer, device):
    """Run one optimization pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: Callable as ``model(images, texts)`` returning a tuple whose first
            two items are the image and text features and whose optional third
            item is the logit scale (treated as ``1.0`` when absent).
        data_loader: Iterable of ``(images, texts)`` tensor batches in which row
            ``i`` of ``images`` belongs to row ``i`` of ``texts``.
        optimizer: Optimizer holding the parameters of ``model``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The average loss over all batches as a float, or ``0.0`` when
        ``data_loader`` yields no batch.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for images, texts in data_loader:
        images = images.to(device)
        texts = texts.to(device)

        optimizer.zero_grad()
        # Index instead of unpacking into two names: the forward pass may return
        # more than two items (features plus logit scale), which made the former
        # two-name unpacking raise ValueError.
        outputs = model(images, texts)
        image_features, text_features = outputs[0], outputs[1]
        logit_scale = outputs[2] if len(outputs) > 2 else 1.0

        logits_per_image = logit_scale * image_features @ text_features.t()
        # The matching caption of image i sits on the diagonal.
        targets = torch.arange(logits_per_image.size(0), device=logits_per_image.device)

        # Symmetric contrastive loss (image->text and text->image). cross_entropy
        # works on raw logits, avoiding the underflow of log(softmax(...)).
        loss = 0.5 * (
            torch.nn.functional.cross_entropy(logits_per_image, targets)
            + torch.nn.functional.cross_entropy(logits_per_image.t(), targets)
        )
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Guard against an empty loader instead of dividing by zero.
    return total_loss / num_batches if num_batches else 0.0

def evaluate(model, data_loader, device):
    """Compute per-batch image->text match probabilities without gradients.

    Args:
        model: Callable as ``model(images, texts)`` returning a tuple whose first
            two items are the image and text features and whose optional third
            item is the logit scale (treated as ``1.0`` when absent).
        data_loader: Iterable of ``(images, texts)`` tensor batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        A ``(probabilities, labels)`` tuple of NumPy arrays, concatenated over
        batches in iteration order. Each row of ``probabilities`` is the softmax
        over the texts of that row's own batch; batches narrower than the widest
        one are right-padded with zeros so the rows can be stacked. ``labels``
        holds the ``texts`` tensors exactly as yielded by the loader. Both arrays
        are empty when the loader yields no batch.
    """
    model.eval()
    batch_probs = []
    batch_texts = []
    with torch.no_grad():
        for images, texts in data_loader:
            images = images.to(device)
            texts = texts.to(device)

            # Index instead of unpacking into two names: the forward pass may
            # return more than two items (features plus logit scale).
            outputs = model(images, texts)
            image_features, text_features = outputs[0], outputs[1]
            logit_scale = outputs[2] if len(outputs) > 2 else 1.0

            probs = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)
            batch_probs.append(probs.cpu())
            batch_texts.append(texts.cpu())

    if not batch_probs:
        return torch.empty((0, 0)).numpy(), torch.empty((0,), dtype=torch.long).numpy()

    # A final, smaller batch yields a narrower square matrix; pad its columns with
    # zero probability so that all rows share one width and can be concatenated.
    width = max(p.size(1) for p in batch_probs)
    padded = [torch.nn.functional.pad(p, (0, width - p.size(1))) for p in batch_probs]

    return torch.cat(padded).numpy(), torch.cat(batch_texts).numpy()


In [8]:
import numpy as np

def save_embeddings(model, data_loader, device, output_file):
    """Encode every batch of ``data_loader`` and store the features in one ``.npz`` file.

    Args:
        model: Callable as ``model(images, texts)`` returning a tuple whose first
            two items are the image and text features; any further items (such
            as a logit scale) are ignored.
        data_loader: Iterable of ``(images, texts)`` tensor batches. Rows are
            written in iteration order, so a shuffling loader produces rows that
            cannot be matched back to the dataset order.
        device: Device the batches are moved to before the forward pass.
        output_file: Destination passed to ``np.savez``.

    Raises:
        ValueError: If ``data_loader`` yields no batch.
    """
    model.eval()
    image_chunks = []
    text_chunks = []
    with torch.no_grad():
        for images, texts in data_loader:
            # Index instead of unpacking into two names: the forward pass may
            # return more than two items (features plus logit scale).
            outputs = model(images.to(device), texts.to(device))
            image_chunks.append(outputs[0].cpu().numpy())
            text_chunks.append(outputs[1].cpu().numpy())

    if not image_chunks:
        raise ValueError("data_loader yielded no batches; nothing to save")

    np.savez(
        output_file,
        image_embeddings=np.concatenate(image_chunks, axis=0),
        text_embeddings=np.concatenate(text_chunks, axis=0),
    )


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Reload the model so that re-running this cell starts training from the
# published checkpoint again, then move it to the device and build the optimizer.
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(model_name)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
num_epochs = 10  # Number of epochs for training

# Destination of the training-set embeddings
embedding_output_file = 'BioMedClipPubMedBERT_embeddings.npz'

# Train the model
for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {train_loss:.4f}")

    # Optionally evaluate on validation set
    val_logits, val_labels = evaluate(model, val_loader, device)
    # TODO: derive validation metrics from val_logits / val_labels.

# train_loader shuffles, so embeddings exported through it would land in a random
# order that cannot be matched back to the rows of train_df. Export through a
# loader that keeps the dataset order instead.
train_embedding_loader = DataLoader(train_dataset, batch_size=32, shuffle=False, num_workers=4)
save_embeddings(model, train_embedding_loader, device, embedding_output_file)

# Save embeddings from the validation set
save_embeddings(model, val_loader, device, 'val_embeddings.npz')

# Optionally save embeddings from the test set
save_embeddings(model, test_loader, device, 'test_embeddings.npz')
