In [None]:
pip install torch torchvision transformers


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [None]:
import os
import json
import torch
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class MedicalDataset(Dataset):
    """Image-captioning dataset backed by an image folder and a JSON annotation file.

    The annotation file must decode to an indexable collection of records,
    each providing an ``'image'`` entry (a path joined onto ``image_dir``) and
    a ``'caption'`` entry.

    Args:
        image_dir: Directory that each record's ``'image'`` entry is joined onto.
        annotations_file: Path of the JSON file holding the records.
        transform: Optional callable applied to the RGB ``PIL.Image`` before it
            is returned; ``None`` returns the PIL image unchanged.
    """

    def __init__(self, image_dir, annotations_file, transform=None):
        self.image_dir = image_dir
        # Use a context manager so the handle is closed deterministically, and
        # read as UTF-8 (the JSON interchange encoding) instead of relying on
        # the platform's locale default.
        with open(annotations_file, encoding="utf-8") as annotations_fh:
            self.annotations = json.load(annotations_fh)
        self.transform = transform

    def __len__(self):
        """Return the number of annotation records."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return the ``(image, caption)`` pair for record ``idx``.

        Raises:
            KeyError: If the record lacks an ``'image'`` or ``'caption'`` entry.
        """
        record = self.annotations[idx]
        img_name = os.path.join(self.image_dir, record['image'])
        # convert() returns a new, fully loaded image, so the underlying file
        # can be released as soon as the block exits.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert('RGB')
        # Compare against None: a callable such as an empty nn.Sequential is
        # falsy and would otherwise be skipped silently.
        if self.transform is not None:
            image = self.transform(image)
        return image, record['caption']

# Preprocessing applied to every image: resize to a fixed 224x224 and convert
# the PIL image to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
    ]
)

# Training split: captions come from the JSON annotation file, images from the
# image directory; batches of 32 are drawn in shuffled order.
dataset = MedicalDataset(
    image_dir='path/to/images',
    annotations_file='path/to/annotations.json',
    transform=transform,
)
dataloader = DataLoader(dataset, shuffle=True, batch_size=32)


In [None]:
from transformers import AutoTokenizer, VisionEncoderDecoderModel, ViTImageProcessor

# The tokenizer must share the decoder's vocabulary, so it is loaded from the
# same checkpoint as the model (a GPT-2 decoder) rather than from BERT.
CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
# ViTImageProcessor replaces the deprecated ViTFeatureExtractor alias; the
# variable name is kept because later cells refer to `feature_extractor`.
feature_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
if tokenizer.pad_token is None:
    # GPT-2 style tokenizers may ship without a padding token; batching needs one.
    tokenizer.pad_token = tokenizer.eos_token

# Fine-tuning loop
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()
for epoch in range(5):  # Adjust the number of epochs as needed
    for images, captions in dataloader:
        # `images` were already scaled to [0, 1] by transforms.ToTensor(), so the
        # processor must not divide by 255 a second time.
        inputs = feature_extractor(images, return_tensors="pt", do_rescale=False).pixel_values

        # Append the end-of-sequence token explicitly so the decoder is trained
        # to terminate its captions.
        targets = [caption + tokenizer.eos_token for caption in captions]
        encoded = tokenizer(targets, return_tensors="pt", padding=True, truncation=True)
        # Mask padding through the attention mask: pad and eos may share an id,
        # so masking by pad_token_id could also hide the real end-of-sequence token.
        labels = encoded.input_ids.masked_fill(encoded.attention_mask == 0, -100)

        optimizer.zero_grad()
        outputs = model(pixel_values=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")


In [None]:
# Qualitative evaluation: generate a caption for every validation image and
# print it next to the reference caption.
model.eval()
val_dataset = MedicalDataset(image_dir='path/to/val_images', annotations_file='path/to/val_annotations.json', transform=transform)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# No gradients are needed anywhere in this loop, preprocessing included.
with torch.no_grad():
    for images, captions in val_dataloader:
        # `images` were already scaled to [0, 1] by transforms.ToTensor(), so the
        # processor must not divide by 255 a second time.
        inputs = feature_extractor(images, return_tensors="pt", do_rescale=False).pixel_values

        # Reference captions are only printed, never tokenized: generation does
        # not consume labels.
        outputs = model.generate(inputs)
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        for pred, reference in zip(decoded_preds, captions):
            print(f"Prediction: {pred}, Ground Truth: {reference}")
