In [8]:
import os
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import clip
import torch
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ninja\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ninja\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Loading Training Dataset

In [2]:
data_directory = "../QL4POMR/Datasets/ROCO2/"
df_train = pd.read_csv(data_directory+"train_captions.csv")
df_train['Image'] = df_train['ID'] + '.jpg'
df_train = df_train.drop(columns=['ID'])
df_train.head()

Unnamed: 0,Caption,Image
0,Head CT demonstrating left parotiditis.,ROCOv2_2023_train_000001.jpg
1,Acquired renal cysts in end-stage renal failur...,ROCOv2_2023_train_000002.jpg
2,Computed tomography of the chest showing the r...,ROCOv2_2023_train_000003.jpg
3,Lateral view of the sacrum showing the low con...,ROCOv2_2023_train_000004.jpg
4,Thoracic CT scan showing perihilar pulmonary l...,ROCOv2_2023_train_000005.jpg


In [9]:
# Define the preprocessing function
def preprocess_caption(caption):
    caption = caption.lower()
    caption = re.sub(r'[^\w\s]', '', caption)
    tokens = word_tokenize(caption)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_caption = ' '.join(tokens)
    return preprocessed_caption

In [10]:
# Apply preprocessing to the captions
df_train['Caption'] = df_train['Caption'].apply(preprocess_caption)
df_train.head()

                                             Caption  \
0             head ct demonstrating left parotiditis   
1  acquired renal cysts endstage renal failure 16...   
2  computed tomography chest showing right breast...   
3  lateral view sacrum showing low contrast bone ...   
4  thoracic ct scan showing perihilar pulmonary l...   

                          Image  
0  ROCOv2_2023_train_000001.jpg  
1  ROCOv2_2023_train_000002.jpg  
2  ROCOv2_2023_train_000003.jpg  
3  ROCOv2_2023_train_000004.jpg  
4  ROCOv2_2023_train_000005.jpg  


In [3]:
class ROCO2Dataset(Dataset):
    def __init__(self, dataframe, image_dir, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.dataframe.iloc[idx, 1])
        image = Image.open(img_name).convert("RGB")
        caption = self.dataframe.iloc[idx, 0]

        if self.transform:
            image = self.transform(image)
        
        return image, caption

In [6]:
# Prepare dataset and dataloader
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Modify dataset to use CLIP's preprocessing
dataset = ROCO2Dataset(df_train, data_directory+"train_images/train/", transform=preprocess)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [7]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, texts in dataloader:
        images = images.to(device)
        texts = clip.tokenize(texts).to(device)
        
        optimizer.zero_grad()
        logits_per_image, logits_per_text = model(images, texts)
        ground_truth = torch.arange(len(images), device=device)
        loss = (torch.nn.CrossEntropyLoss()(logits_per_image, ground_truth) +
                torch.nn.CrossEntropyLoss()(logits_per_text, ground_truth)) / 2
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader)}")

# Save the model
torch.save(model.state_dict(), 'clip_roco2.pth')

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


RuntimeError: Input Computed tomography (CT) of the chest. Numerous enlarged lymph nodes can be seen within the mediastinum. The largest is in the sub-carinal area measuring 3.1 cm x 4.0 cm. The second largest is in the right peritracheal area measuring 2.1 cm x 2.6 cm. Enlarged lymph nodes are also seen within each hilum, left greater than right. is too long for context length 77