In [1]:
pip install kaggle



In [2]:
 mkdir ~/.kaggle

In [3]:
 cp kaggle.json ~/.kaggle/

cp: cannot stat 'kaggle.json': No such file or directory


In [4]:
 !chmod 600 ~/.kaggle/kaggle.json

chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [5]:
!https://www.kaggle.com/datasets/virajbagal/roco-dataset

/bin/bash: line 1: https://www.kaggle.com/datasets/virajbagal/roco-dataset: No such file or directory


In [6]:
# downloading the dataset as a zip file
! kaggle datasets download virajbagal/roco-dataset

Dataset URL: https://www.kaggle.com/datasets/virajbagal/roco-dataset
License(s): CC0-1.0
Downloading roco-dataset.zip to /content
100% 6.19G/6.19G [05:01<00:00, 16.3MB/s]
100% 6.19G/6.19G [05:01<00:00, 22.1MB/s]


In [None]:
# unziping the downloaded zip file
! unzip roco-dataset.zip

In [8]:
import pandas as pd
import os
from PIL import Image
from PIL import UnidentifiedImageError
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy
import nltk
import torch
import torchvision.transforms as transforms


In [9]:
# downloading the necessaries to proceed with data preprocessing
nlp = spacy.load("en_core_web_sm")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|ftp\S+', '', text) # removing the links
    text = text.replace('\n', ' ') # removing the new lines
    text = re.sub(r'\w*\d\w*', '', text) # removing the words containing numbers
    text = re.sub(r'\s+', ' ', text).strip() # removing the spaces
    text = re.sub(r'[^\w\s]', '', text) # removing special characters
    words = text.split()
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words] # considering only normal words
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words] # considering the stemmed words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words] #considering the lemmatized words
    text = ' '.join(words)
    return text

In [11]:
caption_df = pd.read_csv('/content/all_data/train/radiologytraindata.csv')

In [12]:
caption_df.head()


Unnamed: 0,id,name,caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...


In [13]:
# applying the data preprocessing function to every caption and creating a new column
caption_df['cleaned_caption'] = caption_df['caption'].apply(preprocess_text)

In [14]:
caption_df.head()

Unnamed: 0,id,name,caption,cleaned_caption
0,ROCO_00002,PMC4083729_AMHSR-4-14-g002.jpg,Computed tomography scan in axial view showin...,comput tomographi scan axial view show obliter...
1,ROCO_00003,PMC2837471_IJD2009-150251.001.jpg,Bacterial contamination occurred after comple...,bacteri contamin occur complet root canal trea...
2,ROCO_00004,PMC2505281_11999_2007_30_Fig6_HTML.jpg,The patient had residual paralysis of the han...,patient residu paralysi hand poliomyel necessa...
3,ROCO_00005,PMC3745845_IJD2013-683423.005.jpg,Panoramic radiograph after immediate loading.\n,panoram radiograph immedi load
4,ROCO_00007,PMC4917066_amjcaserep-17-301-g001.jpg,Plain abdomen x-ray: Multiple air levels at t...,plain abdomen xray multipl air level midabdome...


In [15]:
# saving the updated dataframe to a csv file
caption_df.to_csv('updated_captions.csv', index=False)

In [16]:
'''def preprocess_image(image_path):
    with Image.open(image_path) as img:
        img = img.resize((224, 224))  # Resize to 224x224 or any size you need
        img = np.array(img) / 255.0  # Normalize the image
    return img'''

'def preprocess_image(image_path):\n    with Image.open(image_path) as img:\n        img = img.resize((224, 224))  # Resize to 224x224 or any size you need\n        img = np.array(img) / 255.0  # Normalize the image\n    return img'

In [17]:
# function for preprocessing the image and converting the image into tensor for better preprocessing
def preprocess_image(image_path):
    try:
        img = Image.open(image_path).convert('RGB')
        preprocess = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        img_tensor = preprocess(img).unsqueeze(0)  # Add batch dimension
        img_tensor = img_tensor.to(device)
        return img_tensor
    except UnidentifiedImageError:
        print(f"Cannot identify image file: {image_path}")
        return None

In [18]:
# defining the directories where images can be located
image_dirs = ['/content/all_data/train/non-radiology/images','/content/all_data/train/radiology/images']

In [19]:
# function for finding the image
def find_image(image_name):
    for dir in image_dirs:
        image_path = os.path.join(dir, image_name)
        if os.path.exists(image_path):
            return image_path
    return None

In [20]:
preprocessed_data = []

In [21]:
# for using the gpu ram and resources instead of cpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [22]:
# samples and batches are created due to overload of cpu because of large dataset
# applying the preprocessed image function for every image in the sample length.
max_samples = 1000
processed_samples = 0

for index, row in caption_df.iterrows():
    if processed_samples >= max_samples:
        break
    image_path = find_image(row['name'])
    if image_path:
        preprocessed_image = preprocess_image(image_path)
        if preprocessed_image is not None:
            preprocessed_data.append((row['cleaned_caption'], preprocessed_image))
            processed_samples += 1
    else:
        print(f"Image {row['name']} not found.")


In [23]:
# saving and mapping the images,captions in a single dataframe for the testing,validation with labels.
captions = [item[0] for item in preprocessed_data]
images = [item[1].cpu().numpy() for item in preprocessed_data]
preprocessed_df = pd.DataFrame({'cleaned_caption': captions, 'preprocessed_image': images})
torch.save(preprocessed_df, 'preprocessed_data.pt')

print(preprocessed_df)

                                       cleaned_caption  \
0    comput tomographi scan axial view show obliter...   
1    bacteri contamin occur complet root canal trea...   
2    patient residu paralysi hand poliomyel necessa...   
3                       panoram radiograph immedi load   
4    plain abdomen xray multipl air level midabdome...   
..                                                 ...   
995  anoth section contrastenhanc abdomin ct scan o...   
996               highresolut comput tomographi thorax   
997  parastern long axi view show hypertrabecul ant...   
998  transcathet arteri embol sequenc shown right h...   
999                           preoper parana sinu view   

                                    preprocessed_image  
0    [[[[-1.4157891 -1.3986644 -1.3815396 -1.381539...  
1    [[[[2.2489083 2.2489083 2.2489083 2.2489083 2....  
2    [[[[-0.31980482 -1.1931673  -1.2787911  -1.364...  
3    [[[[2.2489083 2.2489083 2.2489083 2.2489083 2....  
4    [[[[-0.028684