In [1]:
!pip install nbdime

Defaulting to user installation because normal site-packages is not writeable
Collecting nbdime
  Downloading nbdime-4.0.2-py3-none-any.whl (5.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting jupyter-server-mathjax>=0.2.2
  Downloading jupyter_server_mathjax-0.2.6-py3-none-any.whl (3.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting gitpython!=2.1.4,!=2.1.5,!=2.1.6
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 KB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [3]:
!nbdime config-git --enable --global

In [4]:
!git status

On branch master
Your branch and 'origin/master' have diverged,
and have 1 and 1 different commits each, respectively.
  (use "git pull" to merge the remote branch into yours)

You have unmerged paths.
  (fix conflicts and run "git commit")
  (use "git merge --abort" to abort the merge)

Unmerged paths:
  (use "git add <file>..." to mark resolution)
	[31mboth modified:   MRSMK.ipynb[m

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	[31m.ipynb_checkpoints/[m
	[31mMRSMK (copy).ipynb[m
	[31mNLMCXR_png.tgz[m
	[31mNLMCXR_reports.tgz[m
	[31miu_xray/[m

no changes added to commit (use "git add" and/or "git commit -a")


## **Medical Report Summarisation using Medical Knowledge**

### **References**

**Main Reference**
- Radiology report generation with medical knowledge and multilevel image-report alignment: A new method and its verification
https://www.sciencedirect.com/science/article/pii/S0933365723002282#bib1



## **Data Collection**

### **Collect Datasets**

In [None]:
'''Libraries Installation and Import'''

# install necessary libraries
!pip install Pillow
!pip install torchvision
!pip install nltk
!pip install pyspellchecker
!pip install tqdm
!pip install opencv-python

# importing required libraries
import os
import requests
import tarfile
import numpy as np
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
from PIL import Image
import torchvision.transforms as transforms
import cv2
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker

In [10]:
'''Setup - Generalized'''

# setup to download the IU X-Ray Dataset
dataset = 'iu_xray/'
download_path = os.path.join('./datasets', dataset)

# from google.colab import drive
# drive.mount('/content/drive')
# download_path = os.path.join('/content/drive/MyDrive/Academics/CS550 Machine Learning/CS550 ASMT MRSMK/datasets', dataset)

images_dir = os.path.join(download_path, "images")
reports_dir = os.path.join(download_path, "reports")

images_url = "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz"
reports_url = "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz"


# function to check the file size of a given URL
def get_file_size(url):
    response = requests.head(url)
    size_in_bytes = int(response.headers.get('Content-Length', 0))
    size_in_mb = size_in_bytes / (1024 * 1024)
    return size_in_mb


# function to download and extract from a given url to a given directory
def download_and_extract(url, save_dir):
    file_name = url.split('/')[-1]
    file_path = os.path.join(save_dir, file_name)

    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('Content-Length', 0))
    downloaded_size = 0

    with open(file_path, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
                downloaded_size += len(chunk)
                percent_complete = (downloaded_size / total_size) * 100
                print(f"Downloaded {downloaded_size / (1024*1024):.2f} MB out of {total_size / (1024*1024):.2f} MB: {percent_complete:.2f}% complete")

    print("\nDownload complete!")

    with tarfile.open(file_path, 'r:gz') as tar:
        members = tar.getmembers()
        total_files = len(members)

        for idx, member in enumerate(members, start=1):
            tar.extract(member, path=save_dir)
            print(f"Extracting File {idx} out of {total_files}: {member.name}")

    os.remove(file_path)


# downloading  IU X-Ray dataset
if not os.path.exists(images_dir):
    images_size = get_file_size(images_url)
    print(f"Downloading {images_url} to: {images_dir} ({images_size:.2f} MB)")
    os.makedirs(images_dir, exist_ok=True)
    download_and_extract(images_url, images_dir)
    print(f"Downloaded {images_url} to: {images_dir}")
else:
    print(f"{images_url} already exists at: {images_dir}")

if not os.path.exists(reports_dir):
    reports_size = get_file_size(reports_url)
    print(f"Downloading {reports_url} to: {reports_dir} ({reports_size:.2f} MB)")
    os.makedirs(reports_dir, exist_ok=True)
    download_and_extract(reports_url, reports_dir)
    print(f"Downloaded {reports_url} to: {reports_dir}")
else:
    print(f"{reports_url} already exists at: {reports_dir}")

Images directory already exists at: iu_xray/images
Reports directory already exists at: iu_xray/reports


In [11]:
'''Exploring the IU X-Ray Dataset Contents'''
download_path = os.path.join('', dataset)
# displaying directory and subdirectory contents
iu_xray = download_path
print("\nPath: ", iu_xray)
print(f"Directory Contents: {os.listdir(iu_xray)}")

iu_xray_images = images_dir
print("\nPath: ", iu_xray_images)
print(f"Directory Contents: {len(os.listdir(iu_xray_images))} Images")

iu_xray_reports = os.path.join(reports_dir, 'ecgen-radiology')
print("\nPath: ", iu_xray_reports)
print(f"Directory Contents: {len(os.listdir(iu_xray_reports))} Reports")


Path:  iu_xray/
Directory Contents: ['images_preprocessed', 'reports', 'iu_xray_data.csv', 'images']

Path:  iu_xray/images
Directory Contents: 7471 Images

Path:  iu_xray/reports/ecgen-radiology
Directory Contents: 3955 Reports


In [None]:
'''Processing Textual Data from each .xml Report File and Storing it in a .csv File'''

# function to iterate through all .xml report files and storing them in a dataframe
def save_images_df():
    data = []
    cnt = 0
    for file in tqdm.tqdm(os.listdir(iu_xray_reports)):
        if file.endswith(".xml"):
            cnt += 1
            print(f"Processing .xml File {cnt} out of {len(os.listdir(iu_xray_reports))}: {file}")

            file_path = os.path.join(iu_xray_reports, file)
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                pmc_id = root.find('.//pmcId').attrib.get('id')

                comparison = indication = findings = impression = None

                for abstract in root.findall('.//AbstractText'):
                    if abstract.attrib.get('Label') == 'COMPARISON':
                        comparison = abstract.text
                    elif abstract.attrib.get('Label') == 'INDICATION':
                        indication = abstract.text
                    elif abstract.attrib.get('Label') == 'FINDINGS':
                        findings = abstract.text
                    elif abstract.attrib.get('Label') == 'IMPRESSION':
                        impression = abstract.text

                for parent_image in root.findall('parentImage'):
                    image_file = parent_image.attrib['id'] + ".png"
                    image_path = os.path.join(iu_xray_images, image_file)
                    image = cv2.imread(image_path)

                    if image is not None:
                        height, width, channels = image.shape
                        caption = parent_image.find('caption').text if parent_image.find('caption') is not None else None
                        data.append([pmc_id, image_file, caption, comparison, indication, findings, impression, height, width])
                    else:
                        print(f"Warning: Unable to read image {image_path}")

            except Exception as e:
                print(f"Error processing file {file}: {e}")

    return data


# create a dataframe and save it as csv
iu_xray_images_df_path = os.path.join(iu_xray, 'iu_xray_images_df.csv')
if not os.path.exists(iu_xray_images_df_path):
    data = save_images_df()
    columns = ['pmc_id', 'image_filename', 'caption', 'comparison', 'indication', 'findings', 'impression', 'height', 'width']
    iu_xray_images_df = pd.DataFrame(data, columns=columns)
    iu_xray_images_df.to_csv(iu_xray_images_df_path, index=False)
    print(f"Dataframe saved to {iu_xray_images_df_path}")
else:
    print(f"Dataframe already exists at {iu_xray_images_df_path}")
    iu_xray_images_df = pd.read_csv(iu_xray_images_df_path)


# display the stored dataframe
print("\n\nDataframe Shape:", iu_xray_images_df.shape)

print("\n\nDataframe Information:\n")
display(iu_xray_images_df.info())

print("\n\nDisplaying Dataframe:\n")
display(iu_xray_images_df.head())

In [None]:
'''Processing Textual Data from each .xml Report File and Storing it in a .csv File'''

# function to iterate through all .xml report files and storing them in a dataframe
def save_reports_df():
    data = []
    cnt = 0
    for file in tqdm.tqdm(os.listdir(iu_xray_reports)):
        if file.endswith(".xml"):
            cnt += 1
            print(f"Processing .xml File {cnt} out of {len(os.listdir(iu_xray_reports))}: {file}")

            file_path = os.path.join(iu_xray_reports, file)
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                pmc_id = root.find('.//pmcId').attrib.get('id')

                comparison = indication = findings = impression = None

                for abstract in root.findall('.//AbstractText'):
                    if abstract.attrib.get('Label') == 'COMPARISON':
                        comparison = abstract.text
                    elif abstract.attrib.get('Label') == 'INDICATION':
                        indication = abstract.text
                    elif abstract.attrib.get('Label') == 'FINDINGS':
                        findings = abstract.text
                    elif abstract.attrib.get('Label') == 'IMPRESSION':
                        impression = abstract.text

                report_data = {
                    'pmc_id': pmc_id,
                    'findings': findings,
                    'impression': impression,
                    'comparison': comparison,
                    'indication': indication,
                }

                parent_images = root.findall('parentImage')
                report_data['image_count'] = len(parent_images)

                for i, parent_image in enumerate(parent_images, start=1):
                    image_file = parent_image.attrib['id'] + ".jpg"
                    caption = parent_image.find('caption').text if parent_image.find('caption') is not None else None
                    report_data[f'image_{i}'] = f"{image_file}: {caption}" if caption else image_file

                data.append(report_data)

            except Exception as e:
                print(f"Error processing file {file}: {e}")

    return data


# create a dataframe and save it as csv
iu_xray_reports_df_path = os.path.join(iu_xray, 'iu_xray_reports_df.csv')
if not os.path.exists(iu_xray_reports_df_path):
    data = save_reports_df()
    iu_xray_reports_df = pd.DataFrame(data)
    iu_xray_reports_df.to_csv(iu_xray_reports_df_path, index=False)
    print(f"Dataframe saved to {iu_xray_reports_df_path}")
else:
    print(f"Dataframe already exists at {iu_xray_reports_df_path}")
    iu_xray_reports_df = pd.read_csv(iu_xray_reports_df_path)


# display the stored dataframe
print("\n\nDataframe Shape:", iu_xray_reports_df.shape)

print("\n\nDataframe Information:\n")
display(iu_xray_reports_df.info())

print("\n\nDisplaying Dataframe:\n")
display(iu_xray_reports_df.head())

In [None]:
'''Displaying the Number of Images per Report'''

# displaying the distribution of number of images per report
reports_count = iu_xray_reports_df['image_count'].value_counts().rename_axis('images_qty').reset_index(name='reports_count')
print("\n\nNumber of Images per Report:\n")
display(reports_count)

### **Preprocess Images**

In [46]:
'''Preprocessing Images - Resizing, Tensor Conversion and Normalization'''

# function to find minimum dimensions of given set of images
def find_min_dimensions(image_dir):
    min_width = float('inf')
    min_height = float('inf')

    for filename in os.listdir(image_dir):
        if filename.endswith('.png'):
            img_path = os.path.join(image_dir, filename)
            with Image.open(img_path) as img:
                width, height = img.size
                min_width = min(min_width, width)
                min_height = min(min_height, height)

    return min_width, min_height


# function to preprocess and save images
def preprocess_images(input_dir, output_dir):
    min_width, min_height = find_min_dimensions(iu_xray_images)
    print(f'Minimum Width: {min_width}, Minimum Height: {min_height}\n')

    preprocess = transforms.Compose([
        transforms.Resize((min_width, min_height)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    os.makedirs(output_dir, exist_ok=True)

    cnt = 0
    for filename in os.listdir(input_dir):
        if filename.endswith('.png'):
            cnt += 1
            print(f"Preprocessing File {cnt} out of {len(os.listdir(input_dir))}: {filename}")

            image_path = os.path.join(input_dir, filename)
            image = Image.open(image_path).convert('RGB')
            processed_image = preprocess(image)

            processed_image_path = os.path.join(output_dir, filename)

            processed_image_pil = transforms.ToPILImage()(processed_image)
            processed_image_pil.save(processed_image_path)


# preprocessing images
iu_xray_images_preprocessed = os.path.join(iu_xray, 'images_preprocessed')
if not os.path.exists(iu_xray_images_preprocessed):
    print(f"Preprocessing Images to: {iu_xray_images_preprocessed}")
    preprocess_images(iu_xray_images, iu_xray_images_preprocessed)
    print(f"Preprocessed Images saved to: {iu_xray_images_preprocessed}")
else :
    print(f"Preprocessed Images already exist at: {iu_xray_images_preprocessed}")

Preprocessing Images to: iu_xray/images_preprocessed
Minimum Width: 512, Minimum Height: 362

Preprocessing File 1 out of 7471: CXR3094_IM-1447-3001.png
Preprocessing File 2 out of 7471: CXR2027_IM-0672-0001-0002.png
Preprocessing File 3 out of 7471: CXR2622_IM-1110-1001.png
Preprocessing File 4 out of 7471: CXR2270_IM-0859-2001.png
Preprocessing File 5 out of 7471: CXR2343_IM-0908-2001.png
Preprocessing File 6 out of 7471: CXR2898_IM-1300-0001-0002.png
Preprocessing File 7 out of 7471: CXR1208_IM-0141-3001.png
Preprocessing File 8 out of 7471: CXR1470_IM-0303-1001.png
Preprocessing File 9 out of 7471: CXR1397_IM-0253-1001.png
Preprocessing File 10 out of 7471: CXR3635_IM-1802-1001.png
Preprocessing File 11 out of 7471: CXR1922_IM-0598-1002.png
Preprocessing File 12 out of 7471: CXR2722_IM-1184-1001.png
Preprocessing File 13 out of 7471: CXR2226_IM-0830-13001.png
Preprocessing File 14 out of 7471: CXR2142_IM-0764-1001.png
Preprocessing File 15 out of 7471: CXR1323_IM-0209-2001.png
Prep

### **Preprocess Text**

<span style="color:red"><b><<<<<<< local</b></span>

In [89]:
!pip install tqdm
!pip install opencv-python
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 KB[0m [31m830.5 kB/s[0m eta [36m0:00:00[0m1m841.0 kB/s[0m eta [36m0:00:01[0m
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2


In [39]:
'''Processing Textual Data from each .xml Report File and Storing it in a .csv File'''

# importing required libraries
import os
import xml.etree.ElementTree as ET
import pandas as pd
import tqdm
from tqdm import tqdm
import cv2


# iterating through all .xml report files and storing them in a dataframe
def save_images_df():
    data = []
    cnt = 0
    for file in tqdm.tqdm(os.listdir(iu_xray_reports)):
        if file.endswith(".xml"):
            cnt += 1
            print(f"Processing .xml File {cnt} out of {len(os.listdir(iu_xray_reports))}: {file}")

            file_path = os.path.join(iu_xray_reports, file)
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                pmc_id = root.find('.//pmcId').attrib.get('id')

                comparison = indication = findings = impression = None

                for abstract in root.findall('.//AbstractText'):
                    if abstract.attrib.get('Label') == 'COMPARISON':
                        comparison = abstract.text
                    elif abstract.attrib.get('Label') == 'INDICATION':
                        indication = abstract.text
                    elif abstract.attrib.get('Label') == 'FINDINGS':
                        findings = abstract.text
                    elif abstract.attrib.get('Label') == 'IMPRESSION':
                        impression = abstract.text

                for parent_image in root.findall('parentImage'):
                    image_file = parent_image.attrib['id'] + ".png"
                    image_path = os.path.join(iu_xray_images, image_file)
                    image = cv2.imread(image_path)

                    if image is not None:
                        height, width, channels = image.shape
                        caption = parent_image.find('caption').text if parent_image.find('caption') is not None else None
                        data.append([pmc_id, image_file, caption, comparison, indication, findings, impression, height, width])
                    else:
                        print(f"Warning: Unable to read image {image_path}")

            except Exception as e:
                print(f"Error processing file {file}: {e}")

    return data


# create a dataframe and save it as csv
iu_xray_images_df_path = os.path.join(iu_xray, 'iu_xray_images_df.csv')
if not os.path.exists(iu_xray_images_df_path):
    data = save_images_df()
    columns = ['pmc_id', 'image_filename', 'caption', 'comparison', 'indication', 'findings', 'impression', 'height', 'width']
    iu_xray_images_df = pd.DataFrame(data, columns=columns)
    iu_xray_images_df.to_csv(iu_xray_images_df_path, index=False)
    print(f"Dataframe saved to {iu_xray_images_df_path}")
else:
    print(f"Dataframe already exists at {iu_xray_images_df_path}")
    iu_xray_images_df = pd.read_csv(iu_xray_images_df_path)

'''Displaying the Stored .csv File'''

# display the stored dataframe
print("\n\nDataframe Shape:", iu_xray_images_df.shape)

print("\n\nDataframe Information:\n")
display(iu_xray_images_df.info())

print("\n\nDisplaying Dataframe:\n")
display(iu_xray_images_df.head())

'''Processing Textual Data from each .xml Report File and Storing it in a .csv File'''

# importing required libraries
import os
import xml.etree.ElementTree as ET
import pandas as pd
import tqdm

# iterating through all .xml report files and storing them in a dataframe
def save_reports_df():
    data = []
    cnt = 0
    for file in tqdm.tqdm(os.listdir(iu_xray_reports)):
        if file.endswith(".xml"):
            cnt += 1
            print(f"Processing .xml File {cnt} out of {len(os.listdir(iu_xray_reports))}: {file}")

            file_path = os.path.join(iu_xray_reports, file)
            try:
                tree = ET.parse(file_path)
                root = tree.getroot()

                pmc_id = root.find('.//pmcId').attrib.get('id')

                comparison = indication = findings = impression = None

                for abstract in root.findall('.//AbstractText'):
                    if abstract.attrib.get('Label') == 'COMPARISON':
                        comparison = abstract.text
                    elif abstract.attrib.get('Label') == 'INDICATION':
                        indication = abstract.text
                    elif abstract.attrib.get('Label') == 'FINDINGS':
                        findings = abstract.text
                    elif abstract.attrib.get('Label') == 'IMPRESSION':
                        impression = abstract.text

                report_data = {
                    'pmc_id': pmc_id,
                    'findings': findings,
                    'impression': impression,
                    'comparison': comparison,
                    'indication': indication,
                    'images': {}
                }

                for parent_image in root.findall('parentImage'):
                    image_file = parent_image.attrib['id'] + ".jpg"
                    caption = parent_image.find('caption').text if parent_image.find('caption') is not None else None
                    report_data['images'][image_file] = caption

                data.append(report_data)

            except Exception as e:
                print(f"Error processing file {file}: {e}")

    return data


# create a dataframe and save it as csv
iu_xray_reports_df_path = os.path.join(iu_xray, 'iu_xray_reports_df.csv')
if not os.path.exists(iu_xray_reports_df_path):
    data = save_reports_df()
    iu_xray_reports_df = pd.DataFrame(data)
    iu_xray_reports_df.to_csv(iu_xray_reports_df_path, index=False)
    print(f"Dataframe saved to {iu_xray_reports_df_path}")
else:
    print(f"Dataframe already exists at {iu_xray_reports_df_path}")
    iu_xray_reports_df = pd.read_csv(iu_xray_reports_df_path)

'''Displaying the Stored .csv File'''

# display the stored dataframe
print("\n\nDataframe Shape:", iu_xray_reports_df.shape)

print("\n\nDataframe Information:\n")
display(iu_xray_reports_df.info())

print("\n\nDisplaying Dataframe:\n")
display(iu_xray_reports_df.head())

'''Displaying the Number of Images per Report'''

# displaying the distribution of number of images per report
img_count = iu_xray_reports_df['images'].apply(lambda x: len(x) if isinstance(x, dict) else 0)
reports_count = img_count.value_counts().rename_axis('images_qty').reset_index(name='reports_count')
print("\n\nNumber of Images per Report:\n")
display(reports_count)



Dataframe already exists at iu_xray/iu_xray_images_df.csv


Dataframe Shape: (7470, 9)


Dataframe Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7470 entries, 0 to 7469
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   pmc_id          7470 non-null   int64 
 1   image_filename  7470 non-null   object
 2   caption         7468 non-null   object
 3   comparison      5210 non-null   object
 4   indication      7311 non-null   object
 5   findings        6473 non-null   object
 6   impression      7418 non-null   object
 7   height          7470 non-null   int64 
 8   width           7470 non-null   int64 
dtypes: int64(3), object(6)
memory usage: 525.4+ KB


None



Displaying Dataframe:



Unnamed: 0,pmc_id,image_filename,caption,comparison,indication,findings,impression,height,width
0,779,CXR779_IM-2321-1001.png,"Radiographs of the chest, 2 views, dated XXXX,...","CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,420,512
1,779,CXR779_IM-2321-2001.png,"Radiographs of the chest, 2 views, dated XXXX,...","CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,624,512
2,1102,CXR1102_IM-0069-12012.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",420,512
3,1102,CXR1102_IM-0069-2001.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",512,512
4,1102,CXR1102_IM-0069-3001.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",512,512


Dataframe already exists at iu_xray/iu_xray_reports_df.csv


Dataframe Shape: (3955, 6)


Dataframe Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3955 entries, 0 to 3954
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   pmc_id      3955 non-null   int64 
 1   findings    3425 non-null   object
 2   impression  3921 non-null   object
 3   comparison  2757 non-null   object
 4   indication  3865 non-null   object
 5   images      3955 non-null   object
dtypes: int64(1), object(5)
memory usage: 185.5+ KB


None



Displaying Dataframe:



Unnamed: 0,pmc_id,findings,impression,comparison,indication,images
0,779,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,"CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,{'CXR779_IM-2321-1001.jpg': 'Radiographs of th...
1,1102,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,{'CXR1102_IM-0069-12012.jpg': 'AP and lateral ...
2,1574,Lungs are clear bilaterally. Cardiac and media...,No acute cardiopulmonary abnormality.,"XXXX, XXXX.","Shortness of breath, wheezing, XXXX.",{'CXR1574_IM-0374-1001.jpg': 'PA and lateral c...
3,2833,"The heart, pulmonary XXXX and mediastinum are ...",No acute cardiopulmonary disease.,Two views of the chest dated XXXX.,XXXX-year-old male with XXXX's disease. Chroni...,{'CXR2833_IM-1249-1001.jpg': 'PA and lateral c...
4,3216,"Heart size is normal. No pneumothorax, pleural...",Normal chest radiograph.,None available.,XXXX-year-old female with dyspnea.,{'CXR3216_IM-1520-1001.jpg': 'Chest XXXX and l...




Number of Images per Report:



Unnamed: 0,images_qty,reports_count
0,0,3955


In [40]:
iu_xray_reports_df.head()

Unnamed: 0,pmc_id,findings,impression,comparison,indication,images
0,779,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,"CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,{'CXR779_IM-2321-1001.jpg': 'Radiographs of th...
1,1102,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,{'CXR1102_IM-0069-12012.jpg': 'AP and lateral ...
2,1574,Lungs are clear bilaterally. Cardiac and media...,No acute cardiopulmonary abnormality.,"XXXX, XXXX.","Shortness of breath, wheezing, XXXX.",{'CXR1574_IM-0374-1001.jpg': 'PA and lateral c...
3,2833,"The heart, pulmonary XXXX and mediastinum are ...",No acute cardiopulmonary disease.,Two views of the chest dated XXXX.,XXXX-year-old male with XXXX's disease. Chroni...,{'CXR2833_IM-1249-1001.jpg': 'PA and lateral c...
4,3216,"Heart size is normal. No pneumothorax, pleural...",Normal chest radiograph.,None available.,XXXX-year-old female with dyspnea.,{'CXR3216_IM-1520-1001.jpg': 'Chest XXXX and l...


In [41]:
iu_xray_images_df.head()

Unnamed: 0,pmc_id,image_filename,caption,comparison,indication,findings,impression,height,width
0,779,CXR779_IM-2321-1001.png,"Radiographs of the chest, 2 views, dated XXXX,...","CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,420,512
1,779,CXR779_IM-2321-2001.png,"Radiographs of the chest, 2 views, dated XXXX,...","CT chest, dated XXXX, XXXX.",XXXX-year-old female. Pain after XXXX.,The cardiomediastinal silhouette is normal in ...,Negative for acute abnormality.,624,512
2,1102,CXR1102_IM-0069-12012.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",420,512
3,1102,CXR1102_IM-0069-2001.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",512,512
4,1102,CXR1102_IM-0069-3001.png,"AP and lateral views of the chest dated XXXX, ...","XXXX, XXXX.",Shortness of breath. Unable to XXXX XXXX for l...,There is stable cardiomegaly with XXXX pulmona...,"1. Cardiomegaly, vascular congestion and proba...",512,512


In [42]:
# function for obtaining the different information part of the xml report file and preprocessing them and also adding the concernced image and report information to the dataframe
def decontracted(phrase): #https://stackoverflow.com/a/47091490
  """
  performs text decontraction of words like won't to will not
  """
  # specific
  phrase = re.sub(r"won\'t", "will not", phrase)
  phrase = re.sub(r"can\'t", "can not", phrase)

  # general
  phrase = re.sub(r"n\'t", " not", phrase)
  phrase = re.sub(r"\'re", " are", phrase)
  phrase = re.sub(r"\'s", " is", phrase)
  phrase = re.sub(r"\'d", " would", phrase)
  phrase = re.sub(r"\'ll", " will", phrase)
  phrase = re.sub(r"\'t", " not", phrase)
  phrase = re.sub(r"\'ve", " have", phrase)
  phrase = re.sub(r"\'m", " am", phrase)
  return phrase


In [90]:
from tqdm import tqdm
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
import seaborn as sns

def preprocess_text(data): #https://regex101.com/
  """
  extracts the information data from the xml file and does text preprocessing on them
  here info can be 1 value in this list ["COMPARISON","INDICATION","FINDINGS","IMPRESSION"]
  """
  preprocessed = []

  for sentence in tqdm(data.values):

    sentence = BeautifulSoup(sentence, 'lxml').get_text()

    regex = r"\d." 
    sentence = re.sub(regex,"",sentence) #removing all values like "1." and "2." etc

    regex = r"X+"
    sentence = re.sub(regex,"",sentence) #removing words like XXXX

    regex = r"[^.a-zA-Z]" 
    sentence = re.sub(regex," ",sentence) #removing all special characters except for full stop

    regex = r"http\S+"
    sentence = re.sub(regex,"", sentence)
    sentence = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?$%^&*'/+\[\]_]+", "", sentence)
    sentence = re.sub('&', 'and',sentence)
    sentence = re.sub('@', 'at',sentence)
    sentence = re.sub('0', 'zero',sentence)
    sentence = re.sub('1', 'one',sentence)
    sentence = re.sub('2', 'two',sentence)
    sentence = re.sub('3', 'three',sentence)
    sentence = re.sub('4', 'four',sentence)
    sentence = re.sub('5', 'five',sentence)
    sentence = re.sub('6', 'six',sentence)
    sentence = re.sub('7', 'seven',sentence)
    sentence = re.sub('8', 'eight',sentence)
    sentence = re.sub('9', 'nine',sentence)
    sentence = re.sub('year old', "", sentence)#Occur multiple times in Indication feature but not necessary     
    sentence = re.sub('yearold', "", sentence)
    sentence = decontracted(sentence) #perform decontraction
    sentence = sentence.strip().lower() #strips the begining and end of the string of spaces and converts all into lowercase
    sentence = " ".join(sentence.split()) #removes unwanted spaces
    if sentence=="": #if the resulting sentence is an empty string return null value
      sentence = np.nan
    preprocessed.append(sentence)
  return preprocessed

In [91]:
# Check for NaN values in each dataframe
NaN_reports = iu_xray_reports_df.isnull().sum()
NaN_images = iu_xray_images_df.isnull().sum()

# Print total NaN values for relevant columns in iu_xray_reports_df
print("Total NaN Values in 'comparison' column (reports):", NaN_reports['comparison'])
print("Total NaN Values in 'indication' column (reports):", NaN_reports['indication'])
print("Total NaN Values in 'findings' column (reports):", NaN_reports['findings'])
print("Total NaN Values in 'impression' column (reports):", NaN_reports['impression'])

# Print total NaN values for relevant columns in iu_xray_images_df
print("Total NaN Values in 'caption' column (images):", NaN_images['caption'])
print("Total NaN Values in 'comparison' column (images):", NaN_images['comparison'])
print("Total NaN Values in 'indication' column (images):", NaN_images['indication'])
print("Total NaN Values in 'findings' column (images):", NaN_images['findings'])
print("Total NaN Values in 'impression' column (images):", NaN_images['impression'])

# Replacing NaN values in iu_xray_reports_df
iu_xray_reports_df['comparison'] = iu_xray_reports_df['comparison'].fillna('No Comparison')
iu_xray_reports_df['indication'] = iu_xray_reports_df['indication'].fillna('No Indication')
iu_xray_reports_df['findings'] = iu_xray_reports_df['findings'].fillna('No Findings')
iu_xray_reports_df['impression'] = iu_xray_reports_df['impression'].fillna('No Impression')

# Replacing NaN values in iu_xray_images_df
iu_xray_images_df['caption'] = iu_xray_images_df['caption'].fillna('Unknown')
iu_xray_images_df['comparison'] = iu_xray_images_df['comparison'].fillna('No Comparison')
iu_xray_images_df['indication'] = iu_xray_images_df['indication'].fillna('No Indication')
iu_xray_images_df['findings'] = iu_xray_images_df['findings'].fillna('No Findings')
iu_xray_images_df['impression'] = iu_xray_images_df['impression'].fillna('No Impression')


iu_xray_reports_df['findings'] = preprocess_text(iu_xray_reports_df['findings'])
iu_xray_reports_df['impression'] = preprocess_text(iu_xray_reports_df['impression'])
iu_xray_reports_df['comparison'] = preprocess_text(iu_xray_reports_df['comparison'])
iu_xray_reports_df['indication'] = preprocess_text(iu_xray_reports_df['indication'])

# Preprocess columns in the images dataframe
iu_xray_images_df['caption'] = preprocess_text(iu_xray_images_df['caption'])
iu_xray_images_df['comparison'] = preprocess_text(iu_xray_images_df['comparison'])
iu_xray_images_df['indication'] = preprocess_text(iu_xray_images_df['indication'])
iu_xray_images_df['findings'] = preprocess_text(iu_xray_images_df['findings'])
iu_xray_images_df['impression'] = preprocess_text(iu_xray_images_df['impression'])

iu_xray_reports_df.replace("", float("NaN"), inplace=True)
iu_xray_reports_df.replace("", float("NaN"), inplace=True)

iu_xray_reports_df.to_csv('processed_iu_xray_reports_df.csv', index=False)
iu_xray_images_df.to_csv('processed_iu_xray_images_df.csv', index=False)

print("Preprocessed dataframes saved as 'processed_iu_xray_reports_df.csv' and 'processed_iu_xray_images_df.csv'")

NameError: name 'iu_xray_reports_df' is not defined

In [56]:
iu_xray_reports_df.head(10)

Unnamed: 0,pmc_id,findings,impression,comparison,indication,images
0,779,the cardiomediastinal silhouette is normal in ...,negative for acute abnormality,ct chest dated,female pain after,{'CXR779_IM-2321-1001.jpg': 'Radiographs of th...
1,1102,there is stable cardiomegaly with pulmonary va...,cardiomegaly vascular congestion and probable ...,,shortness of breath unable to for lateral view,{'CXR1102_IM-0069-12012.jpg': 'AP and lateral ...
2,1574,lungs are clear bilaterally cardiac and medias...,no acute cardiopulmonary abnormality,,shortness of breath wheezing,{'CXR1574_IM-0374-1001.jpg': 'PA and lateral c...
3,2833,the heart pulmonary and mediastinum are within...,no acute cardiopulmonary disease,two views of the chest dated,male with s disease chronic fatigue syndrome c...,{'CXR2833_IM-1249-1001.jpg': 'PA and lateral c...
4,3216,heart size is normal no pneumothorax pleural e...,normal chest radiograph,none available,female with dyspnea,{'CXR3216_IM-1520-1001.jpg': 'Chest XXXX and l...
5,2704,frontal and lateral views of the chest show an...,indeterminant small nodular opacities may be g...,,abscess post left mastectomy,"{'CXR2704_IM-1171-1001.jpg': 'CHEST, Two (2) V..."
6,563,the examination consists of frontal and latera...,no evidence of acute cardiopulmonary process,none,rib pain,{'CXR563_IM-2164-1001.jpg': 'Xray Chest PA and...
7,282,mediastinal contours are within normal limits ...,no acute cardiopulmonary abnormality,none,preop hernia repair asthma,{'CXR282_IM-1243-1001.jpg': 'PA and lateral vi...
8,3060,the heart is normal in size the mediastinum is...,no acute disease,,atelectasis,{'CXR3060_IM-1426-1003.jpg': 'CHEST 2V FRONTAL...
9,1829,and lateral chest examination was obtained the...,no pneumothorax following removal of left side...,,status post chest tube removal,{'CXR1829_IM-0537-1001.jpg': 'Chest PA and lat...


In [57]:
iu_xray_reports_df.replace("", float("NaN"), inplace=True)
iu_xray_reports_df.head(10)

Unnamed: 0,pmc_id,findings,impression,comparison,indication,images
0,779,the cardiomediastinal silhouette is normal in ...,negative for acute abnormality,ct chest dated,female pain after,{'CXR779_IM-2321-1001.jpg': 'Radiographs of th...
1,1102,there is stable cardiomegaly with pulmonary va...,cardiomegaly vascular congestion and probable ...,,shortness of breath unable to for lateral view,{'CXR1102_IM-0069-12012.jpg': 'AP and lateral ...
2,1574,lungs are clear bilaterally cardiac and medias...,no acute cardiopulmonary abnormality,,shortness of breath wheezing,{'CXR1574_IM-0374-1001.jpg': 'PA and lateral c...
3,2833,the heart pulmonary and mediastinum are within...,no acute cardiopulmonary disease,two views of the chest dated,male with s disease chronic fatigue syndrome c...,{'CXR2833_IM-1249-1001.jpg': 'PA and lateral c...
4,3216,heart size is normal no pneumothorax pleural e...,normal chest radiograph,none available,female with dyspnea,{'CXR3216_IM-1520-1001.jpg': 'Chest XXXX and l...
5,2704,frontal and lateral views of the chest show an...,indeterminant small nodular opacities may be g...,,abscess post left mastectomy,"{'CXR2704_IM-1171-1001.jpg': 'CHEST, Two (2) V..."
6,563,the examination consists of frontal and latera...,no evidence of acute cardiopulmonary process,none,rib pain,{'CXR563_IM-2164-1001.jpg': 'Xray Chest PA and...
7,282,mediastinal contours are within normal limits ...,no acute cardiopulmonary abnormality,none,preop hernia repair asthma,{'CXR282_IM-1243-1001.jpg': 'PA and lateral vi...
8,3060,the heart is normal in size the mediastinum is...,no acute disease,,atelectasis,{'CXR3060_IM-1426-1003.jpg': 'CHEST 2V FRONTAL...
9,1829,and lateral chest examination was obtained the...,no pneumothorax following removal of left side...,,status post chest tube removal,{'CXR1829_IM-0537-1001.jpg': 'Chest PA and lat...


<span style="color:red"><b>=======</b></span>

In [None]:
'''Preprocessing Text - Lowercasing, Decontracting, Punctuation Removal, Number Removal, Two-Letter Word Removal, Stop Word Removal, Negation Handling, Spell Checking, Extra Space Removal, Stemming, Lemmatization,'''

# download nltk resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


# initialize stemmer, lemmatizer, and spell checker
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
spell = SpellChecker()


# function to convert text to lowercase
def lowercase(text):
    return [line.lower() for line in text]


# function to decontract words
def decontracted(phrase):
    contractions = {
        "won't": "will not", "can't": "can not", "couldn't": "could not",
        "shouldn't": "should not", "wouldn't": "would not", "n't": " not",
        "'re": " are", "'s": " is", "'d": " would", "'ll": " will",
        "'t": " not", "'ve": " have", "'m": " am"
    }
    for contraction, full_form in contractions.items():
        phrase = phrase.replace(contraction, full_form)
    return phrase


# function to remove punctuations
def rem_punctuations(text):
    return [re.sub(r'[^\w\s]', '', line) for line in text]


# function to remove numbers
def rem_numbers(text):
    return [re.sub(r'\d+', '', line) for line in text]


# function to remove two-letter words except "no" and "ct"
def rem_two_letter_words(text):
    return [' '.join(word for word in line.split() if len(word) > 2 or word in ["no", "ct"]) for line in text]


# function to remove stop words
def rem_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return [' '.join(word for word in line.split() if word not in stop_words) for line in text]


# function to handle negations
def handle_negations(text):
    negations = {"no": "not", "not": "not"}
    return [' '.join(negations.get(word, word) for word in line.split()) for line in text]


# function to correct spelling
def correct_spelling(text):
    corrected = []
    for line in text:
        corrected_line = ' '.join([list(spell.candidates(word))[0] if spell.candidates(word) else word for word in line.split()])
        corrected.append(corrected_line)
    return corrected


# function to remove extra spaces
def rem_extra_spaces(text):
    return [' '.join(line.split()) for line in text]


# function to apply stemming
def apply_stemming(text):
    return [' '.join(stemmer.stem(word) for word in line.split()) for line in text]


# function to apply lemmatization
def apply_lemmatization(text):
    return [' '.join(lemmatizer.lemmatize(word) for word in line.split()) for line in text]


# function to preprocess text
def preprocess_text(data):
    preprocessed = []
    for sentence in tqdm(data.values):
        sentence = lowercase(sentence)
        sentence = decontracted(sentence)
        sentence = rem_punctuations([sentence])[0]
        sentence = rem_numbers([sentence])[0]
        sentence = rem_two_letter_words([sentence])[0]
        sentence = rem_stop_words([sentence])[0]
        sentence = handle_negations([sentence])[0]
        sentence = correct_spelling([sentence])[0]
        sentence = rem_extra_spaces([sentence])[0]
        sentence = apply_stemming([sentence])[0]
        sentence = apply_lemmatization([sentence])[0]

        preprocessed.append(sentence)

    return preprocessed

In [None]:
# function to preprocess text and save the corresponding dataframe
def preprocess_and_save_dataframe(dataframe, path):
    columns_to_preprocess = {
        'caption': 'unknown',
        'comparison': 'no comparison',
        'indication': 'no indication',
        'findings': 'no findings',
        'impression': 'no impression'
    }

    for column, fill_value in columns_to_preprocess.items():
        if column in dataframe.columns:
            print(f"Preprocessing Column: {column}")
            dataframe[column] = dataframe[column].fillna(fill_value)
            dataframe[column] = preprocess_text(dataframe[column])

    dataframe.to_csv(path, index=False)

    return dataframe


# save and display the preprocessed dataframe
# iu_xray_images_df_preprocessed_path = os.path.join(iu_xray, 'iu_xray_images_df_preprocessed.csv')
# if not os.path.exists(iu_xray_images_df_preprocessed_path):
#     print(f"Preprocessing Text Dataframe {iu_xray_images_df_path} to: {iu_xray_images_df_preprocessed_path}")
#     iu_xray_images_df_preprocessed = preprocess_and_save_dataframe(iu_xray_images_df, iu_xray_images_df_preprocessed_path)
#     print(f"Preprocessed Text DataFrame {iu_xray_images_df_path} saved to: {iu_xray_images_df_preprocessed_path}")
# else:
#     print(f"Preprocessed Text DataFrame {iu_xray_images_df_path} already exists at: {iu_xray_images_df_preprocessed_path}")
#     iu_xray_images_df_preprocessed = pd.read_csv(iu_xray_images_df_preprocessed_path)
# display(iu_xray_images_df_preprocessed.head())

iu_xray_reports_df_preprocessed_path = os.path.join(iu_xray, 'iu_xray_reports_df_preprocessed.csv')
if not os.path.exists(iu_xray_reports_df_preprocessed_path):
    print(f"Preprocessing Text DataFrame {iu_xray_reports_df_path} to: {iu_xray_reports_df_preprocessed_path}")
    iu_xray_reports_df_preprocessed = preprocess_and_save_dataframe(iu_xray_reports_df, iu_xray_reports_df_preprocessed_path)
    print(f"Preprocessed Text DataFrame {iu_xray_reports_df_path} saved to: {iu_xray_reports_df_preprocessed_path}")
else:
    print(f"Preprocessed Text DataFrame {iu_xray_reports_df_path} already exists at: {iu_xray_reports_df_preprocessed_path}")
    iu_xray_reports_df_preprocessed = pd.read_csv(iu_xray_reports_df_preprocessed_path)
display(iu_xray_reports_df_preprocessed.head())

<span style="color:red"><b>>>>>>>> remote</b></span>

### **Create Data Loaders**

## **Model Implementation**

### **Visual Extractor**

### **Text Encoder**

### **Multilevel Alignment**

### **Report Generator**

### **Complete Model**

## **Training**

### **Training**

## **Testing**

### **Testing**

## **Dataset Download as Zip File**

In [None]:
'''Downloading Dataset Directory with all Changes'''

# importing required libraries
import shutil
import os
from google.colab import files


# zipping and downloading the archive
zip_filename = 'IUXR.zip'
shutil.make_archive(zip_filename[:-4], 'zip', download_path)
files.download(zip_filename)