## **Medical Report Summarisation using Medical Knowledge**

### **References**

**Main Reference**
- [Radiology report generation with medical knowledge and multilevel image-report alignment: A new method and its verification](https://www.sciencedirect.com/science/article/pii/S0933365723002282)



## **Data Collection**

### **Collect Datasets**

In [None]:
'''Setup - Generalized'''

# importing required libraries
import os
import tarfile
import requests


# IU X-Ray dataset: local directory layout and remote archive locations
dataset = 'iu_xray/'
download_path = os.path.join('./datasets/', dataset)

# images and reports are kept in sibling sub-directories of the dataset root
images_dir, reports_dir = (
    os.path.join(download_path, subdir) for subdir in ("images", "reports")
)

# gzipped tarballs hosted on openi.nlm.nih.gov
images_url = "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_png.tgz"
reports_url = "https://openi.nlm.nih.gov/imgs/collections/NLMCXR_reports.tgz"


# function to check the file size of a given URL
def get_file_size(url, timeout=30):
    """Return the size of the resource at *url* in mebibytes.

    The size is read from the ``Content-Length`` header of a HEAD request, so
    nothing is downloaded. If the server does not send that header (or sends
    it empty) the result is ``0.0``.

    Args:
        url: Address of the remote file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The advertised size in MB (1 MB = 1024 * 1024 bytes) as a float.
    """
    # requests.head() does not follow redirects unless asked to; without this a
    # redirected URL would report the size of the redirect response instead.
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    size_in_bytes = int(response.headers.get('Content-Length') or 0)
    return size_in_bytes / (1024 * 1024)


# function to download and extract from a given url to a given directory
def download_and_extract(url, save_dir, chunk_size=1024 * 1024, timeout=30):
    """Download the ``.tgz`` archive at *url* and unpack it into *save_dir*.

    The archive is streamed to ``save_dir/<last URL segment>``, extracted
    member by member (printing progress for each one) and then deleted. The
    archive file is removed even when the download or extraction fails, so a
    failed run does not leave a partial archive behind.

    Args:
        url: Address of a gzip-compressed tar archive.
        save_dir: Directory that receives the extracted files; created if
            it does not exist yet.
        chunk_size: Number of bytes written to disk per streamed chunk.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If an archive member would be written outside *save_dir*.
    """
    os.makedirs(save_dir, exist_ok=True)

    file_name = url.split('/')[-1]
    file_path = os.path.join(save_dir, file_name)
    extract_root = os.path.realpath(save_dir)

    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # fail here rather than saving an HTML error page as the "archive"
            response.raise_for_status()
            with open(file_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)

        with tarfile.open(file_path, 'r:gz') as tar:
            members = tar.getmembers()
            total_files = len(members)

            for idx, member in enumerate(members, start=1):
                # refuse member names such as "../../x" or absolute paths that
                # would land outside the destination directory
                target = os.path.realpath(os.path.join(save_dir, member.name))
                if os.path.commonpath([extract_root, target]) != extract_root:
                    raise ValueError(f"Unsafe path in archive: {member.name}")

                tar.extract(member, path=save_dir)
                print(f"Extracting File {idx} out of {total_files}: {member.name}")
    finally:
        # the archive is only a means to an end; never leave it behind
        if os.path.exists(file_path):
            os.remove(file_path)


# downloading IU X-Ray dataset
# The target directory is created before the download starts, so a directory
# that exists but is empty is what an interrupted earlier run leaves behind.
# Only a non-empty directory is therefore treated as "already downloaded".
for url, target_dir in ((images_url, images_dir), (reports_url, reports_dir)):
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f"{url} already exists at: {target_dir}")
        continue

    size_mb = get_file_size(url)
    print(f"Downloading {url} to: {target_dir} ({size_mb:.2f} MB)")
    os.makedirs(target_dir, exist_ok=True)
    download_and_extract(url, target_dir)
    print(f"Downloaded {url} to: {target_dir}")

In [None]:
'''Exploring the IU X-Ray Dataset Contents'''

# names for the dataset locations that the following cells refer to
iu_xray = download_path
iu_xray_images = images_dir
iu_xray_reports = os.path.join(reports_dir, 'ecgen-radiology')

# dataset root: show the entries themselves
print("\nPath: ", iu_xray)
root_entries = os.listdir(iu_xray)
print(f"Directory Contents: {root_entries}")

# images directory: show only how many entries it holds
print("\nPath: ", iu_xray_images)
image_count = len(os.listdir(iu_xray_images))
print(f"Directory Contents: {image_count} Images")

# reports directory: show only how many entries it holds
print("\nPath: ", iu_xray_reports)
report_count = len(os.listdir(iu_xray_reports))
print(f"Directory Contents: {report_count} Reports")

## **Data Preprocessing**

### **Preprocess Images**

In [None]:
'''Preprocessing Images - Resizing and Normalization'''

# importing required libraries
import os
from PIL import Image
import torchvision.transforms as transforms


# finding minimum dimensions of images
def find_min_dimensions(image_dir):
    """Return ``(min_width, min_height)`` over the ``.png`` files in *image_dir*.

    The two minima are tracked independently, so they may come from different
    images. Files whose name does not end in ``.png`` are ignored. When the
    directory holds no such file, both values are ``float('inf')``.
    """
    smallest_w = smallest_h = float('inf')

    png_names = (name for name in os.listdir(image_dir) if name.endswith('.png'))
    for name in png_names:
        # only the header is needed for the size; close the file right away
        with Image.open(os.path.join(image_dir, name)) as picture:
            w, h = picture.size

        if w < smallest_w:
            smallest_w = w
        if h < smallest_h:
            smallest_h = h

    return smallest_w, smallest_h


# preprocessing images and saving them to new directory
def preprocess_images(input_dir, output_dir):
    """Resize every ``.png`` in *input_dir* to a common size and save it to *output_dir*.

    The common size is the smallest width and the smallest height found among
    the images of *input_dir*. Each image is converted to RGB, resized and
    written to *output_dir* under its original file name.

    Mean/std normalisation is intentionally NOT baked into the saved files:
    ``transforms.Normalize`` only accepts tensors, and its output lies outside
    the [0, 1] range that an 8-bit PNG can represent. Apply
    ``ToTensor()`` followed by
    ``Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])``
    when the images are loaded for training instead.

    Args:
        input_dir: Directory containing the source ``.png`` images.
        output_dir: Directory that receives the resized images; created if
            it does not exist yet.
    """
    # list the directory once; sorted so the progress output is reproducible
    png_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.png'))
    if not png_files:
        print(f"No .png images found in: {input_dir}")
        return

    # measure the directory actually being processed, not a notebook global
    min_width, min_height = find_min_dimensions(input_dir)
    print(f'Minimum Width: {min_width}, Minimum Height: {min_height}\n')

    # torchvision's Resize takes (height, width); PIL's Image.size is (width, height)
    resize = transforms.Resize((min_height, min_width))

    os.makedirs(output_dir, exist_ok=True)

    total = len(png_files)
    for cnt, filename in enumerate(png_files, start=1):
        print(f"Preprocessing File {cnt} out of {total}: {filename}")

        # the context manager releases the file handle after each image
        with Image.open(os.path.join(input_dir, filename)) as image:
            processed_image = resize(image.convert('RGB'))

        processed_image.save(os.path.join(output_dir, filename))

# preprocessed copies go into their own directory under the dataset root
iu_xray_images_preprocessed = os.path.join(iu_xray, 'images_preprocessed')

# an existing output directory is taken to mean preprocessing already ran
if os.path.exists(iu_xray_images_preprocessed):
    print(f"Preprocessed Images already exist at: {iu_xray_images_preprocessed}")
else:
    print(f"Preprocessing Images to: {iu_xray_images_preprocessed}")
    preprocess_images(iu_xray_images, iu_xray_images_preprocessed)
    print(f"Preprocessed Images saved to: {iu_xray_images_preprocessed}")

### **Preprocess Text**

In [None]:
'''Processing Textual Data from each .xml Report File and Storing it in a .csv File'''

# importing required libraries
import os
import xml.etree.ElementTree as ET
import pandas as pd
import cv2


# iterating through all .xml report files and collecting one row per referenced image
# The directory is listed once (not once per file) and sorted, because
# os.listdir returns entries in arbitrary order and the row order of the
# resulting dataframe should be reproducible across runs.
xml_files = sorted(f for f in os.listdir(iu_xray_reports) if f.endswith(".xml"))
total_xml_files = len(xml_files)

# report sections copied into the dataframe, identified by their 'Label' attribute
section_labels = ('COMPARISON', 'INDICATION', 'FINDINGS', 'IMPRESSION')

data = []
for cnt, file in enumerate(xml_files, start=1):
    print(f"Processing .xml File {cnt} out of {total_xml_files}: {file}")

    file_path = os.path.join(iu_xray_reports, file)
    try:
        root = ET.parse(file_path).getroot()
    except (ET.ParseError, OSError) as e:
        # an unreadable or malformed report is skipped; the rest still load
        print(f"Error processing file {file}: {e}")
        continue

    # every section starts as None; the last matching element wins
    sections = dict.fromkeys(section_labels)
    for abstract in root.findall('.//AbstractText'):
        label = abstract.attrib.get('Label')
        if label in sections:
            sections[label] = abstract.text

    for parent_image in root.findall('parentImage'):
        image_id = parent_image.attrib.get('id')
        if image_id is None:
            print(f"Warning: parentImage without an id in {file}")
            continue

        image_file = image_id + ".png"
        image_path = os.path.join(iu_xray_images, image_file)

        # cv2.imread signals an unreadable or missing file by returning None
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Unable to read image {image_path}")
            continue

        height, width = image.shape[:2]
        caption_node = parent_image.find('caption')
        caption = caption_node.text if caption_node is not None else None
        data.append([
            image_file,
            caption,
            sections['COMPARISON'],
            sections['INDICATION'],
            sections['FINDINGS'],
            sections['IMPRESSION'],
            height,
            width,
        ])


# build the dataframe: one row per entry of `data`, columns in the order the rows were filled
columns = [
    'image_filename',
    'caption',
    'comparison',
    'indication',
    'findings',
    'impression',
    'height',
    'width',
]
iu_xray_df = pd.DataFrame(data=data, columns=columns)


# write the dataframe to a .csv file in the dataset root (row index omitted)
iu_xray_csv = os.path.join(iu_xray, 'iu_xray_data.csv')
iu_xray_df.to_csv(path_or_buf=iu_xray_csv, index=False)
print(f"Dataframe saved to {iu_xray_csv}")

In [None]:
'''Displaying the Stored .csv File'''

# display the stored dataframe
print("\n\nDataframe Shape:\n")
display(iu_xray_df.shape)

print("\n\nDataframe Information:\n")
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in display() would render a stray "None" afterwards
iu_xray_df.info()

print("\n\nDisplaying Dataframe:\n")
display(iu_xray_df.head())

### **Create Data Loaders**

## **Model Implementation**

### **Visual Extractor**

### **Text Encoder**

### **Multilevel Alignment**

### **Report Generator**

### **Complete Model**

## **Training**

### **Training**

## **Testing**

### **Testing**