# **COCA Coronary Calcium and Chest CT - Exploratory Data Analysis**

## Sources:

Data - https://stanfordaimi.azurewebsites.net/datasets/e8ca74dc-8dd4-4340-815a-60b41f6cb2aa

# Import Dependencies

In [161]:
# File Support
try: 
    import pydicom as dcm
except:
    # Use try except for Google Colab
    !pip install pydicom
from pydicom.data import get_testdata_files
import xml
import xml.etree.ElementTree as ET 

# Base
import numpy as np
import pandas as pd
import random

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# SK-learn
import sklearn

# Files
import os
from os.path import join
from glob import glob

# Config

In [162]:
class CFG:
    random_seed = 42
    gated = True
    path = "Coronary CT Data\Gated_release_final" if gated else "Coronary CT Data/deidentified_nongated"

## Reproducibility

In [163]:
def set_seed(seed=CFG.random_seed):
    print(f"Seed: {seed}")
    random.seed(seed)
    np.random.seed(seed)
    # torch.manual_seed(seed)

set_seed()

Seed: 42


# Load Data

In [164]:
def read_dicom(path):
    return dcm.dcmread(path).pixel_array

In [165]:
image_paths = glob(f"{CFG.path}/**/**/**/*.dcm")
label_paths = glob(f"{CFG.path}/**/**.xml") # Fix
print(f"{len(image_paths)} images")
print(f"{len(label_paths)} patients") # Fix

40113 images
451 patients


In [166]:
label_paths[0]

'Coronary CT Data\\Gated_release_final\\calcium_xml\\0.xml'

In [167]:
arr = read_dicom(image_paths[0])

In [168]:
arr2 = read_dicom(image_paths[70])

In [260]:
def parseXML(xmlfile): 
    # create element tree object 
    tree = ET.parse(xmlfile) 

    all_images = []

    images = tree.find("dict").find("array")
    images = images.findall("dict")

    # Images
    for image in images:
        image_data = {}
        arr = [i.text for i in image if i.tag not in ["array", "dict"]]
        
        for i in range(len(arr)//2):
            image_data[arr[2*i]] = arr[2*i+1]

        image_data['ROIs'] = []

        # ROI
        all_roi = image.find("array").findall('dict')
        for roi in all_roi:
            roi_data = {}
            arr = [i.text for i in roi if i.tag not in ["array", "dict"]]
        
            for i in range(len(arr)//2):
                roi_data[arr[2*i]] = arr[2*i+1]

            all_points = roi.findall('array')
            roi_data['point_mm'] = [i.text for i in all_points[0].findall("string")]
            roi_data['point_px'] = [i.text for i in all_points[1].findall("string")]
            
            image_data['ROIs'].append(roi_data)
        all_images.append(image_data)

    return all_images

    
parseXML(label_paths[0])[0]

{'ImageIndex': '34',
 'NumberOfROIs': '1',
 'ROIs': [{'Area': '0.029283028095960617',
   'Center': '(-30.509375, -248.681641, -183.250000)',
   'Dev': '23.731674194335938',
   'IndexInImage': '0',
   'Length': '0.97119766473770142',
   'Max': '206',
   'Mean': '160.76922607421875',
   'Min': '131',
   'Name': 'Right Coronary Artery',
   'NumberOfPoints': '20',
   'Point_mm': 'Point_px',
   'Total': '2090',
   'Type': '20',
   'point_mm': ['(-30.697357, -249.156250, -183.250000)',
    '(-30.699219, -249.154388, -183.250000)',
    '(-31.173828, -249.154388, -183.250000)',
    '(-31.648438, -249.154388, -183.250000)',
    '(-32.123047, -249.154388, -183.250000)',
    '(-32.595795, -248.681641, -183.250000)',
    '(-32.123047, -248.208893, -183.250000)',
    '(-31.648438, -248.208893, -183.250000)',
    '(-31.173828, -248.208893, -183.250000)',
    '(-30.699219, -248.208893, -183.250000)',
    '(-30.697357, -248.207031, -183.250000)',
    '(-30.224609, -247.734283, -183.250000)',
    '(-29