### Import libraries

In [2]:
import sys
!{sys.executable} -m pip install pydicom

Successfully installed pydicom-2.1.2


In [3]:
!{sys.executable} -m pip install opencv-python



In [4]:
import pandas as pd
import os
import shutil
from glob import glob
import pydicom as dicom
import cv2

In [5]:
# Folder name of the first source dataset; the two paths below are built
# relative to the notebook's working directory.
cohen_dir = 'covid-chestxray-dataset'
cohen_csv_path = '/'.join(('.', cohen_dir, 'metadata.csv'))
cohen_img_path = '/'.join(('.', cohen_dir, 'images'))

In [6]:
# Folder name of the Figure1 dataset; metadata table and image folder are
# addressed relative to the notebook's working directory.
fig1_dir = 'Figure1-COVID-chestxray-dataset'
fig1_csv_path = '/'.join(('.', fig1_dir, 'metadata.csv'))
fig1_img_path = '/'.join(('.', fig1_dir, 'images'))

In [7]:
# Folder name of the Actualmed dataset; metadata table and image folder are
# addressed relative to the notebook's working directory.
actmed_dir = 'Actualmed-COVID-chestxray-dataset'
actmed_csv_path = '/'.join(('.', actmed_dir, 'metadata.csv'))
actmed_img_path = '/'.join(('.', actmed_dir, 'images'))

In [8]:
# Folder name of the COVID-19 Radiography Database; only its COVID metadata
# spreadsheet and COVID image folder are referenced here.
sirm_dir = 'COVID-19-Radiography-Database'
sirm_cov_csv_path = '/'.join(('.', sirm_dir, 'COVID.metadata.xlsx'))
sirm_cov_img_path = '/'.join(('.', sirm_dir, 'COVID'))

In [9]:
# RSNA files are expected directly in the working directory: two CSV tables
# and the folder of DICOM images.
rsna_csv1_path, rsna_csv2_path, rsna_img_path = (
    './stage_2_detailed_class_info.csv',
    './stage_2_train_labels.csv',
    './stage_2_train_images',
)

### Initialize directories, files and metrics variables

In [10]:
# Output layout: <dataset_dir>/ holds the label file and <dataset_dir>/images/
# holds the exported x-ray images.
dataset_dir = 'dataset'

images_dir = f'{dataset_dir}/images'
data_file_name = 'data_file.csv'
data_file_path = f'./{dataset_dir}/{data_file_name}'

# Number of images left over from a previous run that were removed below.
deletions = 0

if not os.path.exists(dataset_dir):
    os.mkdir(dataset_dir)
    print('Successfully created \'dataset\' directory created.')

if not os.path.exists(images_dir):
    os.mkdir(images_dir)
    print('Successfully created \'images\' directory created.')
else:
    # Start from an empty image folder so that numbering can restart at 0.
    files = glob(f'{images_dir}/*')
    for stale_path in files:
        try:
            os.remove(stale_path)
            deletions += 1
        except OSError as e:
            # Best effort: report the entry that could not be removed and go on.
            print('Error: %s : %s' % (stale_path, e.strerror))
    print(f'Successfully deleted {deletions} images from the \'images\' directory')

# Count the label rows that are about to be discarded (every non-empty line
# except the header) so the message below reports entries, not deleted images.
deleted_entries = 0
if os.path.exists(data_file_path):
    with open(data_file_path) as previous_data_file:
        non_empty_lines = sum(1 for line in previous_data_file if line.strip())
    deleted_entries = max(non_empty_lines - 1, 0)

# Recreate the label file with only its header; the context manager guarantees
# the handle is closed even if the write fails.
with open(data_file_path, 'w') as data_file:
    data_file.write('filename,diagnosis\n')
print(f'Successfully deleted {deleted_entries} entries from \'{data_file_name}\'')

# Running totals shared by the extraction cells below.
covid_positives = 0
covid_negatives = 0
# Index used to build the next 'x-ray-<index>' file name.
xray_index = 0

Successfully deleted 1154 images from the 'images' directory
Successfully deleted 1154 entries from 'data_file.csv'


### Extract data from 'covid-chestxray-dataset'

In [None]:
# Copy the images listed in the covid-chestxray-dataset metadata into the
# dataset folder under sequential 'x-ray-<index>' names and append one
# 'filename,diagnosis' row per image to the label file.
cohen_csv = pd.read_csv(cohen_csv_path, nrows=None)
print(cohen_csv.shape)

# The context manager closes (and flushes) the label file even when a copy
# raises in the middle of the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in cohen_csv.iterrows():
        # Rows stored under the 'volumes' folder are not exported.
        if row['folder'] == 'volumes':
            continue

        # A missing finding is read as a float NaN and cannot be split; skip
        # such rows, as the other dataset cells do for missing findings.
        raw_finding = row['finding']
        if not isinstance(raw_finding, str):
            continue

        image = row['filename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name: going through the original file name
        # first could overwrite an image that was already exported.
        shutil.copy2(f'{cohen_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        # Only the last '/'-separated component of the finding is compared.
        finding = raw_finding.split('/')[-1]

        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Extract data from 'Figure1-COVID-chestxray-dataset'

In [None]:
# Copy the Figure1 dataset images that have a finding into the dataset folder
# under sequential 'x-ray-<index>' names and append their labels.
fig1_csv = pd.read_csv(fig1_csv_path, encoding='ISO-8859-1', nrows=None)
print(fig1_csv.shape)

# Mark rows without a finding with the sentinel string 'NaN'. The result is
# assigned back because an in-place fillna on a selected column is a chained
# operation that does not update the frame under pandas Copy-on-Write.
fig1_csv['finding'] = fig1_csv['finding'].fillna('NaN')

# The context manager closes (and flushes) the label file even when a copy
# raises in the middle of the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in fig1_csv.iterrows():
        finding = row['finding']
        if finding == 'NaN':
            continue

        # Images are named after the patient id; try .jpg first and fall back
        # to .png when no .jpg file exists.
        patient_id = row['patientid']
        image = f'{patient_id}.jpg'
        src_path = f'{fig1_img_path}/{image}'
        if not os.path.exists(src_path):
            image = f'{patient_id}.png'
            src_path = f'{fig1_img_path}/{image}'

        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name: going through the original file name
        # first could overwrite an image that was already exported.
        shutil.copy2(src_path, f'./{images_dir}/{new_image_name}')

        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

### Extract data from 'Actualmed-COVID-chestxray-dataset'

In [None]:
# Copy the Actualmed dataset images that have a finding into the dataset folder
# under sequential 'x-ray-<index>' names and append their labels.
actmed_csv = pd.read_csv(actmed_csv_path, nrows=None)
print(actmed_csv.shape)

# Mark rows without a finding with the sentinel string 'NaN'. The result is
# assigned back because an in-place fillna on a selected column is a chained
# operation that does not update the frame under pandas Copy-on-Write.
actmed_csv['finding'] = actmed_csv['finding'].fillna('NaN')

# The context manager closes (and flushes) the label file even when a copy
# raises in the middle of the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in actmed_csv.iterrows():
        finding = row['finding']
        if finding == 'NaN':
            continue

        image = row['imagename']
        image_type = image.split('.')[-1]
        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name: going through the original file name
        # first could overwrite an image that was already exported.
        shutil.copy2(f'{actmed_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        if finding == 'COVID-19':
            sample = new_image_name + ',' + 'positive' + '\n'
            covid_positives += 1
        else:
            sample = new_image_name + ',' + 'negative' + '\n'
            covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

#### Due to potential duplicates and other uncertainties, the datasets processed by the cells below are not used yet.

### Extract data from 'COVID-19-Radiography-Database'

In [None]:
# Copy the COVID images of the COVID-19 Radiography Database into the dataset
# folder, skipping entries whose URL already appears in the cohen metadata, and
# label every copied image as positive.
sirm_cov_csv = pd.read_excel(sirm_cov_csv_path)
print(sirm_cov_csv.shape)

# Requires `cohen_csv` from the covid-chestxray-dataset cell; its URLs are used
# to avoid exporting the same image twice.
cohen_urls = set(cohen_csv['url'])

# The context manager closes (and flushes) the label file even when a copy
# raises in the middle of the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for _, row in sirm_cov_csv.iterrows():
        if row['URL'] in cohen_urls:
            continue

        image_type = row['FORMAT'].lower()
        # The file on disk is named 'COVID (<n>).<ext>', where <n> is the second
        # space-separated token of the 'FILE NAME' column.
        image = f"COVID ({row['FILE NAME'].split(' ')[1]}).{image_type}"

        new_image_name = f'x-ray-{xray_index}.{image_type}'

        # Copy straight to the final name: going through the original file name
        # first could overwrite an image that was already exported.
        shutil.copy2(f'{sirm_cov_img_path}/{image}', f'./{images_dir}/{new_image_name}')

        sample = new_image_name + ',' + 'positive' + '\n'
        covid_positives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)

In [None]:
# Convert the RSNA DICOM images referenced by the label table to PNG files in
# the dataset folder and label every one of them as negative.
rsna_csv2 = pd.read_csv(rsna_csv2_path, nrows=None)
print(rsna_csv2.shape)

# The DICOM path depends only on patientId, so a patient id that appears on
# several rows would export the same scan several times; keep each id once.
rsna_patient_ids = rsna_csv2['patientId'].drop_duplicates()

# The context manager closes (and flushes) the label file even when reading or
# writing an image raises in the middle of the loop.
with open(f'./{dataset_dir}/{data_file_name}', 'a') as data_file:
    for patient_id in rsna_patient_ids:
        dcm_path = f'{rsna_img_path}/{patient_id}.dcm'

        ds = dicom.dcmread(dcm_path)
        pixel_array = ds.pixel_array

        new_image_name = f'x-ray-{xray_index}.png'
        dst_path = f'{images_dir}/{new_image_name}'

        # cv2.imwrite signals failure through its return value instead of
        # raising; stop here rather than record a label for a missing image.
        if not cv2.imwrite(dst_path, pixel_array):
            raise IOError(f'Could not write image to {dst_path}')

        sample = new_image_name + ',' + 'negative' + '\n'
        covid_negatives += 1

        data_file.write(sample)
        xray_index += 1

        print(new_image_name, ': OK')

print('Covid positives:', covid_positives)
print('Covid negatives:', covid_negatives)