# Importing Libraries and Reading CSV Files

In [None]:
import time
# Wall-clock reference point; later cells print `time.time() - since` as the elapsed time.
since = time.time()
!pip3 install python-gdcm

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sn
import pydicom as dicom # Dicom (Digital Imaging in Medicine) - medical image datasets, storage and transfer
import os
from tqdm import tqdm # allows you to output a smart progress bar by wrapping around any iterable
import glob # retrieve files/pathnames matching a specified pattern
import pprint # pretty-print” arbitrary Python data structures
import ast # 
from pydicom.pixel_data_handlers.util import apply_voi_lut #
import wandb #

from PIL import Image
import shutil
import gdcm

# Root of the competition data; the trailing slash matters because other
# cells build file paths from it by plain string concatenation.
path = '/kaggle/input/siim-covid19-detection/'

# Image-level and study-level CSV tables.
train_image_level = pd.read_csv(f"{path}train_image_level.csv")
train_study_level = pd.read_csv(f"{path}train_study_level.csv")

# Bare last expression: the notebook displays the first rows.
train_image_level.head()

In [None]:
# Summary statistics of the image-level table (shown as the cell output).
train_image_level.describe()


2040 ids have no bounding box.

In [None]:
# Drop the last 6 characters of each study id so the remainder can be matched
# against StudyInstanceUID (presumably a "_study" suffix - verify against the CSV).
train_study_level_key = train_study_level.id.str[:-6]
# Right join: every image-level row is kept and gets the study-level columns attached.
# Both tables have an `id` column, so the merged frame carries them as id_x (study) and id_y (image).
training_set = pd.merge(left = train_study_level, right = train_image_level, how = 'right', left_on = train_study_level_key, right_on = 'StudyInstanceUID')
print(training_set.shape)

In [None]:
# Show every column of the first merged row.
print(training_set.loc[0])

# Read Image in DCM format

## Searching Desired StudyInstanceUID in all Subfolders

In [None]:
def read_dcm(i):
    """Load the DICOM file belonging to row ``i`` of ``training_set``.

    The file lives somewhere below ``<path>train/<StudyInstanceUID>/``. The
    tree is walked until a file named exactly ``<image id>.dcm`` is found,
    so a study that contains several images yields the requested one rather
    than whichever file happened to be visited last.

    Args:
        i: Row label in ``training_set``.

    Returns:
        Tuple ``(img_id, data_file)``: the file name (``<image id>.dcm``)
        and the dataset returned by ``dicom.dcmread``.

    Raises:
        FileNotFoundError: If no file with that name exists under the
            study directory.
    """
    study_dir = os.path.join(path, 'train', training_set.loc[i, 'StudyInstanceUID'])
    # 'id_y' is the image-level id; swapping the suffix gives the file name.
    img_id = training_set.loc[i, 'id_y'].replace('_image', '.dcm')

    for dirname, _, filenames in os.walk(study_dir):
        if img_id in filenames:
            return img_id, dicom.dcmread(os.path.join(dirname, img_id))

    raise FileNotFoundError(
        'No file named {} found under {}'.format(img_id, study_dir)
    )

def Image_resize(img, len_x):
    """Resize a PIL image to a ``len_x`` x ``len_x`` square.

    The aspect ratio is not preserved. Lanczos resampling is requested via
    ``Image.LANCZOS``; the older alias ``Image.ANTIALIAS`` named the same
    filter but was removed in Pillow 10.

    Args:
        img: Object exposing PIL's ``resize(size, resample)`` method.
        len_x: Edge length of the square output, in pixels.

    Returns:
        The resized image returned by ``img.resize``.
    """
    return img.resize((len_x, len_x), Image.LANCZOS)

# Load row 1 as a sample (the returned file name is discarded) and print the dataset.
_,img = read_dcm(1)
print(img)

# DCM Image as Pixel Array

In [None]:
# Pixel data of the dataset loaded above, displayed as an image.
img_arr = img.pixel_array
plt.imshow(img_arr)
print('Shape:', img_arr.shape)

In [None]:
# Histogram over all pixel values (array flattened to 1-D first).
plt.hist(img_arr.ravel()) #calculating histogram

In [None]:
# Repeat the inspection with row 5 to compare against the first sample.
_,img = read_dcm(5) # Another Image
img_arr = img.pixel_array

plt.imshow(img_arr)
print('Image Shape:',img_arr.shape)

In [None]:
# Histogram over all pixel values of the second sample.
plt.hist(img_arr.ravel()) 

## Observation
### Some images have an intensity range of 0 to 255
### Some images have a higher intensity range (possibly 16-bit images, or 12 bits allocated).


# Intensity Rescaling (Min-Max) + Converting to RGB Images

In [None]:
def To_16bit(img_arr):
    """Min-max rescale ``img_arr`` to ``[0, 3 * 2**16 - 1]`` and round.

    The input is converted to float64 before any arithmetic, so ``max - min``
    cannot wrap around on narrow or signed integer dtypes. A constant image
    (zero range) maps to all zeros instead of dividing by zero.

    NOTE(review): the upper bound is ``3 * 2**16 - 1`` (196607), not
    ``2**16 - 1`` as the function name suggests. It is kept unchanged here;
    confirm the intended scale.

    Args:
        img_arr: Array-like of numeric pixel values (non-empty).

    Returns:
        float64 array of the same shape holding the rounded, rescaled values.
    """
    pixels = np.asarray(img_arr, dtype=np.float64)
    low = pixels.min()
    span = pixels.max() - low
    if span == 0:
        # Flat image: every pixel sits at the minimum.
        return np.zeros_like(pixels)

    return np.round((pixels - low) / span * (np.power(2, 16) * 3 - 1))


def To_RGB(img_arr): #Extend to 24 bit then segment by 8 bit
    """Spread a 2-D intensity image over three 8-bit channels.

    The image is min-max rescaled to a 24-bit integer, which is then split
    into bytes: channel 0 holds the lowest byte, channel 1 the middle byte
    and channel 2 the highest byte.

    The input is converted to float64 first, so ``max - min`` cannot wrap
    around on narrow or signed integer dtypes. A constant image (zero range)
    yields an all-zero result instead of dividing by zero.

    Args:
        img_arr: 2-D array-like of numeric pixel values (non-empty).

    Returns:
        ``uint8`` array of shape ``(rows, cols, 3)``.
    """
    pixels = np.asarray(img_arr, dtype=np.float64)
    lenx, leny = pixels.shape
    rgbArray = np.zeros((lenx, leny, 3), 'uint8')

    low = pixels.min()
    span = pixels.max() - low
    if span == 0:
        return rgbArray

    # Rounded values lie in [0, 2**24 - 1], so they fit in uint32.
    packed = np.round((pixels - low) / span * (np.power(2, 24) - 1)).astype(np.uint32)
    rgbArray[:, :, 0] = packed & 0xFF
    rgbArray[:, :, 1] = (packed >> 8) & 0xFF
    rgbArray[:, :, 2] = (packed >> 16) & 0xFF

    return rgbArray
    
def To_RGB2(img_arr): #Extend to 16 bit then segment by 8 bit top(G), 8 bit bottom(R), 8 bit overall(B)
    """Encode a 2-D intensity image into three 8-bit channels.

    The image is min-max rescaled twice:

    * to 16 bits, whose low byte goes to channel 0 and high byte to channel 1;
    * to 8 bits, which goes to channel 2.

    The input is converted to float64 first, so ``max - min`` cannot wrap
    around on narrow or signed integer dtypes. A constant image (zero range)
    yields an all-zero result instead of dividing by zero.

    Args:
        img_arr: 2-D array-like of numeric pixel values (non-empty).

    Returns:
        ``uint8`` array of shape ``(rows, cols, 3)``.
    """
    pixels = np.asarray(img_arr, dtype=np.float64)
    lenx, leny = pixels.shape
    rgbArray = np.zeros((lenx, leny, 3), 'uint8')

    low = pixels.min()
    span = pixels.max() - low
    if span == 0:
        return rgbArray

    unit = (pixels - low) / span  # normalised to [0, 1]

    wide = np.round(unit * (np.power(2, 16) - 1)).astype(np.uint16)
    rgbArray[:, :, 0] = wide & 0xFF                                 # 8 bit bottom
    rgbArray[:, :, 1] = wide >> 8                                   # 8 bit top
    rgbArray[:, :, 2] = np.round(unit * (np.power(2, 8) - 1))       # 8 bit overall

    return rgbArray

In [None]:
# Encode the current sample into three channels and display the result.
rgb_arr = To_RGB2(img_arr)
plt.imshow(rgb_arr)
print('Image Shape:', rgb_arr.shape)

# Plotting

### A few images are still grey (All of R,G,B components have the same value)

In [None]:
# 3x3 grid of consecutive training images with their bounding boxes drawn on top.
fig, axes = plt.subplots(3, 3, figsize=(20, 16))
fig.subplots_adjust(hspace=.1, wspace=.1)
axes = axes.ravel()

start_index = 0  # first row of training_set to show

for row in range(9):
    # Every lookup uses the same row so image, label and boxes stay in sync
    # when start_index is changed.
    idx = row + start_index
    img_id, img = read_dcm(idx)
    img = To_RGB2(img.pixel_array)

    # First token of the label string is used as the class name.
    label = training_set.loc[idx, 'label'].split(' ')[0]
    print(img_id, label)

    boxes_str = training_set.loc[idx, 'boxes']
    if pd.notna(boxes_str):  # rows without boxes hold a missing value
        boxes = ast.literal_eval(boxes_str)
        for box in boxes:
            p = matplotlib.patches.Rectangle((box['x'], box['y']),
                                             box['width'], box['height'],
                                             ec='r', fc='none', lw=2.)
            axes[row].add_patch(p)

    axes[row].imshow(img, cmap='gray')
    axes[row].set_title(label + '  Image Shape:' + str(img.shape))
    axes[row].set_xticklabels([])
    axes[row].set_yticklabels([])

In [None]:
# Elapsed wall-clock time since the first cell, split into minutes and seconds.
time_elapsed = time.time() - since
print('Time from start {:.0f}m {:.0f}s'.format(*divmod(time_elapsed, 60)))

In [None]:
# Create ./Images/<split>/<label>/ for every split and label.
# os.makedirs builds the missing parents and, with exist_ok=True, is a no-op
# for directories that already exist, so the cell can be re-run safely.
for iter_name in ['train', 'val', 'test']:
    for label_name in ['none', 'opacity']:
        folderlocation = './Images/' + iter_name + '/' + label_name
        os.makedirs(folderlocation, exist_ok=True)
    


In [None]:
# Convert every training image to a 2048x2048 JPEG and file it under
# ./Images/<split>/<label>/. Out of every 20 images, one goes to 'val' and one
# to 'test'; the rest go to 'train'.
iter_split = 0
folderlocation = './Images/'
total_rows = len(training_set)

for row in range(total_rows):
    img_id, img = read_dcm(row)
    img = To_RGB2(img.pixel_array)
    iter_split = row + 1  # 1-based position, drives the split assignment

    remainder = iter_split % 20
    if remainder == 19:
        split_name = 'test'
    elif remainder == 18:
        split_name = 'val'
    else:
        split_name = 'train'

    # First token of the label string is the class folder; img_id[:-4] drops '.dcm'.
    label = training_set.loc[row, 'label'].split(' ')[0]
    img_destination = folderlocation + split_name + '/' + label + '/' + img_id[:-4] + '.jpeg'

    im = Image.fromarray(img)
    im = Image_resize(im, 2048)
    im.save(img_destination)

    if row % 500 == 499:
        time_elapsed = time.time() - since
        print('Time from start {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        # Progress relative to the actual number of rows, not a fixed count.
        print('Percentage Complete:', np.round(10000 * (row + 1) / total_rows) / 100)


print('100% Complete.')

In [None]:
# Zip the exported image tree, then delete the uncompressed copy.
shutil.make_archive(
    base_name='SIIM-FISABIO-RSNA-JPEG',
    format='zip',
    root_dir=folderlocation,
)
shutil.rmtree(path=folderlocation)