<img src = "https://i.imgur.com/LKLpFOv.png">

In [None]:
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import glob
import os
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import pprint
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import albumentations as A
import cv2
import wandb

from PIL import Image
from colorama import Fore, Back, Style
# ANSI foreground colours used to highlight printed output
y_, r_, g_, b_, m_ = (
    Fore.YELLOW,
    Fore.RED,
    Fore.GREEN,
    Fore.BLUE,
    Fore.MAGENTA,
)

# Global seaborn theme: serif font on the "white" style
sns.set(style="white", font="Serif")

<img src="https://camo.githubusercontent.com/dd842f7b0be57140e68b2ab9cb007992acd131c48284eaf6b1aca758bfea358b/68747470733a2f2f692e696d6775722e636f6d2f52557469567a482e706e67">

I will be integrating W&B for ```visualizations``` and ```logging artifacts```!

[SIIM Project on W&B Dashboard](https://wandb.ai/ruchi798/siim?workspace=user-ruchi798)

In [None]:
from kaggle_secrets import UserSecretsClient

# Read the W&B API key from the Kaggle secret named "api_key" so that the key
# never appears in the notebook source.
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("api_key")

os.environ["WANDB_SILENT"] = "true"

# Run configuration passed to every wandb.init(...) call in this notebook
CONFIG = {'competition': 'siim-fisabio-rsna', '_wandb_kernel': 'ruch'}

# Log in through the Python API rather than `! wandb login $api_key`: the
# shell form places the secret on a command line and only works under IPython.
wandb.login(key=api_key)

In [None]:
# Image-level labels (id, boxes, label, ...) and study-level class flags
train_image_level = pd.read_csv(
    filepath_or_buffer="../input/siim-covid19-detection/train_image_level.csv"
)
train_study_level = pd.read_csv(
    filepath_or_buffer="../input/siim-covid19-detection/train_study_level.csv"
)

| id    | unique image identifier                                      |
|-------|--------------------------------------------------------------|
| boxes | bounding boxes in easily-readable dictionary format          |
| label | the correct prediction label for the provided bounding boxes |

In [None]:
train_image_level.head()

| id                       | unique study identifier                                  |
|--------------------------|----------------------------------------------------------|
| Negative for Pneumonia   | 1 : if the study is negative for pneumonia, 0: otherwise |
| Typical Appearance       | 1: if the study has this appearance, 0: otherwise        |
| Indeterminate Appearance | 1: if the study has this appearance, 0: otherwise        |
| Atypical Appearance      | 1: if the study has this appearance, 0: otherwise        |

In [None]:
train_study_level.head()

In [None]:
train_directory = "../input/siim-covid19-detection/train/"
test_directory = "../input/siim-covid19-detection/test/"

# Derive the study UID from the study-level id by dropping the "_study" suffix,
# then drop the id column. Guarded on the presence of `id` so that re-running
# this cell is a no-op instead of a KeyError on the already-removed column.
if 'id' in train_study_level.columns:
    train_study_level['StudyInstanceUID'] = train_study_level['id'].str.replace('_study', '', regex=False)
    train_study_level = train_study_level.drop(columns='id')

# Attach the study-level columns to every image-level row of the same study.
train_df = train_image_level.merge(train_study_level, on='StudyInstanceUID')

In [None]:
train_df.head()

In [None]:
# Resolve one DICOM file path per row of train_df.
#
# The previous lookup took the first file found under the study directory, so
# every row belonging to the same study received the same path. The file is now
# matched on the row's own image id first.
# NOTE(review): assumes files are named "<image id without '_image'>.dcm" under
# <study>/<series>/ - confirm against the dataset layout. When no such file is
# found the lookup falls back to the first file of the study (sorted, so the
# choice is deterministic), which matches the previous behaviour.
training_paths = []

for sid, image_id in tqdm(zip(train_df['StudyInstanceUID'], train_df['id']), total=len(train_df)):
    image_file = str(image_id).replace('_image', '') + '.dcm'
    candidates = sorted(glob.glob(os.path.join(train_directory, sid, '*', image_file)))
    if not candidates:
        candidates = sorted(glob.glob(os.path.join(train_directory, sid, '*', '*')))
    if not candidates:
        # Fail with the offending study instead of a bare IndexError
        raise FileNotFoundError(f"No DICOM file found for study {sid} under {train_directory}")
    training_paths.append(candidates[0])

train_df['path'] = training_paths

In [None]:
train_df.head()

# Distribution of class labels

In [None]:
# Figure-wide matplotlib defaults for the 4x2 grid below
params = {
    'legend.fontsize': 'x-large',
    'figure.figsize': (20, 32),
    'axes.labelsize': 'x-large',
    'axes.titlesize': 'x-large',
    'xtick.labelsize': 'x-large',
    'ytick.labelsize': 'x-large',
}
pylab.rcParams.update(params)

# One grid row per study-level class column: KDE on the left, counts on the right
class_colours = [
    ("Negative for Pneumonia", "#ffb4a2"),
    ("Typical Appearance", "#e5989b"),
    ("Indeterminate Appearance", "#b5838d"),
    ("Atypical Appearance", "#6d6875"),
]

fig, ax = plt.subplots(4,2)
for grid_row, (class_col, colour) in enumerate(class_colours):
    panel_title = class_col + " Distribution"

    sns.kdeplot(train_df[class_col], shade=True, ax=ax[grid_row, 0], color=colour)
    ax[grid_row, 0].set_title(panel_title, font="Serif", fontsize=20, weight="bold")

    sns.countplot(x=train_df[class_col], ax=ax[grid_row, 1], color=colour)
    ax[grid_row, 1].set_title(panel_title, font="Serif", fontsize=20, weight="bold")

fig.subplots_adjust(wspace=0.2, hspace=0.4, top=0.93)
plt.show()

In [None]:
#====== Function to plot WandB bar chart ======
def plot_wb_bar(df,col1,col2): 
    """Log a W&B bar chart of ``df[col2]`` against ``df[col1]`` in its own run.

    Args:
        df: table-like object indexable by column name (``df[col1]``, ``df[col2]``).
        col1: column holding the bar labels; also used as the run name, the
            logged key and the chart title.
        col2: column holding the bar values.

    The run is started in project 'siim' with the module-level ``CONFIG`` and
    is always finished, even when building or logging the chart raises.
    """
    run = wandb.init(project='siim', job_type='image-visualization',name=col1,config = CONFIG)
    try:
        rows = [[label, val] for label, val in zip(df[col1], df[col2])]
        table = wandb.Table(data=rows, columns = [col1,col2])
        wandb.log({col1 : wandb.plot.bar(table, col1,col2,title=col1)})
    finally:
        # Close the run on every path so a failure does not leave it open
        run.finish()
    
#====== Function to create a dataframe of value counts ======
def count_values(df,col):
    """Return a two-column frame (``col``, "counts") of the value counts of ``df[col]``."""
    tallies = df[col].value_counts().reset_index()
    return pd.DataFrame(tallies.values, columns=[col, "counts"])

# Log one W&B bar chart of value counts per study-level class column
for class_col in ("Negative for Pneumonia", "Typical Appearance",
                  "Indeterminate Appearance", "Atypical Appearance"):
    plot_wb_bar(count_values(train_df, class_col), class_col, 'counts')

# DICOM data

### Image and Metadata

In [None]:
# Module-level switches read when converting pixel data:
#   voi_lut        - pass the pixel array through apply_voi_lut
#   fix_monochrome - invert images whose PhotometricInterpretation is MONOCHROME1
voi_lut=True
fix_monochrome=True

def _dicom_elements_to_dict(dataset):
    """Convert the elements of an already-parsed DICOM dataset into a plain dict.

    Pixel data (tag 7FE0,0010) is skipped; nested datasets become nested dicts.
    """
    # Kept from the original recipe. NOTE(review): presumably forces pydicom to
    # materialise lazily-read elements before iterating - confirm before removing.
    repr(dataset)

    converted = {}
    for element in dataset.values():
        if element.tag == (0x7fe0, 0x0010):
            #discard pixel data
            continue
        if type(element.value) == dicom.dataset.Dataset:
            # Recurse on the dataset itself (the old code re-entered the
            # filename-based entry point with the wrong arguments)
            converted[element.name] = _dicom_elements_to_dict(element.value)
        else:
            converted[element.name] = _convert_value(element.value)
    return converted

def _dicom_pixels_to_uint8(dicom_header):
    """Return the pixel data of ``dicom_header`` rescaled to a uint8 array in 0..255."""
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    if peak > 0:
        # A constant image has peak == 0; skipping the division keeps it all
        # zeros instead of producing NaNs
        data = data / peak
    return (data * 255).astype(np.uint8)

def dicom_dataset_to_dict(filename,func):
    """Read a DICOM file and return its metadata, optionally with its image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Args:
        filename: path of the DICOM file to read.
        func: 'metadata_df' to return only the metadata dict; any other value
            also returns the image.

    Returns:
        ``dicom_dict`` when ``func == 'metadata_df'``, otherwise the tuple
        ``(dicom_dict, image)`` where ``image`` is a uint8 array. The
        'Pixel Representation' entry is removed from the metadata when present.
    """
    dicom_header = dicom.dcmread(filename) 

    #====== DICOM FILE DATA ======
    dicom_dict = _dicom_elements_to_dict(dicom_header)
    # pop() instead of del: files without this element no longer raise KeyError
    dicom_dict.pop('Pixel Representation', None)

    if func == 'metadata_df':
        return dicom_dict

    #====== DICOM IMAGE DATA ======
    modified_image_data = _dicom_pixels_to_uint8(dicom_header)
    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()

def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('ascii', 'replace')
        cv = _sanitise_unicode(s)
    elif t == dicom.valuerep.DSfloat:
        cv = float(v)
    elif t == dicom.valuerep.IS:
        cv = int(v)
    else:
        cv = repr(v)
    return cv

# For each of the first five paths: show the image with the gray and plasma
# colormaps side by side, then pretty-print its metadata dict
for filename in train_df.path[0:5]:
    df, img_array = dicom_dataset_to_dict(filename, 'fetch_both_values')

    fig, (gray_panel, plasma_panel) = plt.subplots(1, 2, figsize=[15, 8])
    for panel, colormap in ((gray_panel, plt.cm.gray), (plasma_panel, plt.cm.plasma)):
        panel.imshow(img_array, cmap=colormap)
    plt.show()

    pprint.pprint(df)

In [None]:
# dicom_data_list = []
# for filename in train_df.path:
#     try:
#         data_di = dicom_dataset_to_dict(filename,'metadata_df')
#         dicom_data_list.append(data_di)
    
#     except:
#         continue

# dicom_data_df = pd.DataFrame(dicom_data_list) 
# dicom_data_df

# #====== Saving to csv files and creating artifacts ======
# dicom_data_df.to_csv("dicom_metadata.csv")

# run = wandb.init(project='siim', name='dicom_metadata')

# artifact = wandb.Artifact('dicom_metadata', type='dataset')

# #====== Add a file to the artifact's contents ======
# artifact.add_file("dicom_metadata.csv")

# #====== Save the artifact version to W&B and mark it as the output of this run ====== 
# run.log_artifact(artifact)

# run.finish()

In [None]:
# Download the previously logged DICOM metadata artifact; the run is finished
# even if the artifact lookup or download fails, so it is not left open.
run = wandb.init(project='siim', config = CONFIG)
try:
    artifact = run.use_artifact('ruchi798/siim/dicom_metadata:v1', type='dataset')
    artifact_dir = artifact.download()
finally:
    run.finish()

path = os.path.join(artifact_dir,"dicom_metadata.csv")
metadata = pd.read_csv(path)
# "Unnamed: 0" is presumably the index column written when the CSV was saved - TODO confirm
metadata = metadata.drop(columns=["Unnamed: 0"])
metadata.head()

A snapshot of the newly created artifact:
<img src="https://i.imgur.com/QVHNpcB.png">

In [None]:
def label_sizes(col):
    """Return ``(labels, sizes, uc)`` for column ``col`` of the global ``metadata`` frame.

    ``sizes`` is the value-count Series, ``labels`` is its index and ``uc`` is
    the number of unique values in the column.
    """
    counts = metadata[col].value_counts()
    return counts.index, counts, metadata[col].nunique()

def _draw_donut(ax, col, colors):
    """Draw the value distribution of metadata column ``col`` on ``ax`` as a donut chart."""
    labels, sizes, uc = label_sizes(col)

    if col == "De-identification Method":
        # Shorten the long default label so it fits next to the chart
        labels = [
            label.replace("CTP Default:  based on DICOM PS3.15 AnnexE. Details in 0012,0064",
                          "CTP Default:  based on DICOM PS3.15")
            for label in labels
        ]

    # One explode offset per wedge
    ax.pie(sizes, explode=(0.05,)*uc, colors=colors, startangle=60, labels=labels,autopct='%1.0f%%', pctdistance=0.6)
    # A white disc over the centre turns the pie into a donut
    ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
    ax.set_title(col + " Distribution",weight="bold")

def plot_pie(col1,col2,c1,c2):
    """Plot donut charts of two metadata columns side by side.

    Args:
        col1, col2: column names of the global ``metadata`` frame (left, right).
        c1, c2: wedge colour lists for the left and right chart.

    The long "De-identification Method" label is shortened on whichever side
    that column is plotted (previously only when it was ``col1``).
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(24,10))
    _draw_donut(ax1, col1, c1)
    _draw_donut(ax2, col2, c2)
    plt.show()
    
plot_pie("Modality","Photometric Interpretation",['#5C8DFF','#abc4ff'],['#05979E','#87F5FB'])

# Horizontal count plots, one figure per (column, palette) pair
for count_col, palette in (
    ("Body Part Examined", "PRGn"),
    ("Private Creator", ['#F9ADA0','#F9627D',"#6DAEDB"]),
):
    plt.figure(figsize=(16, 8))
    sns.countplot(y=count_col, data=metadata, linewidth=3, palette=palette)
    plt.title(count_col + " Distribution", font="Serif", size=20, weight="bold")
    plt.show()

plot_pie("De-identification Method","Patient's Sex",['#F3C98B',"#fff3b0",'#DE8E17'],['#E6C4E9','#C77ACD'])

In [None]:
# WandB plots

# Copy of the metadata with the "Patient's Sex" column renamed to "Patient Sex"
m = metadata.copy().rename(columns={"Patient's Sex": "Patient Sex"})
cols_to_plot = [
    "Modality",
    "Photometric Interpretation",
    "Body Part Examined",
    "Private Creator",
    "De-identification Method",
    "Patient Sex",
]

# One W&B run and bar chart per column
for col in cols_to_plot:
    value_counts_df = count_values(m, col)
    plot_wb_bar(value_counts_df, col, 'counts')

In [None]:
# initializing the run
run = wandb.init(project="siim",
                 job_type="upload",
                 config = CONFIG
                 )
try:
    # creating an artifact
    artifact = wandb.Artifact(name="dicom_metadata_image", type="raw_data")

    # setting up a WandB Table object to hold the dataset:
    # one image column followed by one column per metadata field
    # NOTE(review): this uploads "Patient's Name" and "Patient ID" - confirm
    # the values are de-identified before logging them to a shared project.
    metadata_fields = ["Body Part Examined","Image Type","Modality","Patient's Name","Patient ID","Patient's Sex","Study Instance UID"]
    columns = ['image'] + metadata_fields
    table = wandb.Table(
        columns=columns
    )

    for filename in train_df.path[0:5]:
        data_di, img_array = dicom_dataset_to_dict(filename,'fetch_both_values')

        # raw image
        raw_img = wandb.Image(Image.fromarray(img_array))

        # adding a row to the table; fields missing from a file are stored as None
        table.add_data(raw_img, *(data_di.get(field) for field in metadata_fields))

    # adding the table to the artifact
    artifact.add(table, "dicom_examples")

    # logging the artifact
    run.log_artifact(artifact)
finally:
    # finish the run even if reading a file or logging fails
    run.finish()

Visualizing the DICOM data in a W&B table: 

<img src = "https://i.imgur.com/SeAidj8.gif">

### Images and Classes

In [None]:
# initializing the run
run = wandb.init(project="siim",
                 job_type="upload",
                 config = CONFIG
                 )
try:
    # creating an artifact 
    artifact = wandb.Artifact(name="dicom_images", type="raw_data")

    # setting up a WandB Table object to hold the dataset
    columns=["dicom image", "class"]

    table = wandb.Table(
        columns=columns
    )

    classes = ['Negative for Pneumonia','Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']
    for siim_class in classes:
        print(siim_class)
        # first two rows flagged with this class
        for _, sample in train_df[train_df[siim_class]==1].iloc[:2].iterrows():
            filename = sample['path']
            # only the image is needed here; the metadata dict is discarded
            _, img_array = dicom_dataset_to_dict(filename,'fetch_both_values')

            fig, ax = plt.subplots(1, 2, figsize=[15, 8])
            ax[0].imshow(img_array, cmap=plt.cm.gray)
            ax[1].imshow(img_array, cmap=plt.cm.plasma)   
            plt.show()

            # raw image
            raw_img = wandb.Image(Image.fromarray(img_array))

            # adding a row to the table (kept separate from the dataframe row `sample`)
            table.add_data(raw_img, siim_class)

    # adding the table to the artifact
    artifact.add(table, "raw_examples")

    # logging the artifact
    run.log_artifact(artifact)
finally:
    # finish the run even if reading a file or logging fails
    run.finish()

We can interact with the W&B table by specifying filters on any column to **limit the visible rows down to only rows that match**!

Here I've filtered the table to show only the images whose class label is ```Atypical Appearance``` or ```Indeterminate Appearance```.

<img src = "https://i.imgur.com/Cbcn9nP.gif">

Super thankful to @[xhlulu](https://www.kaggle.com/xhlulu) for converting [dicom image data to jpg files](https://www.kaggle.com/xhlulu/siim-covid-19-convert-to-jpg-256px)! ⚡

In [None]:
train_jpg_directory = '../input/siim-covid19-resized-to-256px-jpg/train'
test_jpg_directory = '../input/siim-covid19-resized-to-256px-jpg/test'

def getImagePaths(path):
    """Return the full path of every file found under ``path``, walking subdirectories."""
    return [
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(path)
        for filename in filenames
    ]

# Collect every file path under the resized-JPG train and test directories
train_images_path = getImagePaths(train_jpg_directory)
test_images_path = getImagePaths(test_jpg_directory)

# Report how many files were found in each split
for split_name, split_paths in (("train", train_images_path), ("test", test_images_path)):
    print(f"{y_}Number of {split_name} images: {g_} {len(split_paths)}\n")

def getShape(data, images_paths):
    """Report whether every image in ``images_paths`` has the same shape.

    Args:
        data: label prefixed to the returned message (e.g. 'train').
        images_paths: list of image file paths readable by ``cv2.imread``.

    Returns:
        ``"<data> - Same image shape <shape>"`` when all shapes match,
        ``"<data> - Different image shape"`` on the first mismatch, or
        ``"<data> - No images found"`` for an empty list.

    Raises:
        ValueError: if an image cannot be read.
    """
    if not images_paths:
        return data + " - No images found"

    reference_shape = None
    for image_path in images_paths:
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError("Could not read image: " + str(image_path))
        if reference_shape is None:
            reference_shape = image.shape
        elif image.shape != reference_shape:
            return data + " - Different image shape"

    # Only conclude "same" after every image has been checked; the old code
    # returned during the first iteration and never looked past the first image.
    return data + " - Same image shape " + str(reference_shape)

In [None]:
# Log the number of train/test JPG files in a dedicated run named 'count'
run = wandb.init(project='siim', name='count',config = CONFIG)

sample_counts = {
    'Training samples': len(train_images_path),
    'Test samples': len(test_images_path),
}
wandb.log(sample_counts)

run.finish()

### Checking if images in each directory have the same shape

In [None]:
getShape('train',train_images_path)

In [None]:
getShape('test',test_images_path)

# Data Augmentation

In [None]:
def plot_augmentations(images, titles, sup_title):
    """Show ``images`` with their ``titles`` on a 3x4 grid under the heading ``sup_title``.

    Images are placed row by row; grid cells beyond the supplied images are
    left empty.
    """
    fig, axes = plt.subplots(figsize=(20, 16), nrows=3, ncols=4, squeeze=False)

    for position, (picture, caption) in enumerate(zip(images, titles)):
        grid_row, grid_col = divmod(position, 4)
        panel = axes[grid_row][grid_col]
        panel.imshow(picture)
        panel.set_title(caption, fontsize=15)

    plt.tight_layout()
    fig.suptitle(sup_title, fontsize = 20)
    fig.subplots_adjust(wspace=0.2, hspace=0.2, top=0.93)
    plt.show()
    
def augment(paths, data):
    """Plot each image in ``paths`` next to eleven albumentations transforms of it.

    Args:
        paths: iterable of image file paths readable by ``cv2.imread``.
        data: label used in the figure title (e.g. 'train' or 'test').

    Raises:
        ValueError: if an image cannot be read.
    """
    # (title, transform) pairs kept together so titles and transforms cannot
    # drift out of step
    augmentations = [
        ("RandomSunFlare", A.RandomSunFlare(p=1)),
        ("RandomFog", A.RandomFog(p=1)),
        ("RandomBrightness", A.RandomBrightness(p=1)),
        ("RandomCrop", A.RandomCrop(p=1,height = 128, width = 128)),
        ("Rotate", A.Rotate(p=1, limit=90)),
        ("RGBShift", A.RGBShift(p=1)),
        ("RandomSnow", A.RandomSnow(p=1)),
        ("HorizontalFlip", A.HorizontalFlip(p=1)),
        ("VerticalFlip", A.VerticalFlip(p=1)),
        ("RandomContrast", A.RandomContrast(limit = 0.5,p = 1)),
        ("HSV", A.HueSaturationValue(p=1,hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=50)),
    ]

    for image_path in paths:
        # file name without extension, independent of how deep the path is
        # (the old split("/")[4] only worked for one directory depth)
        image_name = os.path.splitext(os.path.basename(image_path))[0]

        # reading image
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None
            raise ValueError("Could not read image: " + str(image_path))
        # cv2 loads channels as BGR while matplotlib's imshow interprets them as RGB
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # original image first, then one augmented copy per transform; fresh
        # lists per image so no shared list is mutated between iterations
        titles = ["Original"] + [title for title, _ in augmentations]
        images = [image] + [transform(image = image)['image'] for _, transform in augmentations]

        sup_title = "Image Augmentation for " + data + " - " + image_name
        plot_augmentations(images, titles, sup_title)

### Data Augmentation (train samples)

In [None]:
augment(train_images_path[0:2],'train')

### Data Augmentation (test samples)

In [None]:
# Use the test image paths here; the train paths were passed by mistake
augment(test_images_path[0:2],'test')

I've created a [dataset](https://www.kaggle.com/ruchi798/siimfisabiorsna-covid19-detection-augmented) of image augmentations for all the training and testing images as well 🥳

This is what my [project](https://wandb.ai/ruchi798/siim?workspace=user-ruchi798) looks like on the W&B dashboard ⬇️
<img src="https://i.imgur.com/lFIrsJT.png">

Work in Progress ⏳