## Introduction
This notebook shows how anyone can get started with this competition.

This notebook is heavily inspired by: https://www.kaggle.com/ruchi798/siim-covid-19-detection-eda-data-augmentation

**Problem Statement:**
* You’ll identify and localize COVID-19 abnormalities on chest radiographs. In particular, you'll categorize the radiographs as negative for pneumonia or typical, indeterminate, or atypical for COVID-19. 

**Evaluation Metric**
*  mean Average Precision (mAP)



## Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm 
import glob 
import os
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
import pprint
import pydicom as dicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import albumentations as A
import cv2
import wandb
from PIL import Image

## Getting the Input Data 

We are given two CSV files, `train_study_level` and `train_image_level`, and a `train` folder containing all the training images.

We merge the two CSV files into a DataFrame called `train` and add a column called "path" (the path to each image).

In [None]:
# Study-level table; its `id` column is later stripped of the "_study" suffix
# to build the StudyInstanceUID merge key.
train_study_level=pd.read_csv("../input/siim-covid19-detection/train_study_level.csv")
train_study_level.head()

In [None]:
# Image-level table; it is merged with the study-level table on StudyInstanceUID.
train_image_level=pd.read_csv("../input/siim-covid19-detection/train_image_level.csv")
train_image_level.head()

In [None]:
# Build the merge key from the study-level id by removing the "_study" suffix.
# The step is guarded so that re-running this cell (after `id` has already been
# dropped) does not raise a KeyError; the resulting frames are the same as before.
if "id" in train_study_level.columns:
    train_study_level["StudyInstanceUID"]=train_study_level["id"].apply(lambda x:x.replace('_study','') )
    del train_study_level['id']

# Default (inner) merge: one output row per image row with a matching study row.
train=train_image_level.merge(train_study_level,on="StudyInstanceUID")
train.head()

`glob`: a module used to retrieve files/pathnames matching a specified pattern.


In [None]:
train_dir="../input/siim-covid19-detection/train/"
test_dir="../input/siim-covid19-detection/test/"

training_paths = []

# Files are searched two directory levels below each study folder.
# A study can contain more than one image, so taking "the first file of the
# study" can attach the wrong file to a row. We therefore look for the file
# whose name matches the row's image id first.
# NOTE(review): assumes `train['id']` looks like "<image>_image" and that files
# are named "<image>.<ext>" — confirm against the competition data description.
for study_uid, image_id in tqdm(zip(train['StudyInstanceUID'], train['id']), total=len(train)):
    image_name = str(image_id).replace('_image', '')
    matches = sorted(glob.glob(os.path.join(train_dir, study_uid, "*", image_name + ".*")))
    if not matches:
        # Fall back to the previous behaviour: any file found for this study
        # (sorted so the choice is deterministic).
        matches = sorted(glob.glob(os.path.join(train_dir, study_uid, "*", "*")))
    if not matches:
        raise FileNotFoundError(f"No image file found for study {study_uid} under {train_dir}")
    training_paths.append(matches[0])

train['path'] = training_paths

In [None]:
# Preview the merged table, which now includes the `path` column.
train.head()

## Data Exploration 

In [None]:
fig,ax=plt.subplots(4,1)

# One density panel per study-level label column.
# `fill=True` replaces the deprecated `shade=True` keyword of sns.kdeplot.
for axis, label_column in zip(
    ax,
    (
        "Negative for Pneumonia",
        "Typical Appearance",
        "Indeterminate Appearance",
        "Atypical Appearance",
    ),
):
    sns.kdeplot(train[label_column],fill=True,ax=axis)
    axis.set_title(label_column)

# Spread the stacked panels out so the titles do not overlap.
fig.subplots_adjust( hspace=2.0, top=2.0)
plt.show()

In [None]:
fig,ax=plt.subplots(4,1)

# One bar chart per study-level label column, showing how often each value occurs.
for axis, label_column in zip(
    ax,
    (
        "Negative for Pneumonia",
        "Typical Appearance",
        "Indeterminate Appearance",
        "Atypical Appearance",
    ),
):
    sns.countplot(x=train[label_column],ax=axis)
    axis.set_title(label_column)

# Spread the stacked panels out so the titles do not overlap.
fig.subplots_adjust( hspace=2.0, top=2.0)
plt.show()

## Dicom Data
* The images given to us are in the DICOM format.
* Digital Imaging and Communications in Medicine (DICOM) is the standard for the communication and management of medical imaging information and related data.
* We will need to read the DICOM images and convert them to JPEG for convenience, using the `pydicom` Python library.

#### Images and Metadata

In [None]:
# Module-level switches read by dicom_dataset_to_dict:
# voi_lut        - when true, pixel data is passed through apply_voi_lut.
# fix_monochrome - when true, MONOCHROME1 images are inverted.
voi_lut=True
fix_monochrome=True

def _dicom_header_to_dict(dicom_header):
    """Convert the non-pixel elements of a pydicom dataset to a dict keyed by element name."""
    dicom_dict = {}
    # NOTE(review): the result is discarded; presumably this call forces pydicom
    # to materialise lazily-read elements before .values() is iterated —
    # confirm before removing.
    repr(dicom_header)
    for dicom_value in dicom_header.values():
        if dicom_value.tag == (0x7fe0, 0x0010):
            #discard pixel data
            continue
        if isinstance(dicom_value.value, dicom.dataset.Dataset):
            # Recurse on the nested dataset itself (not via the file-reading
            # entry point, which expects a filename and returns a tuple).
            dicom_dict[dicom_value.name] = _dicom_header_to_dict(dicom_value.value)
        else:
            dicom_dict[dicom_value.name] = _convert_value(dicom_value.value)
    return dicom_dict


def dicom_dataset_to_dict(filename):
    """Read a DICOM file and return its metadata and an 8-bit image.

    Credit: https://github.com/pydicom/pydicom/issues/319
            https://www.kaggle.com/raddar/convert-dicom-to-np-array-the-correct-way

    Parameters
    ----------
    filename : path passed to ``dicom.dcmread``.

    Returns
    -------
    (dicom_dict, modified_image_data)
        ``dicom_dict`` maps element names to converted values (pixel data and
        'Pixel Representation' are excluded); ``modified_image_data`` is the
        pixel array rescaled to the 0-255 range as ``np.uint8``.

    Behaviour is controlled by the module-level flags ``voi_lut`` and
    ``fix_monochrome``.
    """

    dicom_header = dicom.dcmread(filename)

    #====== DICOM FILE DATA ======
    dicom_dict = _dicom_header_to_dict(dicom_header)
    # Not every file necessarily carries this element; do not fail when it is absent.
    dicom_dict.pop('Pixel Representation', None)

    #====== DICOM IMAGE DATA ======
    # VOI LUT (if available by DICOM device) is used to transform raw DICOM data to "human-friendly" view
    if voi_lut:
        data = apply_voi_lut(dicom_header.pixel_array, dicom_header)
    else:
        data = dicom_header.pixel_array
    # depending on this value, X-ray may look inverted - fix that:
    if fix_monochrome and dicom_header.PhotometricInterpretation == "MONOCHROME1":
        data = np.amax(data) - data
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division so the
    # result is an all-zero image rather than NaNs cast to uint8.
    if peak > 0:
        data = data / peak
    modified_image_data = (data * 255).astype(np.uint8)

    return dicom_dict, modified_image_data

def _sanitise_unicode(s):
    return s.replace(u"\u0000", "").strip()

def _convert_value(v):
    t = type(v)
    if t in (list, int, float):
        cv = v
    elif t == str:
        cv = _sanitise_unicode(v)
    elif t == bytes:
        s = v.decode('ascii', 'replace')
        cv = _sanitise_unicode(s)
    elif t == dicom.valuerep.DSfloat:
        cv = float(v)
    elif t == dicom.valuerep.IS:
        cv = int(v)
    else:
        cv = repr(v)
    return cv

In [None]:
# Show the first five training images in two colour maps, followed by the
# metadata dict extracted from each file.
for filename in train["path"].iloc[:5]:
    df, img_array = dicom_dataset_to_dict(filename)

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))
    ax[0].imshow(img_array, cmap="gray")
    ax[1].imshow(img_array, cmap="plasma")
    plt.show()

    # `df` is the metadata dict returned by dicom_dataset_to_dict, not a DataFrame.
    pprint.pprint(df)

* Converting and storing DICOM images in JPEG format

In [None]:
classes = ['Negative for Pneumonia','Typical Appearance', 'Indeterminate Appearance', 'Atypical Appearance']
for siim_class in classes:
    print(siim_class)
    # Rows where this label column equals 1; only the first two are shown.
    for _, row in train[train[siim_class]==1].head(2).iterrows():
        filename = row['path']
        df, img_array = dicom_dataset_to_dict(filename)

        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 8))
        ax[0].imshow(img_array, cmap="gray")
        ax[1].imshow(img_array, cmap="plasma")
        plt.show()

        # NOTE(review): the PIL image is built but never written to disk in
        # this cell — add an img_object.save(...) call if JPEG files are wanted.
        img_object = Image.fromarray(img_array)
        
        

        

Converted Dataset : https://www.kaggle.com/xhlulu/siim-covid19-resized-to-256px-jpg

In [None]:
# Root folders of the JPEG dataset; each is walked recursively by getImagePaths.
train_jpg_directory = '../input/siim-covid19-resized-to-256px-jpg/train'
test_jpg_directory = '../input/siim-covid19-resized-to-256px-jpg/test'

def getImagePaths(path):
    """Recursively collect the full path of every file found under ``path``.

    Returns a list of paths (directory joined with file name) in the order
    ``os.walk`` yields them; no filtering by extension is applied.
    """
    return [
        os.path.join(dirname, filename)
        for dirname, _, filenames in os.walk(path)
        for filename in filenames
    ]

# Walk both JPEG folders (train first, then test) and report how many files each holds.
train_images_path, test_images_path = (
    getImagePaths(directory)
    for directory in (train_jpg_directory, test_jpg_directory)
)

print(f"Number of train images: {len(train_images_path)}\n")
print(f"Number of test images:  {len(test_images_path)}\n")

def getShape(data, images_paths):
    """Report whether every image in ``images_paths`` has the same shape.

    Parameters
    ----------
    data : str
        Label prefixed to the returned message (e.g. the split name).
    images_paths : sequence of str
        Image files to read with ``cv2.imread``.

    Returns
    -------
    str
        ``"<data> - Same image shape <shape>"`` when all images share the shape
        of the first one, ``"<data> - Different image shape"`` as soon as one
        differs, or ``"<data> - No images found"`` for an empty sequence.

    Raises
    ------
    ValueError
        If ``cv2.imread`` cannot read one of the files (it returns ``None``).
    """
    def _read_shape(image_path):
        # cv2.imread signals failure by returning None instead of raising.
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"Could not read image: {image_path}")
        return image.shape

    if not images_paths:
        return data + " - No images found"

    shape = _read_shape(images_paths[0])
    for image_path in images_paths:
        if _read_shape(image_path) != shape:
            return data +" - Different image shape"
    # Only conclude "same" after every image has been checked; returning from
    # inside the loop would stop after the first comparison.
    return data +" - Same image shape " + str(shape)