In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Root folder of the competition data; every later cell builds its paths from this.
current_path='/kaggle/input/siim-covid19-detection'
# List the top-level entries of the dataset (sorted, because os.listdir order is arbitrary).
print (sorted(os.listdir(current_path)))

In [None]:
# Download pinned conda-forge package archives into the working directory
# (-q silences wget's progress output).
# NOTE(review): presumably gdcm is needed so pydicom can decode compressed pixel
# data, and the other archives are its dependencies -- confirm.
!wget 'https://anaconda.org/conda-forge/libjpeg-turbo/2.1.0/download/linux-64/libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -q
!wget 'https://anaconda.org/conda-forge/libgcc-ng/9.3.0/download/linux-64/libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -q
!wget 'https://anaconda.org/conda-forge/gdcm/2.8.9/download/linux-64/gdcm-2.8.9-py37h500ead1_1.tar.bz2' -q
!wget 'https://anaconda.org/conda-forge/conda/4.10.1/download/linux-64/conda-4.10.1-py37h89c1867_0.tar.bz2' -q
!wget 'https://anaconda.org/conda-forge/certifi/2020.12.5/download/linux-64/certifi-2020.12.5-py37h89c1867_1.tar.bz2' -q
!wget 'https://anaconda.org/conda-forge/openssl/1.1.1k/download/linux-64/openssl-1.1.1k-h7f98852_0.tar.bz2' -q
# Install each downloaded archive from its local file, in the same order as the
# downloads above (-y answers the confirmation prompt automatically).
!conda install 'libjpeg-turbo-2.1.0-h7f98852_0.tar.bz2' -c conda-forge -y
!conda install 'libgcc-ng-9.3.0-h2828fa1_19.tar.bz2' -c conda-forge -y
!conda install 'gdcm-2.8.9-py37h500ead1_1.tar.bz2' -c conda-forge -y
!conda install 'conda-4.10.1-py37h89c1867_0.tar.bz2' -c conda-forge -y
!conda install 'certifi-2020.12.5-py37h89c1867_1.tar.bz2' -c conda-forge -y
!conda install 'openssl-1.1.1k-h7f98852_0.tar.bz2' -c conda-forge -y


<b><a href='#1'>1:  Reading Files</a></b>
<br>
<b><a href='#2'>2:  Adding Images Path to DataFrame</a></b><br>
<b><a href='#3'>3:  Combining Image Level df and Study Level df</a></b><br>
<b><a href='#4'>4:  Adding Data from Image File</a></b><br>
<b><a href='#5'>5:  What exactly are we PREDICTING?</a></b><br>
<b><a href='#6'>6:  Distribution of Gender</a></b><br>
<b><a href='#7'>7:  Xray Categories</a></b><br>
<b> &nbsp;&nbsp;  <a href='#7.1'>7.1: Chest</a></b><br>
<b> &nbsp;&nbsp;  <a href='#7.2'>7.2: Thorax</a></b><br>
<b> &nbsp;&nbsp;  <a href='#7.3'>7.3: Empty Label</a></b><br>
<b> &nbsp;&nbsp;  <a href='#7.4'>7.4: Skull</a></b><br>
<b> &nbsp;&nbsp;  <a href='#7.5'>7.5: Port Chest</a></b><br>
<br>
<b><a href='#8'>8:  Distribution Of Labels</a></b><br>

<a id='1'></a>
#### Reading Files


In [None]:
import pandas as pd
# Load the three competition CSV files that sit directly under current_path.
train_image_level, train_study_level, sample_submission = (
    pd.read_csv(os.path.join(current_path, csv_name))
    for csv_name in (
        'train_image_level.csv',
        'train_study_level.csv',
        'sample_submission.csv',
    )
)
# Preview the image-level annotations.
train_image_level.head()

In [None]:
# Preview the first rows of the study-level table.
train_study_level.head()

Note that all images are stored in paths with the form study/series/image. The study ID here relates directly to the study-level predictions, and the image ID is the ID used for image-level predictions.

<a id='2'></a>
#### Adding Images paths to dataframe
Now we will add the image path to the dataframe so we have the complete dataset

In [None]:
# Collect the path of every file under the train/ and test/ folders.
train_dir=os.path.join(current_path,'train')
test_dir=os.path.join(current_path,'test')
all_files_train=[os.path.join(dirname,filename) for dirname,_,filenames in os.walk(train_dir) for filename in filenames]
all_files_test=[os.path.join(dirname,filename) for dirname,_,filenames in os.walk(test_dir) for filename in filenames]

# Key each path by "<file name without .dcm>_image", the id format used in train_image_level.csv.
all_files_train_dict={os.path.basename(x).replace('.dcm','_image'): x for x in all_files_train}
all_files_test_dict={os.path.basename(x).replace('.dcm','_image'): x for x in all_files_test}
print (len(all_files_train)+len(all_files_test))

# Strip the '_image' suffix first and look paths up through the re-suffixed id.
# This keeps the cell re-runnable: on a second run the ids are already bare,
# and mapping them directly would produce only missing paths.
train_image_level['id']=train_image_level['id'].str.replace('_image','',regex=False)
train_image_level['path']=(train_image_level['id']+'_image').map(all_files_train_dict)

# Only claim completeness after actually checking that every row found its file.
missing_paths=int(train_image_level['path'].isna().sum())
assert missing_paths==0, f"{missing_paths} rows of train_image_level have no matching .dcm file under {train_dir}"
print ("All files present")

# Path relative to the train folder (study/series/image.dcm), independent of
# how deep current_path happens to be.
train_image_level['simplified_path']=train_image_level['path'].apply(lambda x: os.path.relpath(x, train_dir))

# Give the study table the same key column name as the image table and drop
# the '_study' suffix so both tables can be joined on StudyInstanceUID.
train_study_level=train_study_level.rename(columns={'id':'StudyInstanceUID'})
train_study_level['StudyInstanceUID']=train_study_level['StudyInstanceUID'].str.replace('_study','',regex=False)

We have now added the image path to the train_image_level dataframe.

In [None]:
# Preview the image-level table, which now also carries the path columns.
train_image_level.head()

<a id='3'></a>
#### Merging Study level Dataset and Image level dataset
We will now merge it with train_study_level so that we have one complete dataset to analyse.

* PLEASE NOTE that each study can have MULTIPLE IMAGES.
* For this reason we join the train_image_level dataframe, which has all the image IDs, with the train_study_level dataframe, which has all the study IDs.
* Because two or more images can share ONE study ID, the complete_data dataframe may contain duplicate StudyInstanceUID values.

In [None]:
# Attach the study-level labels to every image row; rows are matched on the
# shared StudyInstanceUID and only matching rows are kept (inner join).
complete_data=pd.merge(
    train_image_level,
    train_study_level,
    on='StudyInstanceUID',
    how='inner',
)

In [None]:
# Preview the merged image + study table.
complete_data.head()

Reordering columns for better view

In [None]:
# Column order for display: identifiers, image-level annotations,
# study-level label columns, then the file paths.
columns_reordered=(
    ['id', 'StudyInstanceUID']
    + ['boxes', 'label']
    + ['Negative for Pneumonia', 'Typical Appearance',
       'Indeterminate Appearance', 'Atypical Appearance']
    + ['path', 'simplified_path']
)
complete_data=complete_data[columns_reordered]


<a id='4'></a>
#### Adding Data from image file (.dcm)
Adding data from image file as well

In [None]:
import pydicom
from pydicom import dcmread
def process_dicom(dicom_obj):
    """Flatten a DICOM dataset into a plain ``{element name: value}`` dict.

    Iterates over the elements of ``dicom_obj`` and records each element's
    value under its ``name``. The pixel-data element (tag ``(0x7fe0, 0x0010)``)
    is left out. If two elements share a name, the later one wins.
    """
    pixel_data_tag = (0x7fe0, 0x0010)  # skipped: only the metadata is wanted
    return {
        element.name: dicom_obj[element.tag].value
        for element in dicom_obj
        if not (element.tag == pixel_data_tag)
    }

In [None]:
from tqdm import tqdm
# DICOM element names to copy from every image header into complete_data.
needed_columns=["Patient ID","Patient's Sex","Body Part Examined","Imager Pixel Spacing",
                "Photometric Interpretation"]
# "Study Instance UID","Study ID"

# Build exactly one record per row of complete_data. Appending to independent
# per-column lists would silently shift values between rows as soon as one file
# lacks a tag (or holds a plain-list value), so absent values are stored as None.
dicom_records=[]
for _,row in tqdm(complete_data.iterrows(), total=len(complete_data)):
    # stop_before_pixels=True: only the header is needed here.
    dicom_obj=pydicom.dcmread(row['path'],stop_before_pixels=True)
    dicom_obj_dict=process_dicom(dicom_obj)
    record={}
    for col in needed_columns:
        value=dicom_obj_dict.get(col)
        # Plain-list values are not kept, but the row keeps its slot.
        record[col]=None if type(value) is list else value
    dicom_records.append(record)

# Column-wise view of the records: element name -> one value per row.
dicom_dict={col:[record[col] for record in dicom_records] for col in needed_columns}

for col in needed_columns:
    complete_data[col]=dicom_dict[col]

<a id='5'></a>
#### What exactly are we predicting?
#### It's important here to distinguish BETWEEN study and image data.
* For example, there are 6054 studies but a total of 6334 images, so a single study can cover more than ONE image.

* For prediction we need to make TWO kinds of predictions
    * The first one is at the image level (train_image_level)
        * If there is an object in the image, we predict a confidence score and the corresponding bounding box
        * For example: "opacity 1 1543 341 2484 1002 opacity 0.9 222 22 22 22"
        * This means that we are predicting two objects in the image: the first with 100% confidence and the second with 90% confidence.
        * The four numbers after the confidence score are the coordinates of the bounding box: xmin, ymin, xmax, ymax.
        * In general our prediction will look like "opacity confidence_score xmin ymin xmax ymax" OR, if there is no object, like "ImageId none 1 0 0 1 1"
    * The second one is at the study level (train_study_level)
        * Here OUR model has to predict one of the following labels: "negative", "typical", "indeterminate", "atypical"
        * The prediction has to be in the format "StudyID prediction confidencescore 0 0 1 1"
        * For example, if our prediction is negative with a confidence score of 60%, then our prediction should be as follows
        * "StudyID negative 0.6 0 0 1 1"
    * Please note that for the train_study_level prediction we add the StudyID, and for the train_image_level prediction we add the ImageId, to distinguish our predictions
        



Seems like we have a good enough distribution of MALE and FEMALE

<a id='6'></a>
#### Distribution of MALE AND FEMALE chest XRAY images

In [None]:
import seaborn as sns
# Count the images per recorded patient sex, draw the bar chart, then print
# the exact numbers behind the bars.
sex_counts=complete_data["Patient's Sex"].value_counts()
sns.countplot(data=complete_data, x="Patient's Sex")
print (sex_counts)

The Body Part Examined column needs a bit of data cleaning.

### I just found out that PECHO is chest in spanish. :D

In [None]:
import seaborn as sns
import matplotlib.pylab as plt
# Draw on an explicit axes instead of relying on pyplot's implicit "current
# axes", and rotate the tick labels after the bars (and their category labels)
# exist, so the rotation does not depend on call order.
fig, ax = plt.subplots()
sns.countplot(x="Body Part Examined",data=complete_data,ax=ax)
ax.tick_params(axis='x', labelrotation=45)
# Exact counts behind the bars (raw, uncleaned labels).
print (complete_data["Body Part Examined"].value_counts())

<a id='7'></a>
#### Let's view some XRAY images from the different categories: Head, Thorax, Skull, etc.

In [None]:
def replace_with_correct(x):
    """Normalise a raw "Body Part Examined" value to a lower-case category.

    Rules, applied to the stripped, lower-cased value:
      * missing (non-string) or blank values -> ``'empty'``
      * anything containing "pecho" (Spanish for chest) -> ``'chest'``
      * the known "torax" spellings, in any letter case -> ``'thorax'``
      * everything else -> the cleaned value itself

    Parameters
    ----------
    x : str or missing value
        Raw label as read from the DICOM header.

    Returns
    -------
    str
        The normalised, lower-case category name.
    """
    # None/NaN go into the same bucket as a blank label instead of crashing
    # on .strip().
    if not isinstance(x, str):
        return 'empty'

    cleaned = x.strip().lower()
    if cleaned == '':
        return 'empty'

    # PECHO is chest in Spanish; one case-insensitive check covers all spellings.
    if 'pecho' in cleaned:
        return 'chest'

    # Spellings of "thorax" seen in the data, matched case-insensitively so
    # that e.g. 'Torax' is treated the same way as 'TORAX'.
    thorax_spellings = {s.lower() for s in ('TORAX', 'TÒRAX', '2- TORAX', 'T?RAX')}
    if cleaned in thorax_spellings:
        return 'thorax'

    return cleaned
    
# Replace every raw label in the column with its normalised category.
complete_data['Body Part Examined']=complete_data['Body Part Examined'].map(replace_with_correct)

### We see that most of the data is of the chest, but there are skull X-rays and thorax X-rays as well.
### Let's view some sample images of each body part.

In [None]:
# A fresh figure with an explicit axes replaces the clear-current-figure step
# and the dependence on pyplot's implicit state; the tick labels are rotated
# after the bars (and their category labels) exist.
fig, ax = plt.subplots()
sns.countplot(x="Body Part Examined",data=complete_data,ax=ax)
ax.tick_params(axis='x', labelrotation=45)
# Exact counts behind the bars (cleaned labels).
print (complete_data["Body Part Examined"].value_counts())

In [None]:
def plot_figures(figures, nrows = 1, ncols=1):
    """Show a dict of images on an ``nrows`` x ``ncols`` grid of subplots.

    Parameters
    ----------
    figures : dict
        Maps a title to the image data passed to ``imshow``. Images are placed
        row by row in the dict's iteration order.
    nrows, ncols : int
        Shape of the subplot grid.

    Raises
    ------
    IndexError
        If there are more images than grid cells.
    """
    # squeeze=False always yields a 2-D array of axes, so the default 1x1 grid
    # (which would otherwise be a bare Axes without .ravel()) works as well.
    fig, axeslist = plt.subplots(ncols=ncols, nrows=nrows, squeeze=False)
    axes = axeslist.ravel()
    if len(figures) > len(axes):
        raise IndexError(
            "%d images do not fit on a %dx%d grid" % (len(figures), nrows, ncols)
        )

    # Hide the frame and ticks of every cell, including cells that stay empty.
    for ax in axes:
        ax.set_axis_off()

    for ax, (title, image) in zip(axes, figures.items()):
        # Name the colormap explicitly: plt.gray() returns None and only works
        # by switching the global default colormap as a side effect.
        ax.imshow(image, cmap='gray')
        ax.set_title(title)
    fig.tight_layout() # optional

# Distinct (cleaned) body-part categories, most frequent first.
body_part_counts=complete_data["Body Part Examined"].value_counts()
all_cats=body_part_counts.index.tolist()
print (all_cats)

<a id='7.1'></a>
#### SAMPLE IMAGES OF CHEST

In [None]:
# All rows labelled 'chest'.
chest_rows=complete_data[complete_data['Body Part Examined']=='chest']
# Show up to 8 random images; capping the sample size keeps .sample() from
# raising when the category has fewer than 8 rows.
complete_data_chest=chest_rows.sample(min(8,len(chest_rows)))
# Title each image with its id; pixel_array reads the image data from the file.
figures={x['id']:dcmread(x['path']).pixel_array for i,x in complete_data_chest.iterrows()}
plot_figures(figures,2,4)


<a id='7.2'></a>
#### SAMPLE IMAGES OF THORAX

In [None]:
# All rows labelled 'thorax'.
thorax_rows=complete_data[complete_data['Body Part Examined']=='thorax']
# Show up to 8 random images; capping the sample size keeps .sample() from
# raising when the category has fewer than 8 rows.
complete_data_thorax=thorax_rows.sample(min(8,len(thorax_rows)))
# Title each image with its id; pixel_array reads the image data from the file.
figures={x['id']:dcmread(x['path']).pixel_array for i,x in complete_data_thorax.iterrows()}
plot_figures(figures,2,4)


<a id='7.3'></a>
#### SAMPLE IMAGES OF (EMPTY) LABEL

In [None]:
# All rows whose body-part label was blank (normalised to 'empty').
empty_rows=complete_data[complete_data['Body Part Examined']=='empty']
# Show up to 8 random images; capping the sample size keeps .sample() from
# raising when the category has fewer than 8 rows.
complete_data_empty=empty_rows.sample(min(8,len(empty_rows)))
# Title each image with its id; pixel_array reads the image data from the file.
figures={x['id']:dcmread(x['path']).pixel_array for i,x in complete_data_empty.iterrows()}
plot_figures(figures,2,4)

<a id='7.4'></a>
#### SAMPLE IMAGES OF SKULL

In [None]:
# All rows labelled 'skull'.
skull_rows=complete_data[complete_data['Body Part Examined']=='skull']
# Show up to 8 random images; capping the sample size keeps .sample() from
# raising when the category has fewer than 8 rows.
complete_data_skull=skull_rows.sample(min(8,len(skull_rows)))
# Title each image with its id; pixel_array reads the image data from the file.
figures={x['id']:dcmread(x['path']).pixel_array for i,x in complete_data_skull.iterrows()}
plot_figures(figures,2,4)

<a id='7.5'></a>
#### SAMPLE IMAGES OF PORT CHEST

In [None]:
# All rows labelled 'port chest'.
portchest_rows=complete_data[complete_data['Body Part Examined']=='port chest']
# Show up to 8 random images; capping the sample size keeps .sample() from
# raising when the category has fewer than 8 rows.
complete_data_portchest=portchest_rows.sample(min(8,len(portchest_rows)))
# Title each image with its id; pixel_array reads the image data from the file.
figures={x['id']:dcmread(x['path']).pixel_array for i,x in complete_data_portchest.iterrows()}
plot_figures(figures,2,4)

#### We see that, irrespective of the category, the images are of the chest. Hence we can deduce that the Body Part Examined field is not useful, as it doesn't categorise the images correctly.


<a id='8'></a>
#### Let's analyse the classes in Train Study Level
#### Distribution of Labels

In [None]:
from matplotlib.ticker import FormatStrFormatter

# Every column except the first is treated as a label column.
# NOTE(review): presumably the first column is StudyInstanceUID and the rest
# hold 0/1 indicators -- confirm against train_study_level.
columns=list(train_study_level.columns)[1:]
train_study_bar=train_study_level[columns]
# Count how often each distinct value occurs per label column, then transpose
# so that each label becomes one (stacked) bar.
counts = train_study_bar.apply(lambda x: x.value_counts()).transpose()
fig, ax = plt.subplots()
counts.plot(ax=ax,kind='bar', stacked=True, rot=45)
# Format the y tick labels through a formatter. Overwriting the label texts
# with set_yticklabels without fixing the tick positions leaves the labels
# detached from the ticks they describe and triggers a matplotlib warning.
ax.yaxis.set_major_formatter(FormatStrFormatter('%3.2f'))
ax.yaxis.grid(True)
ax.set_axisbelow(True)  # keep the grid lines behind the bars
ax.set_title("Distribution of Labels")
plt.show()
