# EDA

In [8]:
import os
from os import listdir
import pandas as pd
import numpy as np
import glob
from skimage import exposure
import matplotlib.pyplot as plt
import matplotlib.patches as patches
%matplotlib inline

# pydicom
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
from fastai.imports import *
from fastai.medical.imaging import *


from pydicom import dcmread, read_file
from pydicom.data import get_testdata_file

import cv2

# color
from colorama import Fore, Back, Style

import seaborn as sns
sns.set(style="whitegrid")

# plotly
import plotly.express as px
import plotly
import plotly.graph_objects as go
import plotly.io as pio

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

import os
from ast import literal_eval


# Global matplotlib style for every static figure in the notebook.
# (No figure exists yet at this point, so there is nothing to show here.)
plt.style.use('fivethirtyeight')

Dado que se están utilizando imágenes DICOM, se debe instalar la librería correspondiente para su manejo.

In [1]:
!pip install python-gdcm

Collecting python-gdcm
  Downloading python_gdcm-3.0.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.4 MB)
[K     |████████████████████████████████| 9.4 MB 718 kB/s eta 0:00:01     |█████████████▉                  | 4.0 MB 807 kB/s eta 0:00:07
[?25hInstalling collected packages: python-gdcm
Successfully installed python-gdcm-3.0.9.1


## Exploración de datos

Cargamos los datos en memoria

In [None]:
# Root folder of the competition data; every other path in the notebook is built from it.
PATH = '/kaggle/input/siim-covid19-detection/'

# ANSI colour codes used by the summary print below (Fore / Style come from colorama).
y_ = Fore.YELLOW
g_ = Fore.GREEN
res = Style.RESET_ALL

# Image-level and study-level training labels.
image_df = pd.read_csv(PATH + 'train_image_level.csv', index_col=None)
study_df = pd.read_csv(PATH + 'train_study_level.csv', index_col=None)

# Show every column and never truncate cell text when a frame is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

print(f"{y_}Train image level csv shape : {image_df.shape}{res}\n{g_}Train study level csv shape : {study_df.shape}{res}")

In [None]:
# Flatten the directory tree under /kaggle/input into a single list of full file paths.
all_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]

Con los datos listos, procedemos a analizar lo que el conjunto de datos de estudio y de imágenes nos brindan

**Study level csv**

In [None]:
study_df.info()

In [None]:
study_df.head()

In [None]:
study_df.describe()

**Image Level csv**

In [None]:
image_df.info()

In [None]:
image_df.head()

In [None]:
image_df.describe()

Se procede a asignar un color distinto a cada una de las categorías.

In [None]:
# Reshape the study-level labels to long form: one row per (study id, label column) pair,
# then keep only the pairs whose value is non-zero.
study_long = pd.melt(study_df, id_vars=list(study_df.columns)[:1], value_vars=list(study_df.columns)[1:],
                     var_name='label', value_name='value')
study_long = study_long.loc[study_long['value'] != 0]

# Fixed colour per diagnosis label, shared by the bar and pie charts.
colors = {
    'Typical Appearance': '#DCD427',
    'Negative for Pneumonia': '#0092CC',
    'Indeterminate Appearance': '#CC3333',
    'Atypical Appearance': '#E6E6E6',
}

# Sum only the numeric 'value' column per label; summing the whole frame would also
# try to aggregate the string id column.
study_grp = (
    study_long
    .groupby('label', as_index=False)['value'].sum()
    .sort_values('value', ascending=False)
    .reset_index(drop=True)
)
study_grp['color'] = study_grp['label'].map(colors)
# Fail loudly if the data contains a label that has no colour assigned.
assert study_grp['color'].notna().all(), "label without an entry in `colors`"
study_grp

Se define una función para representar gráficamente la distribución de las etiquetas en el dataset de estudio : 

In [None]:
def plot_study_label(df, color_map=None, title='Study samples in train data'):
    """Draw an interactive bar chart of how many studies carry each label.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'label' column (bar names) and a 'value' column (bar heights).
        Bars are drawn in the row order of `df`.
    color_map : dict, optional
        Mapping label -> colour. Defaults to the notebook-level `colors` dict.
    title : str, optional
        Figure title.

    Notes
    -----
    Sets plotly's default template to "plotly_dark", which also affects every
    plotly figure created afterwards in the session.
    """
    if color_map is None:
        color_map = colors
    pio.templates.default = "plotly_dark"
    fig = px.bar(df, x='label', y='value',
                 hover_data=['label', 'value'], color='label',
                 color_discrete_map=color_map,
                 text='value')
    fig.update_layout(
        # Keep the bars in the order given by df instead of plotly's default ordering.
        xaxis={'categoryorder': 'array', 'categoryarray': list(df['label']),
               'title': None,
               'showgrid': False},
        yaxis={'showgrid': False,
               'title': 'Count'},
        showlegend=False,
        title=title)
    fig.update_traces(textfont_size=16)
    fig.show()

In [None]:
plot_study_label(study_grp)

Para una mejor comprensión respecto a la distribución, se procede a generar una gráfica de sectores:

In [None]:
# Share of each label as a percentage of all label occurrences, rounded to two decimals.
study_grp['pct'] = (study_grp['value'] / study_grp['value'].sum() * 100).round(2)

fig = go.Figure(data=[go.Pie(labels=study_grp['label'],
                             values=study_grp['pct'],
                             hole=.3,
                             # Pull every slice out by the same amount, however many labels there are.
                             pull=[0.1] * len(study_grp)
                            )
                     ]
               )
# Colour the slices with the per-label colours stored on study_grp and outline them in black.
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=16,
                  marker=dict(colors=study_grp['color'], line=dict(color='#000000', width=2))
                 )
fig.update_layout(title={'text': "% of labels in training data",
        'y':0.9,
        'x':0.45,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()



Por ahora tenemos una mejor comprensión de las etiquetas y de cómo están distribuidas dentro de ambos conjuntos de datos. Procederemos a explorar los archivos DICOM: tomaremos una imagen de muestra y la analizaremos:

In [None]:
# One training image, addressed as train/<study>/<series>/<image>.dcm under the dataset root.
file_path = os.path.join(PATH, "train/00086460a852/9e8302230c91/65761e66de9f.dcm")
# dcmread is pydicom's reader (read_file is its older alias); pixel data is loaded by default.
dicom = dcmread(file_path)

Con esta imagen cargada en memoria procederemos a ver su representación: 

In [None]:
img=dicom.pixel_array
type(img), img.shape

Con esto tenemos una idea de su tipo y tamaño, ahora procederemos a graficarla

In [None]:
# Annotation row of the sample image loaded above.
box = image_df.loc[image_df['id']=='65761e66de9f_image'].reset_index(drop=True)

# 'boxes' stores a stringified list of dicts with x / y / width / height keys;
# anything that is not a string (e.g. a missing value) means there is nothing to draw.
raw_boxes = box['boxes'].iloc[0] if len(box) else None
sample_boxes = literal_eval(raw_boxes) if isinstance(raw_boxes, str) else []

# Create figure and axes
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(img, cmap="gray")
# Draw every annotated box from the data instead of hard-coding its coordinates.
for annotation in sample_boxes:
    ax.add_patch(patches.Rectangle((annotation['x'], annotation['y']),
                                   annotation['width'], annotation['height'],
                                   linewidth=1.5, edgecolor='r', facecolor='none'))
plt.show()

Ahora que tenemos una idea de cómo es una radiografía, procederemos a compararlas entre las distintas etiquetas de diagnóstico.

In [None]:
def get_samples(num, random_state=None):
    """Pick `num` random studies per label and resolve one DICOM file for each.

    Reads the notebook-level `study_df`, `image_df` and `all_files`.

    Parameters
    ----------
    num : int
        Number of studies drawn per label. pandas raises ValueError when a
        label has fewer than `num` studies.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample``. Leave as None for a different
        draw on every call; pass an int to make the selection reproducible.

    Returns
    -------
    dict
        Maps each label to a list of dicts with the keys 'study_id',
        'dicom_file' (path of the study's first image row) and 'boxes'
        (list of box dicts, or None when the row has no box string).

    Raises
    ------
    FileNotFoundError
        If no path in `all_files` contains the image id of a sampled study.
    """
    # Long form: one row per (study id, label) pair, keeping only non-zero values.
    study_df_grp = pd.melt(study_df, id_vars=list(study_df.columns)[:1], value_vars=list(study_df.columns)[1:],
                           var_name='label', value_name='value')
    study_df_grp = study_df_grp.loc[study_df_grp['value'] != 0].reset_index(drop=True)

    study_samples = {}
    for label in study_df_grp['label'].unique():
        label_rows = study_df_grp.loc[study_df_grp['label'] == label]
        study_ids = label_rows.sample(num, random_state=random_state)['id'].tolist()

        picked = []
        for study_id in study_ids:
            # Ids look like '<uid>_<suffix>'; the part before '_' is the bare identifier.
            study_instance_id = study_id.split('_')[0]

            # Use the first image row of the study for both the file lookup and the boxes.
            first_image = image_df.loc[image_df['StudyInstanceUID'] == study_instance_id].iloc[0]
            image_id = first_image['id'].split('_')[0]

            dicom_file = next((path for path in all_files if image_id in path), None)
            if dicom_file is None:
                raise FileNotFoundError(f"no file found for image id {image_id!r}")

            # Rows without annotations do not hold a string; only strings are parsed.
            raw_boxes = first_image['boxes']
            boxes = literal_eval(raw_boxes) if isinstance(raw_boxes, str) else None

            picked.append({'study_id': study_instance_id,
                           'dicom_file': dicom_file,
                           'boxes': boxes})
        study_samples[label] = picked
    return study_samples

samples = get_samples(6)

def display_all_class_samples(samples_by_class=None):
    """Show the first sampled radiograph of every class side by side.

    Parameters
    ----------
    samples_by_class : dict, optional
        Maps a class label to a list of sample dicts with the keys
        'dicom_file' and (optionally) 'boxes', as built by `get_samples`.
        Defaults to the notebook-level `samples`.

    One panel is drawn per class, titled with the class label; boxes are
    overlaid in red when the sample has any. The input dicts are not modified.
    """
    if samples_by_class is None:
        samples_by_class = samples

    # First sample of each class; classes without samples are skipped.
    first_per_class = [(label, items[0]) for label, items in samples_by_class.items() if items]
    if not first_per_class:
        return

    n_panels = len(first_per_class)
    # 4.5 in per panel keeps the original 18 x 5 figure for four classes.
    fig1, ax1 = plt.subplots(1, n_panels, figsize=(4.5 * n_panels, 5),
                             facecolor='w', edgecolor='b', squeeze=False)
    fig1.subplots_adjust(hspace =.3, wspace=0.3)

    for (label, item), ax in zip(first_per_class, ax1.ravel()):
        dicom = dcmread(item['dicom_file'])
        ax.imshow(dicom.pixel_array, cmap="gray")
        for box in item.get('boxes') or []:
            rect = patches.Rectangle((box['x'], box['y']), box['width'], box['height'],
                                     linewidth=1.5, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
        ax.set_title('{}'.format(label),fontsize = 18)

    plt.tight_layout(pad=3.0)
    plt.subplots_adjust(top=0.91)
    plt.suptitle('Samples across all classes',fontsize = 20)
    plt.show()


In [None]:
display_all_class_samples()

Es útil poder ver las radiografías entre las diferentes clases ahora procederemos a explorar cada categoría:

In [None]:
def display_samples(samples, title, draw_boxes=False, ncols=3):
    """Plot a grid of radiographs, one panel per sample.

    Parameters
    ----------
    samples : list of dict
        Each dict needs 'dicom_file' and 'study_id'; 'boxes' (list of dicts
        with x / y / width / height, or None) is read when `draw_boxes` is set.
    title : str
        Figure title.
    draw_boxes : bool, optional
        Overlay the sample's boxes in red.
    ncols : int, optional
        Panels per row. The number of rows follows from the number of
        samples, so six samples give the 2 x 3 grid used in this notebook.
    """
    samples = list(samples)
    if not samples:
        return

    nrows = -(-len(samples) // ncols)  # ceiling division
    fig1, ax1 = plt.subplots(nrows, ncols, figsize=(6 * ncols, 6 * nrows),
                             facecolor='w', edgecolor='b', squeeze=False)
    fig1.subplots_adjust(hspace =.3, wspace=0.3)
    axs = ax1.ravel()

    for item, ax in zip(samples, axs):
        dicom = dcmread(item['dicom_file'])
        ax.imshow(dicom.pixel_array, cmap="gray")
        if draw_boxes and item['boxes'] is not None:
            for box in item['boxes']:
                rect = patches.Rectangle((box['x'], box['y']), box['width'], box['height'],
                                         linewidth=1.5, edgecolor='r', facecolor='none')
                ax.add_patch(rect)
        ax.set_title('Study : {}'.format(item['study_id']),fontsize = 18)

    # Hide the panels of the last row that have no sample.
    for ax in axs[len(samples):]:
        ax.set_visible(False)

    plt.tight_layout(pad=3.0)
    plt.subplots_adjust(top=0.91)
    plt.suptitle(title,fontsize = 20)
    plt.show()

**Negativo neumonía**

In [None]:
display_samples(samples['Negative for Pneumonia'],'Negative for Pneumonia')

In [None]:
# NOTE(review): display_histogram is not defined in any cell visible in this notebook. Unless it is
# defined elsewhere, this call (and the later display_histogram calls) raises NameError on a fresh kernel.
display_histogram(samples['Negative for Pneumonia'],'Negative for Pneumonia')

**Aspecto típico**

In [None]:
display_samples(samples['Typical Appearance'],'Typical Appearance', draw_boxes=True)

In [None]:
display_histogram(samples['Typical Appearance'],'Typical Appearance')

**Apariencia indeterminada**

In [None]:
display_samples(samples['Indeterminate Appearance'],'Indeterminate Appearance', draw_boxes=True)

In [None]:
display_histogram(samples['Indeterminate Appearance'],'Indeterminate Appearance')

**Apariencia atípica**

In [None]:
display_samples(samples['Atypical Appearance'],'Atypical Appearance', draw_boxes=True)

In [None]:
display_histogram(samples['Atypical Appearance'],'Atypical Appearance')

Con esto tenemos una idea de cómo son las imágenes DICOM, pero aún no las hemos aprovechado al 100 %, pues también incluyen un espacio para metadatos. Procederemos a explorar este espacio:

def get_files(file_format):
    files=[]
    train_files = []
    for file in all_files:
        if file_format in file:
            files.append(file)
    return files
train_files = get_files('/train/')


In [None]:
def get_files(file_format, candidates=None):
    """Return the paths that contain `file_format` as a substring.

    Parameters
    ----------
    file_format : str
        Text to look for anywhere in the path (e.g. '/train/').
    candidates : iterable of str, optional
        Paths to filter. Defaults to the notebook-level `all_files`.

    Returns
    -------
    list of str
        Matching paths, in their original order.
    """
    if candidates is None:
        candidates = all_files
    return [path for path in candidates if file_format in path]
train_files = get_files('/train/')


In [None]:
# NOTE(review): neither extract_metadata nor columns is defined in any cell visible in this notebook.
# Unless they are defined elsewhere, this line raises NameError on a fresh kernel.
train_df = extract_metadata(columns, train_files)

In [None]:


# Store the image dimensions as integers before exporting and plotting them.
train_df[['Rows', 'Columns']] = train_df[['Rows', 'Columns']].astype(int)
# index=False is the explicit way to leave the row index out of the CSV.
train_df.to_csv('train_imgs_meta.csv', index=False)



Metadata de las imágenes de entrenamiento

In [None]:
train_df

Exploraremos aún más esta data :

In [None]:
# Name both output columns explicitly: what value_counts().reset_index() calls them
# differs between pandas versions, so the gradient always targets the 'count' column.
(
    train_df['PatientSex']
    .value_counts()
    .rename_axis('PatientSex')
    .reset_index(name='count')
    .style.background_gradient(subset=['count'], cmap='winter_r')
)

In [None]:
# Name both output columns explicitly: what value_counts().reset_index() calls them
# differs between pandas versions, so the gradient always targets the 'count' column.
(
    train_df['BodyPartExamined']
    .value_counts()
    .rename_axis('BodyPartExamined')
    .reset_index(name='count')
    .style.background_gradient(subset=['count'], cmap='nipy_spectral_r')
)

In [None]:
train_df['BitsStored'] = train_df['BitsStored'].astype(int)
def combine_image_size(row):
    """Return the row's 'Rows' and 'Columns' values joined as the text 'Rows,Columns'."""
    n_rows, n_cols = row['Rows'], row['Columns']
    return ','.join([str(n_rows), str(n_cols)])
train_df['ImageSize'] = train_df.apply(lambda x: combine_image_size(x), axis=1)

In [None]:
# One marker per training image: x is the 'Rows' value, y is the 'Columns' value.
size_trace = go.Scattergl(
    x=train_df['Rows'],
    y=train_df['Columns'],
    name='Image Size',
    mode='markers',
    marker={'color': '#0092CC'},
)
fig = go.Figure(size_trace)

# Label both axes, drop the grid lines and the legend.
fig.update_layout(
    xaxis=dict(title='Rows', showgrid=False),
    yaxis=dict(showgrid=False, title='Columns'),
    showlegend=False,
    title='Train - image size',
)
fig.update_traces(textfont_size=16)
fig.show()