In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree. The per-file print below is commented out, so the
# loop produces no output; its only remaining effect is to leave `dirname`,
# `filenames` and `filename` bound to the last entries visited (if any).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Listing disabled; uncomment to print every input file path.
        # print(os.path.join(dirname, filename))
        pass

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import warnings
import numpy as np 
import random
import pandas as pd 
import missingno as msno
from collections import Counter
import glob
from tqdm.notebook import tqdm
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import os
import pydicom
from pydicom.pixel_data_handlers.util import apply_voi_lut
import cv2

# from skimage import exposure

In [None]:
def seed_everything(seed=0):
    """Seed the global Python and NumPy RNGs and export seed-related env vars.

    Parameters
    ----------
    seed : int, default 0
        Passed to ``random.seed`` and ``np.random.seed``; its string form is
        stored in the ``PYTHONHASHSEED`` environment variable.

    Notes
    -----
    ``TF_DETERMINISTIC_OPS`` is always set to ``'1'``, regardless of ``seed``.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })


# Seed once for the whole notebook.
seed = 2021
seed_everything(seed)

# Notebook-wide preferences: silence all warnings and widen text columns.
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 150)

In [None]:
# --- Colour palette shared by every figure in the notebook ---
# Blues: main accent, then a lighter and a darker variant.
primary_blue, primary_blue2, primary_blue3 = "#496595", "#85a1c1", "#3f4d63"
# Neutrals: grey, near-black and the background tone.
primary_grey, primary_black, primary_bgcolor = "#c6ccd8", "#202022", "#f4f0ea"

# Third colour of Plotly's `Plotly` qualitative sequence.
primary_green = px.colors.qualitative.Plotly[2]

# Discrete colour sequence taken from Plotly's `G10` qualitative palette.
plotly_discrete_sequence = px.colors.qualitative.G10

# --- Matplotlib defaults ---
plt.rcParams.update({
    'figure.dpi': 120,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'font.family': 'serif',
    'axes.facecolor': primary_bgcolor,
})

# Preview the custom palette as a row of swatches.
colors = [
    primary_blue,
    primary_blue2,
    primary_blue3,
    primary_grey,
    primary_black,
    primary_bgcolor,
    primary_green,
]
sns.palplot(sns.color_palette(colors))

In [None]:
# Preview the Plotly discrete colour sequence as a row of swatches.
discrete_palette = sns.color_palette(plotly_discrete_sequence)
sns.palplot(discrete_palette)

# <p style="background-color:skyblue; font-family:newtimeroman; font-size:250%; text-align:center; border-radius: 15px 50px;">🦠SIIM Covid-19🦟 EDA and Visualization 📊</p>

This is an **object detection** and **classification problem**, meaning that for each instance we'll have to predict a bounding box and a class. It looks like a multi-label problem because there are 4 label columns per study, but as they are mutually exclusive, the challenge is actually a multi-class problem.

 <div class="alert alert-success" role="alert">
    <p>💡 <b>Competition Goal</b>: Categorize chest radiographs as negative for pneumonia, typical, indeterminate, or atypical for COVID-19. If some abnormalities are found, provide the bounding boxes. </p>
</div>

We can see that we have:
* `train_study_level.csv` - the train study-level metadata, with one row for each study, including correct labels.
* `train_image_level.csv` - the train image-level metadata, with one row for each image, including both correct labels and any bounding boxes in a dictionary format. Some images in both test and train have multiple bounding boxes.
* `sample_submission.csv` - a sample submission file containing all image- and study-level IDs.
* train folder - comprises chest scans in DICOM format, stored in paths following the schema: `study/series/image`
* test folder - The hidden test dataset is of roughly the same scale as the training dataset.

In [None]:
# Remote SVG passed as the `source` of the layout images added to the Plotly
# figures further down in this notebook.
virus_url = 'https://upload.wikimedia.org/wikipedia/commons/2/21/Virus_gray_black.svg'

In [None]:
# Load the study-level and image-level training metadata.
train_study_df = pd.read_csv("../input/siim-covid19-detection/train_study_level.csv")
train_image_df = pd.read_csv("../input/siim-covid19-detection/train_image_level.csv")

# Report both shapes and how many image rows have no `boxes` entry.
summary_lines = [
    f"Train Study Shape: {train_study_df.shape} ",
    f"Train Image Shape: {train_image_df.shape} ",
    "",
    f"Note: There are {train_image_df['boxes'].isna().sum()} missing values in train_image.",
]
print("\n".join(summary_lines))

In [None]:
# Peek at the first five study-level rows.
train_study_df.head(n=5)

In [None]:
# Peek at the first five image-level rows.
train_image_df.head(n=5)

<a id='1'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">1. First overview and EDA 📊</p>

The bounding box labels are provided in the `label` column. The format is as follows:

`[class ID] [confidence score] [bounding box]`

* `class ID` - either `opacity` or `none`.
* `confidence score` - confidence from your neural network model. If the class ID is `none`, the confidence is 1.
* `bounding box` - typical `x0 y0 x1 y1` format. If the class ID is `none`, the bounding box is `0 0 1 1`, so the full label reads `none 1 0 0 1 1`.

The bounding boxes are also provided in an easily readable dictionary format in the `boxes` column, and the study that each image is a part of is provided in the `StudyInstanceUID` column.

Let's take a look at the class distribution.

In [None]:
# The class ID is the first space-separated token of each label string
# (e.g. "opacity" or "none"). The vectorised `.str` accessor replaces
# `apply(lambda x: x.split(' ')[0])` so that a missing label propagates as NaN
# instead of raising AttributeError; `n=1` stops splitting after the first
# separator since only the leading token is needed.
train_image_df['class'] = train_image_df['label'].str.split(' ', n=1).str[0]

In [None]:
# Count images per class. The column names are pinned explicitly because a bare
# `value_counts().reset_index()` yields ('index', 'class') on pandas < 2.0 but
# ('class', 'count') on pandas >= 2.0, so looking up an 'index' column raises
# KeyError on newer pandas.
plot_df = (
    train_image_df['class']
    .value_counts()
    .rename_axis('class')
    .reset_index(name='count')
)

# Horizontal bars: counts on the x axis, class names on the y axis.
fig = go.Figure(go.Bar(
    x=plot_df['count'],
    y=plot_df['class'],
    orientation='h',
    marker_color=[primary_blue, primary_grey],
    marker_line_color=primary_black,
    marker_line_width=1.5,
    opacity=0.8,
))
fig.update_layout(
    title='<span style="font-size:32px; font-family:Serif"><b>Class distribution</b></span>',
    yaxis_title='<b>Class</b>',
    xaxis_title='<b>Count</b>',
    legend_title="Group",
    font=dict(
        family="Times New Roman",
        size=14,
    )
)
# Decorative image: x is in data units (a count), y is in paper coordinates.
fig.add_layout_image(
    dict(
        source=virus_url,
        xref="x", yref="paper",
        x=4000, y=0.1,
        sizex=400, sizey=0.25,
        xanchor="center", yanchor="bottom",
        sizing='stretch',
    ),
)
fig.show()

Let's now analyze the `train_study_df` labels.

In [None]:
# Every column except the study id is treated as a label column.
labels = train_study_df.drop(columns='id').columns

# Derive the grid from the number of labels instead of hard-coding 2x2
# (four labels still give the original 2x2 layout).
n_cols = 2
n_rows = max(1, -(-len(labels) // n_cols))  # ceiling division

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=list(labels),
)

for i, label in enumerate(labels):
    # Pin the column names explicitly: a bare `value_counts().reset_index()`
    # names its columns differently on pandas < 2.0 and >= 2.0, and the old
    # 'index' column lookup raises KeyError on newer pandas.
    plot_df = (
        train_study_df[label]
        .value_counts()
        .rename_axis('value')
        .reset_index(name='count')
    )
    row, col = i // n_cols + 1, i % n_cols + 1

    # Horizontal bars: counts on the x axis, label value on the y axis.
    fig.add_trace(go.Bar(
        x=plot_df['count'],
        y=plot_df['value'],
        orientation='h',
        marker_color=[primary_grey, primary_blue],
        marker_line_color=primary_black,
        marker_line_width=1.5,
        opacity=0.8,
        name=label
    ), row=row, col=col)

    # Number of rows where the label equals 1. `.sum()` gives 0 when there are
    # none, where the previous `.values[0]` lookup raised IndexError.
    positive_count = int(plot_df.loc[plot_df['value'] == 1, 'count'].sum())

    # Decorative image placed at 70% of the positive count along the x axis.
    fig.add_layout_image(
        dict(
            source=virus_url,
            xref="x", yref="y domain",
            x=positive_count * 0.7, y=0.6,
            sizex=320, sizey=0.24,
            xanchor="center", yanchor="bottom",
            sizing='stretch',
        ), row=row, col=col,
    )

fig.update_layout(
    title='<span style="font-size:32px; font-family:Serif"><b>Label distribution</b> in study data</span>',
    showlegend=False,
)
fig.show()

<a id='2'></a>
# <p style="background-color:skyblue; font-family:newtimeroman; font-size:150%; text-align:center; border-radius: 15px 50px;">2. Let's visualize the images</p>

In [None]:
# Root folder of the competition data.
dataset_path = Path('../input/siim-covid19-detection')
# Display its immediate children (files and sub-folders).
[entry for entry in dataset_path.iterdir()]

To be continued...