# Capstone 3: Corona Chest X-ray Image Processing

## 0. Libraries

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split # splitting train and validation data

# For visualizing images
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.image as mpimg
import random
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# For augmenting data
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# For modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Flatten, Conv2D, MaxPool2D, Activation
from tensorflow.keras import Sequential, layers
import tensorflow_hub as hub
from sklearn.model_selection import StratifiedKFold

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

import os

## 1. Data Wrangling

- 1.1. Data Loading
- 1.2. Scale thresholding
- 1.3. Applying filters
- 1.4. Transformations
- 1.5. Segmentation

### 1.1 Data Loading

In [8]:
# Load the per-image metadata table and the dataset summary table.
metadata = pd.read_csv('../Data/Chest_xray_Corona_Metadata.csv')
metadata_summary = pd.read_csv('../Data/Chest_xray_Corona_dataset_Summary.csv')

# Split on the Dataset_type column. `.copy()` makes each split an independent
# frame, so assigning columns on it later does not raise SettingWithCopyWarning.
train_data = metadata[metadata['Dataset_type'] == 'TRAIN'].copy()
test_data = metadata[metadata['Dataset_type'] == 'TEST'].copy()

# Image folders for each split; both live under the same dataset root.
dataset_root = '../Data/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset'
train_directory = f'{dataset_root}/train'
test_directory = f'{dataset_root}/test'

In [10]:
# Peek at the last five rows of the metadata table.
metadata.tail(n=5)

Unnamed: 0.1,Unnamed: 0,X_ray_image_name,Label,Dataset_type,Label_2_Virus_category,Label_1_Virus_category
5905,5928,person1637_virus_2834.jpeg,Pnemonia,TEST,,Virus
5906,5929,person1635_virus_2831.jpeg,Pnemonia,TEST,,Virus
5907,5930,person1634_virus_2830.jpeg,Pnemonia,TEST,,Virus
5908,5931,person1633_virus_2829.jpeg,Pnemonia,TEST,,Virus
5909,5932,person1632_virus_2827.jpeg,Pnemonia,TEST,,Virus


In [12]:
# Column overview: dtypes, non-null counts, and memory usage.
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5910 entries, 0 to 5909
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              5910 non-null   int64 
 1   X_ray_image_name        5910 non-null   object
 2   Label                   5910 non-null   object
 3   Dataset_type            5910 non-null   object
 4   Label_2_Virus_category  69 non-null     object
 5   Label_1_Virus_category  4334 non-null   object
dtypes: int64(1), object(5)
memory usage: 277.2+ KB


In [14]:
# Summarise missing values per column: absolute count, share of rows, and dtype.
print("List of null values for each column: \n")

# Count the nulls once and reuse the result for every table below.
null_counts = metadata.isna().sum().sort_values(ascending=False)
null_percent = (null_counts / len(metadata)) * 100

# Top-25 view, kept under its original names in case a later cell reads them.
Missing_count = null_counts[:25]
missing = pd.concat(
    [Missing_count, (Missing_count / len(metadata)) * 100],
    axis=1,
    keys=["count", "%"],
)

# Full table: count, percentage and dtype, aligned on the column name.
missing_type = pd.concat(
    [null_counts, null_percent, metadata.dtypes],
    axis=1,
    keys=["count", "%", "types"],
)
print(missing_type.sort_values(by=["count"], ascending=False))


metadata.isnull().sum()

List of null values for each column: 

                        count          %   types
Label_2_Virus_category   5841  98.832487  object
Label_1_Virus_category   1576  26.666667  object
Unnamed: 0                  0   0.000000   int64
X_ray_image_name            0   0.000000  object
Label                       0   0.000000  object
Dataset_type                0   0.000000  object


Unnamed: 0                   0
X_ray_image_name             0
Label                        0
Dataset_type                 0
Label_2_Virus_category    5841
Label_1_Virus_category    1576
dtype: int64

In [15]:
# Replace every missing value with the string 'unknown'. The name is rebound
# (rather than using inplace=True) so the cell does not mutate a frame that
# earlier cells have already displayed.
# NOTE(review): train_data / test_data were sliced from metadata in an earlier
# cell, before this fill, so they still contain NaN -- confirm whether they
# should be re-derived from the filled frame.
metadata = metadata.fillna('unknown')
metadata.isnull().sum()

Unnamed: 0                0
X_ray_image_name          0
Label                     0
Dataset_type              0
Label_2_Virus_category    0
Label_1_Virus_category    0
dtype: int64

In [16]:
# Class balance of the two virus-category label columns.
for label_column in ('Label_1_Virus_category', 'Label_2_Virus_category'):
    print(metadata[label_column].value_counts())

bacteria          2777
unknown           1576
Virus             1555
Stress-Smoking       2
Name: Label_1_Virus_category, dtype: int64
unknown          5841
COVID-19           58
Streptococcus       5
SARS                4
ARDS                2
Name: Label_2_Virus_category, dtype: int64


#### We need to add more images to boost model performance, because only 58 images in the dataset are labelled COVID-19.

In [17]:
# Report (rows, columns) for each split; one print call, one line per split.
print(
    f"Shape of train data: {train_data.shape}",
    f"Shape of test data: {test_data.shape}",
    sep="\n",
)

Shape of train data: (5286, 6)
Shape of test data: (624, 6)
