In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)


In [2]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Dense,Conv2D,MaxPooling2D,Flatten,Dropout,Flatten
from keras.optimizers import Adam
from keras.losses import binary_crossentropy
import os 
import matplotlib.pyplot as plt
from PIL import Image
import seaborn as sns

from keras.preprocessing.image import ImageDataGenerator

In [3]:
# Load the two CSV tables shipped with the dataset: the summary and the per-image metadata.
summary_csv = '/kaggle/input/coronahack-chest-xraydataset/Chest_xray_Corona_dataset_Summary.csv'
metadata_csv = '/kaggle/input/coronahack-chest-xraydataset/Chest_xray_Corona_Metadata.csv'

summary = pd.read_csv(summary_csv)
metadata = pd.read_csv(metadata_csv)

In [4]:
# Display the table loaded from the dataset summary CSV.
summary


In [5]:
# Display the table loaded from the metadata CSV.
metadata

In [6]:
# Keep only the metadata rows whose Dataset_type is 'TRAIN'.
is_train_row = metadata['Dataset_type'] == 'TRAIN'
train_data = metadata.loc[is_train_row]

In [7]:
# (rows, columns) of the training subset.
train_data.shape

In [8]:
# Preview the first rows of the training subset.
train_data.head()

In [9]:
# Number of missing values in each column of the training subset.
train_data.isna().sum()

In [10]:
# Frequency of each value in the Label_1_Virus_category column (missing values are not counted).
train_data['Label_1_Virus_category'].value_counts()

In [11]:
# Replace every missing value with the placeholder string 'n/a'.
train_data = train_data.fillna(value='n/a')

In [12]:
# Draw one count plot per label column of the training subset.
targets = ['Label', 'Label_2_Virus_category', 'Label_1_Virus_category']
fig, ax = plt.subplots(2, 2, figsize=(20, 10))
panels = ax.ravel()  # row-major order: [0, 0], [0, 1], [1, 0], [1, 1]

for column, panel in zip(targets, panels):
    sns.countplot(x=column, data=train_data, ax=panel)

# The grid has more panels than label columns; hide the leftovers so no
# empty axes frame is drawn.
for unused_panel in panels[len(targets):]:
    unused_panel.set_visible(False)

plt.show()

In [13]:
# Directory used below both to list sample images and as the image root for the data generators.
TRAIN_FOLDER = '/kaggle/input/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train'

In [14]:
# os.walk yields the top directory first, so take just that entry instead of
# materialising the walk over the whole tree only to read element 0.
_, _, top_level_files = next(os.walk(TRAIN_FOLDER))

# Full paths of the first eight files found directly inside TRAIN_FOLDER.
sample_train_images = [os.path.join(TRAIN_FOLDER, name) for name in top_level_files[:8]]

In [15]:
# Show the sample image paths collected above.
sample_train_images 

In [16]:
# Show the sample images in a 4x2 grid.
plt.figure(figsize=(20, 20))

for iterator, filename in enumerate(sample_train_images):
    plt.subplot(4, 2, iterator + 1)
    plt.axis('off')
    # The context manager closes the file handle once the pixels have been
    # handed to matplotlib, so no descriptors stay open after the loop.
    with Image.open(filename) as image:
        # cmap only affects single-channel images; RGB images ignore it.
        plt.imshow(image, cmap='gray')

plt.tight_layout()


In [17]:
# Select rows labelled 'Normal' plus rows labelled 'Pnemonia' whose second-level
# virus category is 'COVID-19' (the label spelling follows the metadata file).
is_normal = train_data['Label'] == 'Normal'
is_pneumonia = train_data['Label'] == 'Pnemonia'
is_covid = train_data['Label_2_Virus_category'] == 'COVID-19'

balanced_data = train_data[is_normal | (is_pneumonia & is_covid)]

In [18]:
# Binary target: 'negative' for Normal rows, 'positive' for everything else.
# assign() returns a new frame, so the column is not written into a filtered
# slice of train_data (which raises SettingWithCopyWarning).
balanced_data = balanced_data.assign(
    target=['negative' if holder == 'Normal' else 'positive' for holder in balanced_data['Label']]
)

In [19]:
from sklearn.utils import shuffle

# Split the selected rows by class.
balanced_data_subset_normal = balanced_data[balanced_data['target'] == 'negative']
balanced_data_subset_covid = balanced_data[balanced_data['target'] == 'positive']

# Keep one fifth of the negative rows. random_state pins the draw so the
# subset (and everything derived from it) is the same on every run.
balanced_data_frac_normal = balanced_data_subset_normal.sample(frac=(1/5), random_state=0)

# Recombine both classes and shuffle them with a fixed seed.
balanced_data_concat = pd.concat([balanced_data_frac_normal, balanced_data_subset_covid], axis=0)
balanced_data_concat = shuffle(balanced_data_concat, random_state=0)

# First 240 shuffled rows for training, the remainder for validation.
balanced_data_train = balanced_data_concat[:240]
balanced_data_validation = balanced_data_concat[240:]


In [20]:
# Rebuild the labelled frame from `metadata` (same TRAIN filter and 'n/a' fill
# as the earlier cells) so that re-running this cell always starts from the
# full training split rather than from an already-truncated `train_data`.
labelled_data = metadata[metadata['Dataset_type'] == 'TRAIN'].fillna('n/a')

# Keep Normal rows and COVID-19 pneumonia rows only ('Pnemonia' is the
# spelling used in the metadata file).
keep_rows = (labelled_data['Label'] == 'Normal') | (
    (labelled_data['Label'] == 'Pnemonia') & (labelled_data['Label_2_Virus_category'] == 'COVID-19')
)
# .copy() gives an independent frame, so adding a column does not raise
# SettingWithCopyWarning.
labelled_data = labelled_data[keep_rows].copy()
labelled_data['target'] = ['negative' if holder == 'Normal' else 'positive' for holder in labelled_data['Label']]

labelled_data = shuffle(labelled_data, random_state=1)

# Disjoint split: the first 1000 shuffled rows train the model, the rest
# validate it. Previously `train_data` kept every row, so all validation
# images were also used for training.
train_data = labelled_data.iloc[:1000, :]
validation_data = labelled_data.iloc[1000:, :]

In [21]:
# Training images: rescale to [0, 1] and apply random augmentation.
# featurewise_center / featurewise_std_normalization are intentionally not
# set: they require calling .fit() on sample data first, and without it Keras
# only warns and skips the normalisation.
train_image_generator = ImageDataGenerator(
    rescale=1./255,
    rotation_range=90,
    width_shift_range=0.15,
    height_shift_range=0.15,
    horizontal_flip=True,
    zoom_range=[0.9, 1.25],
    brightness_range=[0.5, 1.5]
)

# Validation images: rescale only, so validation metrics are measured on
# unmodified images rather than on randomly augmented ones.
validation_image_generator = ImageDataGenerator(rescale=1./255)

# Arguments shared by both flows.
flow_options = dict(
    directory=TRAIN_FOLDER,
    x_col='X_ray_image_name',
    y_col='target',
    target_size=(224, 224),
    batch_size=8,
    class_mode='binary'
)

train_generator = train_image_generator.flow_from_dataframe(
    dataframe=train_data,
    seed=2020,
    shuffle=True,
    **flow_options
)

# No shuffling needed for evaluation; a fixed order keeps it deterministic.
validation_generator = validation_image_generator.flow_from_dataframe(
    dataframe=validation_data,
    shuffle=False,
    **flow_options
)


In [22]:
def schedule(epoch):
    """Return the learning rate to use for the given epoch index.

    Epochs below 5 use a constant rate of 0.0001. From epoch 5 onwards the
    rate is 0.0001 * exp(0.5 * (5 - epoch)), and a notice is printed each
    time that branch is taken.

    Args:
        epoch: Epoch index.

    Returns:
        The learning rate for that epoch.
    """
    base_rate = 0.0001
    if epoch >= 5:
        # The printed message is Russian for "reducing the learning rate value".
        print('уменьшаем значение learning rate')
        return base_rate * np.exp(0.5 * (5 - epoch))
    return base_rate

In [23]:
from keras.callbacks import LearningRateScheduler
# Explicit names instead of a star import, which would pull every public name
# of keras.metrics into the notebook namespace and shadow earlier imports.
from keras.metrics import (
    BinaryAccuracy,
    FalseNegatives,
    FalsePositives,
    Precision,
    Recall,
    TrueNegatives,
    TruePositives,
)

# Callback that asks `schedule` for the learning rate of each epoch.
custom_callback = LearningRateScheduler(schedule)

# Metrics tracked during training; the names are the keys used in the
# training history (validation entries get a 'val_' prefix).
METRICS = [
      TruePositives(name='tp'),
      FalsePositives(name='fp'),
      TrueNegatives(name='tn'),
      FalseNegatives(name='fn'), 
      BinaryAccuracy(name='accuracy'),
      Precision(name='precision'),
      Recall(name='recall'),
]

In [24]:
# Small CNN for binary classification of 224x224 RGB inputs.
model = Sequential()

# Convolutional feature extractor: three 3x3 conv layers, with 3x3 max
# pooling after the first two.
model.add(Conv2D(filters=64, kernel_size=(3, 3), input_shape=(224, 224, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(Conv2D(filters=32, kernel_size=(3, 3), activation='relu'))

# Classifier head: two dense layers with dropout, then one sigmoid unit.
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(units=16, activation='relu'))
model.add(Dropout(rate=0.2))
model.add(Dense(units=1, activation='sigmoid'))

In [25]:
# Compile with a default-configured Adam optimizer, binary cross-entropy loss
# and the metric list defined above.
optimizer = Adam()
model.compile(
    loss=binary_crossentropy,
    optimizer=optimizer,
    metrics=METRICS,
)

In [None]:
# Train for 10 epochs. Model.fit accepts data generators directly;
# fit_generator is deprecated and no longer exists in newer Keras releases.
history = model.fit(
    train_generator,
    validation_data=validation_generator,
    epochs=10,
    callbacks=[custom_callback],
)

In [None]:
# Plot loss and accuracy curves for training and validation.
# The x-axis is derived from the recorded history, so the plot stays correct
# if the number of epochs is changed.
epochs_run = np.arange(1, len(history.history['loss']) + 1)

# (history key, panel title) in row-major panel order.
curves = [
    ('loss', 'Training Loss vs Epochs'),
    ('accuracy', 'Training accuracy vs Epochs'),
    ('val_loss', 'Validation Loss vs Epochs'),
    ('val_accuracy', 'Validation accuracy vs Epochs'),
]

fig, ax = plt.subplots(2, 2, figsize=(10, 10))
for panel, (history_key, title) in zip(ax.ravel(), curves):
    sns.lineplot(x=epochs_run, y=history.history[history_key], ax=panel)
    panel.set_title(title)
    panel.set_xlabel('Epoch')

fig.suptitle('CNN model', size=16)
plt.show()