In [None]:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from tqdm import tqdm
import pandas as pd

import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D


In [None]:
from glob import glob

# Per-image metadata table; the 'Finding Labels' column is read by later cells.
df = pd.read_csv(filepath_or_buffer="/kaggle/input/data/Data_Entry_2017.csv")

# Every PNG found under the images*/images/ folders (path is relative to the
# notebook's working directory).
rawImagePaths = glob('../input/data/images*/images/*.png')

In [None]:
from itertools import chain

# Fixed list of label names; one indicator column is created per entry.
all_labels = ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening',
 'Pneumonia', 'Pneumothorax','No Finding']

# 'Finding Labels' holds one or more names joined by '|'. Split once and test
# membership against the resulting tokens so that a label can only match a
# whole name, never a substring of a different name.
findings_per_image = df['Finding Labels'].str.split('|')

# One float column per label: 1.0 if the label is listed for the image, else 0.0.
for c_label in all_labels:
    df[c_label] = findings_per_image.map(
        lambda findings, label=c_label: 1.0 if label in findings else 0.0)

In [None]:
# First ten rows, including the per-label indicator columns.
df.head(10)

In [None]:
# Image folders, then every PNG inside them, as one globally sorted list.
folders = sorted(glob('../input/data/images*/images/'))
imagenames_list = sorted(
    png_path
    for folder in folders
    for png_path in glob(folder+'/*.png')
)

In [None]:
# Number of image files found on disk.
len(imagenames_list)

In [None]:
# Attach each image's file path by matching on file name instead of relying on
# the sorted file list happening to be in the same order as the CSV rows.
# 'Image Index' is the file-name column of Data_Entry_2017.csv.
path_by_filename = {os.path.basename(p): p for p in imagenames_list}
df['path'] = df['Image Index'].map(path_by_filename)

# Fail loudly if any metadata row has no image on disk; a NaN path would
# otherwise only surface later inside the data generators.
n_missing = int(df['path'].isna().sum())
assert n_missing == 0, '%d metadata rows have no matching image file' % n_missing

In [None]:
# Quick look at the frame now that the `path` column has been added.
df.head()

In [None]:
import random

# Reproducible 40,000-row subsample of the metadata.
df1 = df.sample(n=40000, random_state=42)

# Drop up to 18,000 of the sampled rows labelled exactly 'No Finding'
# (the first ones in sample order) to thin out that class.
no_finding_rows = df1['Finding Labels'] == 'No Finding'
remove_index = df1.index[no_finding_rows].tolist()[:18000]
df2 = df1.drop(index=remove_index)

In [None]:
# Bar chart of the 18 most frequent 'Finding Labels' strings in df2.
label_counts = df2['Finding Labels'].value_counts()[:18]
bar_centres = np.arange(len(label_counts))+0.5

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(bar_centres, label_counts)
ax1.set_xticks(bar_centres)
_ = ax1.set_xticklabels(label_counts.index, rotation=90)

In [None]:
import pickle
# NOTE(review): neither `pickle` nor `filename` is used by any other cell
# visible in this notebook; presumably left over from caching the data -- confirm
# before relying on it.
filename = 'xraydata'

In [None]:
from sklearn.model_selection import train_test_split

# Reproducible 80/20 split of the balanced sample. The held-out 20% is named
# test_df and is the set reported as 'validation' in the printout.
train_df, test_df = train_test_split(df2, test_size=0.20, random_state=2018)
print('train', len(train_df), 'validation', len(test_df))

In [None]:
# Check what kind of object the split returned for the training portion.
type(train_df)

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Size passed as `target_size` to the generators.
IMG_SIZE = (128, 128)

# Per-image standardisation plus mild geometric augmentation.
augmentation_options = dict(
    samplewise_center=True,
    samplewise_std_normalization=True,
    horizontal_flip=True,
    vertical_flip=False,
    height_shift_range=0.05,
    width_shift_range=0.1,
    rotation_range=5,
    shear_range=0.1,
    fill_mode='reflect',
    zoom_range=0.15,
)
datagen = ImageDataGenerator(**augmentation_options)

In [None]:
# Augmented training batches read straight from the file paths in `path`, with
# the indicator columns named in `all_labels` as a raw multi-label target.
#
# Two arguments from the earlier version are intentionally absent:
#   * `rescale` is an ImageDataGenerator option, not a flow_from_dataframe one;
#     passed here it was absorbed by **kwargs and had no effect.
#   * `subset="training"` only selects a slice when the generator was built
#     with `validation_split`; `datagen` is not, so every row is used anyway.
train_generator = datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=None,
    x_col="path",
    y_col=all_labels,
    class_mode='raw',
    color_mode='grayscale',
    target_size=IMG_SIZE,
    batch_size=256,
    shuffle=True,
    seed=42)

In [None]:
# Held-out images get the same per-image standardisation as the training
# generator (sample-wise centring and std normalisation) but no augmentation,
# so the network is evaluated on inputs scaled the same way it was trained on.
# A plain 1/255 rescale would feed it values in [0, 1] instead.
test_datagen = ImageDataGenerator(samplewise_center=True,
                                  samplewise_std_normalization=True)

# shuffle=False keeps batch order aligned with `test_generator.labels`, which
# later cells rely on when comparing predictions against ground truth.
# (`rescale` is not a flow_from_dataframe argument, so it is not passed here.)
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=None,
    x_col="path",
    y_col=all_labels,
    class_mode="raw",
    color_mode='grayscale',
    target_size=IMG_SIZE,
    batch_size=1024,
    shuffle=False,
    seed=42)



In [None]:
# Pull one batch of (images, labels) from the test generator for plotting.
first_test_batch = next(test_generator)
test_x, test_y = first_test_batch

# Label matrix for the whole test set, as held by the generator.
test_labels = test_generator.labels

In [None]:
# Dimensions of the test label array taken from the generator.
test_labels.shape

In [None]:
# 4x4 grid of images from the first test batch, each titled with the labels
# whose indicator value exceeds 0.5.
fig, m_axs = plt.subplots(4, 4, figsize=(16, 16))
for image, label_vector, axis in zip(test_x, test_y, m_axs.flatten()):
    axis.imshow(image[:, :, 0], cmap='bone', vmin=-1.5, vmax=1.5)
    present = [name for name, score in zip(all_labels, label_vector) if score > 0.5]
    axis.set_title(', '.join(present))
    axis.axis('off')

In [None]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
from keras.optimizers import adam
import tensorflow as tf

model = Sequential()
#Layer 1
model.add(Conv2D(filters=96, kernel_size=(11, 11),input_shape =(128,128,1),strides=4, kernel_regularizer=l2(0.)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(BatchNormalization())

#Layer 2
model.add(Conv2D(256, (5, 5), padding='same'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))
model.add(BatchNormalization())

# Layer 3
model.add(Conv2D(384, (3, 3), padding='same'))
model.add(Activation('relu'))

#Layer 4
model.add(Conv2D(384, (3, 3), padding='same'))
model.add(Activation('relu'))

# Layer 5
model.add(Conv2D(256, (3, 3), padding='same'))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(3, 3)))

# Layer 6
model.add(Flatten())
model.add(Dense(4096))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Layer 7
model.add(Dense(4096))
model.add(Activation('relu'))
model.add(Dropout(0.5))

# Layer 8
model.add(Dense(15))
model.add(Activation('sigmoid'))

In [None]:
model.compile(loss='binary_crossentropy',optimizer=adam(lr=0.0005, decay=1e-6),metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# history = model.fit_generator(train_generator, steps_per_epoch=len(train_generator),validation_data=test_generator,epochs=3, verbose=1)

In [None]:
# from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
# weight_path="{}_weights.best.hdf5".format('x-rays')

# checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, 
#                              save_best_only=True, mode='min', save_weights_only = True)

# early = EarlyStopping(monitor="val_loss", 
#                       mode="min", 
#                       patience=3)
# callbacks_list = [checkpoint, early]

In [None]:
# Train for 10 epochs, with len(train_generator) steps per epoch, passing
# test_generator as the validation data.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(train_generator),
    validation_data=test_generator,
    epochs=10,
    verbose=1,
)

In [None]:
# Loss and accuracy of the trained model over the test generator.
evaluation = model.evaluate_generator(test_generator, verbose=1)
loss, accuracy = evaluation
print('[INFO] accuracy: {:.2f}%'.format(accuracy * 100))

In [None]:
# Per-label sigmoid scores for every batch of the test generator.
predicted = model.predict_generator(
    test_generator,
    steps=len(test_generator),
    verbose=True,
)


In [None]:
from matplotlib import pyplot as plt
def summarize_diagnostics(history):
    """Plot train vs. test loss and accuracy curves on one two-panel figure.

    Parameters
    ----------
    history : object
        Anything exposing a ``history`` dict with the keys ``'loss'``,
        ``'val_loss'``, ``'accuracy'`` and ``'val_accuracy'``, each mapping
        to a sequence of per-epoch values.

    Returns
    -------
    None
        The figure is drawn through ``plt``; nothing is returned.

    Raises
    ------
    KeyError
        If one of the four expected keys is missing from ``history.history``.
    """
    # Create the figure and both axes in a single call so that everything --
    # including tight_layout below -- acts on the same figure, and no empty
    # extra figure is left behind.
    figure, (loss_ax, acc_ax) = plt.subplots(2, 1, figsize=(10, 10))

    # Top panel: loss.
    loss_ax.set_title('Cross Entropy Loss')
    loss_ax.plot(history.history['loss'], color='blue', label='train')
    loss_ax.plot(history.history['val_loss'], color='orange', label='test')
    loss_ax.legend()  # the labels above are only shown once a legend is drawn

    # Bottom panel: accuracy.
    acc_ax.set_title('Accuracy')
    acc_ax.plot(history.history['accuracy'], color='blue', label='train')
    acc_ax.plot(history.history['val_accuracy'], color='orange', label='test')
    acc_ax.legend()

    figure.tight_layout(pad=3.0)

In [None]:
# Plot the loss and accuracy curves recorded in `history`.
summarize_diagnostics(history)

In [None]:
# Per label: actual prevalence in the test labels (Dx) next to the mean
# predicted score (PDx), both as percentages. The loop names follow the order
# of the zipped arrays -- ground truth first, predictions second -- so each
# number is printed under the heading it belongs to.
for c_label, true_pct, pred_pct in zip(all_labels,
                                       100*np.mean(test_labels, 0),
                                       100*np.mean(predicted, 0)):
    print('%s: Dx: %2.2f%%, PDx: %2.2f%%' % (c_label, true_pct, pred_pct))

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per label on shared axes, with the AUC shown in the legend.
fig, c_ax = plt.subplots(1, 1, figsize=(9, 9))
for idx, c_label in enumerate(all_labels):
    truth_column = test_labels[:, idx].astype(int)
    score_column = predicted[:, idx]
    fpr, tpr, thresholds = roc_curve(truth_column, score_column)
    legend_text = '%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label=legend_text)

c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
fig.savefig('barely_trained_net.png')

In [None]:
# Rank the images of the first test batch by how many disease labels they
# carry, most first. 'No Finding' is itself one of the label columns, so it is
# excluded from the count; summing every column would count healthy images
# too and make the ranking meaningless. A stable sort keeps the original batch
# order among ties, so the selection is deterministic.
disease_columns = [i for i, name in enumerate(all_labels) if name != 'No Finding']
finding_counts = np.sum(test_y[:, disease_columns], axis=1)
sickest_idx = np.argsort(-finding_counts, kind='stable')

fig, m_axs = plt.subplots(4, 2, figsize = (16, 32))
for (idx, c_ax) in zip(sickest_idx, m_axs.flatten()):
    c_ax.imshow(test_x[idx, :,:,0], cmap = 'bone')
    # Indexing test_labels / predicted with a first-batch index assumes the
    # test generator is unshuffled, so row idx refers to the same image in all
    # three arrays.
    # Ground-truth labels present for this image (names cut to 6 characters).
    stat_str = [n_class[:6] for n_class, n_score in zip(all_labels,
                                                        test_labels[idx])
                if n_score>0.5]
    # Predicted score for every label that is either truly present or
    # predicted above 0.5 (names cut to 4 characters).
    pred_str = ['%s:%2.0f%%' % (n_class[:4], p_score*100)
                for n_class, n_score, p_score in zip(all_labels,
                                                     test_labels[idx],
                                                     predicted[idx])
                if (n_score>0.5) or (p_score>0.5)]
    c_ax.set_title('Dx: '+', '.join(stat_str)+'\nPDx: '+', '.join(pred_str))
    c_ax.axis('off')
fig.savefig('trained_img_predictions.png')