In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here are several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Read the entry table and build a lookup from bare image file name to the
# full path of every PNG found under the images* folders.
all_xray_df = pd.read_csv('../input/data/Data_Entry_2017.csv')
image_glob = os.path.join('..', 'input', 'data', 'images*', '*', '*.png')
all_image_paths = {os.path.basename(png_path): png_path for png_path in glob(image_glob)}
print('Scans found:', len(all_image_paths), 'Total Entries:', all_xray_df.shape[0])

# Attach the on-disk path to each row; dict.get yields None for any
# 'Image Index' that has no matching file.
all_xray_df['Path'] = all_xray_df['Image Index'].map(all_image_paths.get)

# Replace the bracket-fragment headers with plain names and drop the
# 'Unnamed: 11' column.
column_renames = {
    'OriginalImage[Width': 'OriginalImageWidth',
    'Height]': 'OriginalImageHeight',
    'OriginalImagePixelSpacing[x': 'OriginalImagePixelSpacing_X',
    'y]': 'OriginalImagePixelSpacing_y',
}
all_xray_df = all_xray_df.rename(columns=column_renames).drop(columns='Unnamed: 11')
all_xray_df.sample(4)

In [None]:
# Bar chart of the 15 most frequent raw 'Finding Labels' strings
# (value_counts orders them from most to least common).
label_count = all_xray_df['Finding Labels'].value_counts().head(15)
bar_positions = np.arange(len(label_count))

fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(bar_positions, label_count)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(label_count.index, rotation=90)
plt.show()

In [None]:
from itertools import chain

# Blank out the 'No Finding' marker so that such rows carry no label at all.
all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))

# Split each pipe-separated label string once and keep the tokens as a set,
# so membership below is an exact-token test rather than a substring test
# on the raw string (a label that is a prefix or fragment of another label
# can no longer produce a false positive).
findings_per_image = all_xray_df['Finding Labels'].map(lambda x: set(x.split('|')))

# Sorted list of every distinct non-empty label, as plain Python strings.
all_labels = sorted(set(chain.from_iterable(findings_per_image)) - {''})
print('All labels, ({}): {}'.format(len(all_labels), all_labels))

# One float indicator column per label (1.0 = finding present, 0.0 = absent).
# Every entry of all_labels gets a column, so later cells can index
# all_xray_df[label] for any label in the list.
for label in all_labels:
    all_xray_df[label] = findings_per_image.map(lambda found, wanted=label: float(wanted in found))
In [None]:
# Keep only the labels whose indicator column sums to more than MIN_CASES,
# then print each surviving label together with its case count.
MIN_CASES = 1000
case_counts = {name: all_xray_df[name].sum() for name in all_labels}
all_labels = [name for name in all_labels if case_counts[name] > MIN_CASES]
print([(name, int(case_counts[name])) for name in all_labels])

In [None]:
# Weighted subsample of the table, followed by a bar chart of the 15 most
# frequent label strings in the subsample.
SAMPLE_SIZE = 40000   # number of rows to keep (capped at the table size)
SAMPLE_SEED = 42      # fixed seed so the subsample is reproducible across runs

# Weight of a row = number of findings it lists (0 for an empty label string)
# plus a small constant, so rows without findings can still be drawn, but
# with a much lower probability than rows with findings.
sample_weights = all_xray_df['Finding Labels'].map(lambda x : len(x.split('|')) if len(x)>0 else 0).values + 4e-2
sample_weights /= sample_weights.sum()

# Sampling is without replacement, so never ask for more rows than exist.
all_xray_df = all_xray_df.sample(min(SAMPLE_SIZE, len(all_xray_df)),
                                 weights=sample_weights,
                                 random_state=SAMPLE_SEED)

label_counts = all_xray_df['Finding Labels'].value_counts()[0:15]
fig, ax1 = plt.subplots(1, 1, figsize=(12, 8))
ax1.bar(np.arange(len(label_counts)), label_counts)
ax1.set_xticks(np.arange(len(label_counts)))
ax1.set_xticklabels(label_counts.index, rotation=90)
plt.show()

In [None]:
# Store, for every row, the values of the columns named in `all_labels` as a
# single array in a new 'disease_vec' column.
# The lambda wraps the array in a one-element list and the trailing .map()
# unwraps it again -- presumably so that DataFrame.apply returns one object
# per row instead of expanding the array into separate columns (TODO confirm).
all_xray_df['disease_vec'] = all_xray_df.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for validation. Stratification uses the first
# four characters of each row's label string as the grouping key.
stratify_key = all_xray_df['Finding Labels'].str[:4]
train_df, valid_df = train_test_split(
    all_xray_df,
    test_size=0.25,
    random_state=42,
    stratify=stratify_key,
)
print('Train:', len(train_df), 'Valid:', len(valid_df))

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Target (height, width) that images are resized to when loaded.
IMG_SIZE = (128, 128)

# Per-image normalisation plus light random augmentation
# (horizontal flips, small shifts, rotation, shear and zoom).
augmentation_options = {
    'samplewise_center': True,
    'samplewise_std_normalization': True,
    'horizontal_flip': True,
    'vertical_flip': False,
    'height_shift_range': 0.05,
    'width_shift_range': 0.1,
    'rotation_range': 5,
    'shear_range': 0.1,
    'fill_mode': 'reflect',
    'zoom_range': 0.15,
}
idg = ImageDataGenerator(**augmentation_options)

In [None]:
def _with_label_lists(frame):
    """Return a copy of `frame` with a 'newLabel' column: the row's findings as a list."""
    # assign() builds a new frame, so nothing is written onto the slice that
    # train_test_split returned (avoids SettingWithCopyWarning).
    return frame.assign(newLabel=frame['Finding Labels'].map(lambda labels: labels.split('|')))


def _flow(generator, frame, batch_size):
    """Create a multi-label, grayscale batch iterator over `frame` using `generator`."""
    # NOTE(review): rows whose 'newLabel' list contains none of `all_labels`
    # (e.g. images without findings) are presumably filtered out by
    # flow_from_dataframe when `classes` is given -- confirm against the
    # "Found N validated image filenames" message.
    return generator.flow_from_dataframe(dataframe=frame,
                                         directory=None,
                                         x_col='Path',
                                         y_col='newLabel',
                                         class_mode='categorical',
                                         classes=all_labels,
                                         target_size=IMG_SIZE,
                                         color_mode='grayscale',
                                         batch_size=batch_size)


valid_df = _with_label_lists(valid_df)
train_df = _with_label_lists(train_df)

# Evaluation data must not be randomly flipped/shifted/zoomed: use a generator
# that applies only the per-image normalisation that `idg` also applies.
# Keep these two normalisation flags in sync with `idg`.
eval_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)

# Augmented training batches.
train_gen = _flow(idg, train_df, 32)

# Un-augmented validation batches.
valid_gen = _flow(eval_idg, valid_df, 256)

# One fixed, un-augmented batch of up to 1024 validation images, used as the
# in-memory evaluation set.
test_X, test_Y = next(_flow(eval_idg, valid_df, 1024))

In [None]:
# Pull one training batch and preview its first 16 images in a 4x4 grid,
# titling each with the labels whose target value exceeds 0.5.
t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize=(16, 16))
for image, targets, axis in zip(t_x, t_y, m_axs.ravel()):
    present = [name for name, score in zip(all_labels, targets) if score > 0.5]
    # Channel 0 is the only one displayed.
    axis.imshow(image[:, :, 0], cmap='bone', vmin=-1.5, vmax=1.5)
    axis.set_title(', '.join(present))
    axis.axis('off')

In [None]:
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.layers import GlobalAveragePooling2D, Dropout, Flatten, Dense
from tensorflow.keras.models import Sequential

# MobileNet backbone without its classification head and without pretrained
# weights; the input shape is taken from the sample training batch.
base = MobileNet(input_shape=t_x.shape[1:], include_top=False, weights=None)

# Head: global pooling -> dropout -> 512-unit dense -> dropout -> one sigmoid
# output per label.
# NOTE(review): Dense(512) is created without an activation argument, so it
# is a purely linear layer -- confirm this is intended.
model = Sequential([
    base,
    GlobalAveragePooling2D(),
    Dropout(0.5),
    Dense(512),
    Dropout(0.5),
    Dense(len(all_labels), activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy', 'mae'],
)

model.summary()

In [None]:
# Train for 5 epochs of 100 generator batches each, validating after every
# epoch on the fixed in-memory batch (test_X, test_Y).
# Model.fit accepts generators directly; Model.fit_generator is deprecated
# and no longer available in recent Keras releases.
model.fit(train_gen,
          steps_per_epoch=100,
          validation_data=(test_X, test_Y),
          epochs=5)

In [None]:
# Predicted per-label scores for the held-out batch, computed 32 images at a
# time with the progress display enabled.
pred_Y = model.predict(
    test_X,
    batch_size=32,
    verbose=True,
)

In [None]:
from sklearn.metrics import roc_curve, auc

# One ROC curve per label on a shared axis, with each label's AUC shown in
# the legend.
fig, c_ax = plt.subplots(1, 1, figsize=(9, 9))
for column, finding in enumerate(all_labels):
    truth = test_Y[:, column].astype(int)
    scores = pred_Y[:, column]
    fpr, tpr, thresholds = roc_curve(truth, scores)
    area = auc(fpr, tpr)
    c_ax.plot(fpr, tpr, label='%s (AUC:%0.2f)' % (finding, area))
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')