In [1]:
import os
import numpy as np
import pandas as pd
from glob import glob

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
import matplotlib.pyplot as plt

In [2]:
cd "D:\Depaul\DATA_SCIENCE\prog_ml_apps\DATASET\archive (5)"


D:\Depaul\DATA_SCIENCE\prog_ml_apps\DATASET\archive (5)


In [3]:
# Load the per-image metadata table.
data = pd.read_csv('Data_Entry_2017.csv')

# Collect every PNG stored under the images* archive folders.
my_glob = glob('images*/images/*.png')

# Index the discovered files by bare filename; the values are the paths as
# returned by glob, i.e. relative to the current working directory.
all_image_paths = dict(zip(map(os.path.basename, my_glob), my_glob))

print('Scans found:', len(all_image_paths), ', Total Headers', data.shape[0])

# Attach each row's file path (missing when no file with that name was found).
data['path'] = data['Image Index'].map(all_image_paths.get)

# String ages lose their final character before conversion to int;
# non-string values are kept unchanged.
data['Patient Age'] = data['Patient Age'].map(
    lambda age: int(age[:-1]) if isinstance(age, str) else age
)
data.sample(3)

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
81515,00020043_001.png,No Finding,1,20043,45,F,PA,2544,3056,0.139,0.139,,images_009\images\00020043_001.png
9235,00002411_008.png,No Finding,8,2411,35,M,AP,2500,2048,0.168,0.168,,images_002\images\00002411_008.png
83103,00020410_004.png,Atelectasis|Consolidation,4,20410,82,F,AP,3056,2544,0.139,0.139,,images_009\images\00020410_004.png


In [4]:
from itertools import chain

# Blank out the 'No Finding' marker (and missing values) so that rows without
# any finding end up with an empty label string.
data['Finding Labels'] = data['Finding Labels'].map(
    lambda x: x.replace('No Finding', '') if pd.notnull(x) else ''
)

# Split every '|'-separated label string once and keep the non-empty tokens.
# Membership is then tested against whole tokens, so a label that happens to
# be a substring of another label can never produce a false positive.
finding_sets = data['Finding Labels'].map(lambda x: set(x.split('|')) - {''})

# Sorted list of every distinct finding present in the table.
all_labels = sorted(set(chain.from_iterable(finding_sets)))
print('All Labels ({}): {}'.format(len(all_labels), all_labels))

# One float indicator column per finding (1.0 = present, 0.0 = absent).
for c_label in all_labels:
    data[c_label] = finding_sets.map(lambda findings: 1.0 if c_label in findings else 0.0)
data.sample(3)

All Labels (14): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
95881,00025234_009.png,Effusion,9,25234,24,M,PA,3056,2544,0.139,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5623,00001512_001.png,Emphysema|Infiltration|Mass,1,1512,49,M,PA,2500,2048,0.168,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
90915,00022651_017.png,Pneumothorax,17,22651,20,M,PA,2624,2770,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# Display the list of finding names built in the previous cell.
all_labels

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax']

In [6]:
# Minimum positive-case count for the automatic filter below, which is
# currently disabled; MIN_CASES is therefore unused in this cell.
MIN_CASES = 1000
#all_labels = [c_label for c_label in all_labels if data[c_label].sum()>MIN_CASES]

# Hand-picked subset of findings used as the target classes from here on.
# Note that this overwrites the full label list built earlier.
all_labels = ['Effusion', 'Infiltration', 'Mass', 'Nodule', 'Atelectasis', 'Pneumothorax']

# Number of positive rows for each retained label.
label_counts = [(c_label, int(data[c_label].sum())) for c_label in all_labels]
print('Clean Labels ({})'.format(len(all_labels)), label_counts)

Clean Labels (6) [('Effusion', 13317), ('Infiltration', 19894), ('Mass', 5782), ('Nodule', 6331), ('Atelectasis', 11559), ('Pneumothorax', 5302)]


In [7]:
# Per-row weight: number of findings on the image (0 for an empty label
# string) plus a small constant, normalised so the weights sum to 1.
findings_per_row = data['Finding Labels'].map(
    lambda labels: labels.count('|') + 1 if labels else 0
)
sample_weights = findings_per_row.values + 4e-2
sample_weights = sample_weights / sample_weights.sum()
# NOTE(review): the weights are only consumed by the resampling line below,
# which is commented out, so they currently have no effect.
#data = data.sample(100000, weights=sample_weights, replace=True)

In [8]:
# Print column names, non-null counts and dtypes of `data`.
# NOTE(review): the saved output below lists 100000 rows, while the CSV has
# 112120 rows and the resampling line in the previous cell is commented out.
# Presumably the output predates that change; re-run from a fresh kernel to confirm.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 49147 to 53012
Data columns (total 27 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Image Index                  100000 non-null  object 
 1   Finding Labels               100000 non-null  object 
 2   Follow-up #                  100000 non-null  int64  
 3   Patient ID                   100000 non-null  int64  
 4   Patient Age                  100000 non-null  int64  
 5   Patient Gender               100000 non-null  object 
 6   View Position                100000 non-null  object 
 7   OriginalImage[Width          100000 non-null  int64  
 8   Height]                      100000 non-null  int64  
 9   OriginalImagePixelSpacing[x  100000 non-null  float64
 10  y]                           100000 non-null  float64
 11  Unnamed: 11                  0 non-null       float64
 12  path                         100000 non-null  object 
 13  A

In [9]:
# Metadata columns that are not needed for training.
columns_to_drop = [
    'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position',
    'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11',
]
# errors='ignore' skips any listed name that is not a column of `data`.
data = data.drop(columns=columns_to_drop, errors='ignore')


In [10]:
# Per-row array holding the indicator values of the selected labels. The
# lambda wraps the array in a list and .map unwraps it again — presumably so
# that apply returns one object per row instead of expanding it into columns.
data['disease_vec'] = data.apply(lambda x: [x[all_labels].values], axis=1).map(lambda x: x[0])

# Keep only label combinations that occur at least 251 times. The explicit
# copy makes `data` an independent frame, so later column assignments do not
# write into a slice of the unfiltered table.
counts = data['Finding Labels'].value_counts()
mask = data['Finding Labels'].isin(counts[counts >= 251].index)
data = data[mask].copy()

# Fraction of the chosen category that will be removed.
threshold = 0.75
# The empty string is what 'No Finding' rows were rewritten to earlier.
category_to_delete = ''

# Rows belonging to the category being down-sampled.
category_rows = data[data['Finding Labels'] == category_to_delete]

# How many of those rows to remove.
num_rows_to_delete = int(len(category_rows) * threshold)

# Pick the rows with a seeded generator so the resulting dataset is the same
# on every Restart & Run All (the train/validation split uses seed 123 too).
rng = np.random.default_rng(123)
rows_to_delete = rng.choice(category_rows.index.to_numpy(), size=num_rows_to_delete, replace=False)


In [11]:
# Remove the rows selected in the previous cell. errors='ignore' keeps this
# cell re-runnable: on a second run the labels are already gone and are skipped
# instead of raising KeyError.
data = data.drop(rows_to_delete, errors='ignore')

# Number of the selected findings (all_labels) flagged on each image.
data['Total Diseases'] = data[all_labels].sum(axis=1)

# Keep images that carry exactly one of the selected findings. Rows with an
# empty label string have a total of 0 and are excluded here as well.
# The copy gives an independent frame for the split below.
data_one_disease = data[data['Total Diseases'] == 1].copy()

# NOTE(review): this is a random row-level split. 'Patient ID' is dropped
# earlier in the notebook, so images of the same patient may end up on both
# sides — confirm whether a patient-level split is required.
train_df, valid_df = train_test_split(data_one_disease, test_size=0.2, random_state=123)
print('train', train_df.shape[0], 'validation', valid_df.shape[0])

train 33690 validation 8423


In [12]:
# # Calculate the sum of diseases per patient
# train_df['Total Diseases'] = train_df[all_labels].sum(axis=1)

# # Filter patients with exactly one disease
# one_disease_df = train_df[train_df['Total Diseases'] == 1]

In [13]:
# Print column names, non-null counts and dtypes of the single-finding subset.
data_one_disease.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42113 entries, 49147 to 53012
Data columns (total 19 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Image Index         42113 non-null  object 
 1   Finding Labels      42113 non-null  object 
 2   path                42113 non-null  object 
 3   Atelectasis         42113 non-null  float64
 4   Cardiomegaly        42113 non-null  float64
 5   Consolidation       42113 non-null  float64
 6   Edema               42113 non-null  float64
 7   Effusion            42113 non-null  float64
 8   Emphysema           42113 non-null  float64
 9   Fibrosis            42113 non-null  float64
 10  Hernia              42113 non-null  float64
 11  Infiltration        42113 non-null  float64
 12  Mass                42113 non-null  float64
 13  Nodule              42113 non-null  float64
 14  Pleural_Thickening  42113 non-null  float64
 15  Pneumonia           42113 non-null  float64
 16  Pneum

In [14]:

# The string literal below is disabled code (a quick smoke-test sampler that
# takes up to 10 images per class); nothing in it is executed.
# NOTE(review): if it is re-enabled, the training sample is drawn from
# `data_one_disease`, which also contains the validation rows, so the two
# sets could overlap — sample from `train_df` instead.
"""
## for testting purpose 10 iages per each class
#Function to sample 10 images per class
def sample_images_per_class(df, labels, num_samples=10):
    sampled_df = pd.DataFrame()
    for label in labels:
        label_df = df[df[label] == 1]
        sampled_label_df = label_df.sample(min(num_samples, len(label_df)))
        sampled_df = pd.concat([sampled_df, sampled_label_df])
    return sampled_df

train_df = sample_images_per_class(data_one_disease, all_labels, num_samples=10)
valid_df = sample_images_per_class(valid_df, all_labels, num_samples=10)
print('Sampled train', train_df.shape[0], 'Sampled validation', valid_df.shape[0])
"""


"\n## for testting purpose 10 iages per each class\n#Function to sample 10 images per class\ndef sample_images_per_class(df, labels, num_samples=10):\n    sampled_df = pd.DataFrame()\n    for label in labels:\n        label_df = df[df[label] == 1]\n        sampled_label_df = label_df.sample(min(num_samples, len(label_df)))\n        sampled_df = pd.concat([sampled_df, sampled_label_df])\n    return sampled_df\n\ntrain_df = sample_images_per_class(data_one_disease, all_labels, num_samples=10)\nvalid_df = sample_images_per_class(valid_df, all_labels, num_samples=10)\nprint('Sampled train', train_df.shape[0], 'Sampled validation', valid_df.shape[0])\n"

In [29]:
# Size every image is resized to when it is loaded by the generators.
IMG_SIZE = (128, 128)

# Per-image normalisation plus light geometric augmentation.
augmentation_options = dict(
    samplewise_center=True,
    samplewise_std_normalization=True,
    horizontal_flip=True,
    vertical_flip=False,
    height_shift_range=0.05,
    width_shift_range=0.1,
    rotation_range=5,
    shear_range=0.1,
    fill_mode='reflect',
    zoom_range=0.15,
)
core_idg = ImageDataGenerator(**augmentation_options)

In [30]:
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    """Build a directory iterator and overwrite its contents from a DataFrame.

    The iterator is created with ``flow_from_directory`` on the folder of the
    first path in ``in_df``; its file list, labels and sample counts are then
    replaced with the values taken from ``in_df``.

    Args:
        img_data_gen: object exposing ``flow_from_directory``.
        in_df: DataFrame with one row per image.
        path_col: name of the column holding the image paths.
        y_col: name of the column holding the per-row label arrays.
        **dflow_args: extra keyword arguments forwarded to ``flow_from_directory``.

    Returns:
        The patched iterator.
    """
    n_rows = in_df.shape[0]
    image_paths = in_df[path_col].values
    seed_dir = os.path.dirname(image_paths[0])

    print('## Ignore next message from keras, values are replaced anyways')
    df_gen = img_data_gen.flow_from_directory(seed_dir, class_mode='sparse', **dflow_args)

    # Swap the directory-derived contents for the DataFrame's rows.
    df_gen.filenames = image_paths
    df_gen.classes = np.stack(in_df[y_col].values)
    df_gen.samples = n_rows
    df_gen.n = n_rows
    # Rebuild the index array now that the sample count has changed.
    df_gen._set_index_array()
    # Paths in the DataFrame are used as-is, so no directory prefix is wanted.
    df_gen.directory = ''

    print('Reinserting dataframe: {} images'.format(n_rows))
    return df_gen

In [31]:
# Turn each '|'-separated label string into a list of label names, the format
# used for y_col by the generators below. .assign returns new frames instead
# of writing a column into the split outputs in place, and the vectorised
# .str.split replaces the row-by-row apply.
valid_df = valid_df.assign(newLabel=valid_df['Finding Labels'].str.split('|'))
train_df = train_df.assign(newLabel=train_df['Finding Labels'].str.split('|'))

In [32]:
# Evaluation images get the same per-image normalisation as training images
# but none of the random flips/shifts/rotations, so validation and test
# metrics are measured on unaltered inputs and are repeatable.
eval_idg = ImageDataGenerator(samplewise_center=True,
                              samplewise_std_normalization=True)


def _make_flow(idg, df, batch_size, shuffle):
    """Create a categorical image iterator over `df` with the shared settings.

    Args:
        idg: ImageDataGenerator that loads and preprocesses the images.
        df: DataFrame with a 'path' column and a 'newLabel' list column.
        batch_size: number of images per batch.
        shuffle: whether to shuffle the row order.

    Returns:
        The iterator returned by ``idg.flow_from_dataframe``.
    """
    return idg.flow_from_dataframe(dataframe=df,
                                   directory=None,
                                   x_col='path',
                                   y_col='newLabel',
                                   class_mode='categorical',
                                   classes=all_labels,
                                   target_size=IMG_SIZE,
                                   color_mode='rgb',
                                   batch_size=batch_size,
                                   shuffle=shuffle)


# Training: augmented and shuffled.
train_gen = _make_flow(core_idg, train_df, batch_size=32, shuffle=True)

# Validation: no augmentation, fixed order; larger batches are fine for evaluation.
valid_gen = _make_flow(eval_idg, valid_df, batch_size=256, shuffle=False)

# One fixed batch of up to 1024 images used for the metrics and ROC curves.
# NOTE(review): this batch comes from valid_df, the same rows that drive
# checkpointing and early stopping, so it is not an independent test set.
test_X, test_Y = next(_make_flow(eval_idg, valid_df, batch_size=1024, shuffle=False))

Found 33690 validated image filenames belonging to 6 classes.
Found 8423 validated image filenames belonging to 6 classes.
Found 8423 validated image filenames belonging to 6 classes.


In [33]:
# Pull one batch from the training generator as a quick sanity check;
# t_x and t_y are not referenced again in the cells shown here.
t_x, t_y = next(train_gen)

In [34]:
# def dense_block(x, blocks):
#     for _ in range(blocks):
#         x1 = layers.BatchNormalization()(x)
#         x1 = layers.Activation('relu')(x1)
#         x1 = layers.Conv2D(32, kernel_size=3, padding='same')(x1)
#         x = layers.Concatenate()([x, x1])
#     return x

# def transition_block(x):
#     x = layers.BatchNormalization()(x)
#     x = layers.Activation('relu')(x)
#     x = layers.Conv2D(128, kernel_size=1, padding='same')(x)
#     x = layers.AveragePooling2D(pool_size=2, strides=2)(x)
#     return x

# def create_densenet121(input_shape, num_classes):
#     inputs = tf.keras.Input(shape=input_shape)
#     x = layers.Conv2D(64, kernel_size=7, strides=2, padding='same')(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.Activation('relu')(x)
#     x = layers.MaxPooling2D(pool_size=3, strides=2, padding='same')(x)

#     x = dense_block(x, 6)
#     x = transition_block(x)

#     x = dense_block(x, 12)
#     x = transition_block(x)

#     x = dense_block(x, 24)
#     x = transition_block(x)

#     x = dense_block(x, 16)

#     x = layers.BatchNormalization()(x)
#     x = layers.Activation('relu')(x)
#     x = layers.GlobalAveragePooling2D()(x)

#     outputs = layers.Dense(num_classes, activation='softmax')(x)

#     model = tf.keras.Model(inputs=inputs, outputs=outputs)
#     return model


In [35]:
# # Define input shape and number of classes
# input_shape = (128, 128, 3)

# # Create the model
# model = create_densenet121(input_shape, len(all_labels))

# # Print the model summary
# model.summary()

In [36]:
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
# model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy', 'mae'])

# from keras.callbacks import ModelCheckpoint, EarlyStopping

# weight_path = "xray_class_weights.best.weights.h5"

# checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True)
# early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
# callbacks_list = [checkpoint, early]

In [37]:
# Leftover inspection: show the class of the training iterator.
type(train_gen)

keras.src.legacy.preprocessing.image.DataFrameIterator

In [38]:
# Train the model
# model.fit(train_gen, validation_data=valid_gen, epochs=10, callbacks=callbacks_list)
#model.fit(train_gen, validation_data=valid_gen, epochs=10)

In [39]:
# # Predicting and calculating the metrics
# pred_Y = model.predict(test_X, batch_size=32, verbose=True)
# pred_labels = np.argmax(pred_Y, axis=1)
# true_labels = np.argmax(test_Y, axis=1)

# precision = precision_score(true_labels, pred_labels, average='weighted')
# recall = recall_score(true_labels, pred_labels, average='weighted')
# f1 = f1_score(true_labels, pred_labels, average='weighted')

# print('Precision:', precision)
# print('Recall:', recall)
# print('F1 Score:', f1)


In [40]:
# # Plotting ROC Curves
# fig, c_ax = plt.subplots(1, 1, figsize=(6, 6))
# for (idx, c_label) in enumerate(all_labels):
#     fpr, tpr, thresholds = roc_curve(test_Y[:, idx].astype(int), pred_Y[:, idx])
#     c_ax.plot(fpr, tpr, label='%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr)))
# c_ax.legend()
# c_ax.set_xlabel('False Positive Rate')
# c_ax.set_ylabel('True Positive Rate')
# fig.savefig('barely_trained_net.png')

# Using pre-trained ImageNet weights (DenseNet121 backbone)

In [41]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

def dense_block(x, blocks, growth_rate=32):
    """Append `blocks` BN -> ReLU -> 3x3 Conv layers, concatenating each onto `x`.

    Args:
        x: input tensor.
        blocks: number of BN/ReLU/Conv units to append.
        growth_rate: number of filters of each 3x3 convolution, i.e. how many
            channels every unit adds to the running tensor. Defaults to 32,
            the value that was previously hard-coded.

    Returns:
        The tensor after all units have been concatenated onto the input.
    """
    for _ in range(blocks):
        branch = layers.BatchNormalization()(x)
        branch = layers.Activation('relu')(branch)
        branch = layers.Conv2D(growth_rate, kernel_size=3, padding='same')(branch)
        # Each unit sees everything produced so far and adds its own features.
        x = layers.Concatenate()([x, branch])
    return x

def transition_block(x):
    """Apply BN -> ReLU -> 1x1 Conv (128 filters) -> 2x2 average pooling to `x`.

    Returns the tensor produced by the final pooling layer.
    """
    stages = (
        layers.BatchNormalization(),
        layers.Activation('relu'),
        layers.Conv2D(128, kernel_size=1, padding='same'),
        # 'same' padding so that small feature maps are not reduced too far.
        layers.AveragePooling2D(pool_size=2, strides=2, padding='same'),
    )
    for stage in stages:
        x = stage(x)
    return x

def create_densenet121_custom(input_shape, num_classes):
    """Build a classifier: frozen ImageNet DenseNet121 followed by extra dense blocks.

    Args:
        input_shape: shape of one input image, passed to both the backbone
            and the model's Input layer.
        num_classes: number of units of the final softmax layer.

    Returns:
        A ``tf.keras.Model`` mapping the input image to class probabilities.
    """
    # Pre-trained feature extractor without its classification head; frozen.
    backbone = DenseNet121(include_top=False, weights='imagenet', input_shape=input_shape)
    backbone.trainable = False

    inputs = tf.keras.Input(shape=input_shape)
    features = backbone(inputs, training=False)

    # Dense blocks of 6, 12, 24 and 16 units, with a transition block after
    # every dense block except the last one.
    block_sizes = (6, 12, 24, 16)
    final_position = len(block_sizes) - 1
    for position, n_units in enumerate(block_sizes):
        features = dense_block(features, n_units)
        if position != final_position:
            features = transition_block(features)

    # Classification head.
    features = layers.BatchNormalization()(features)
    features = layers.Activation('relu')(features)
    pooled = layers.GlobalAveragePooling2D()(features)
    outputs = layers.Dense(num_classes, activation='softmax')(pooled)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Derive the model input shape from IMG_SIZE (the target_size used by the
# generators) plus 3 channels for color_mode='rgb', so the two cannot drift
# apart if the image size is changed.
input_shape = IMG_SIZE + (3,)
# One output unit per target label.
num_classes = len(all_labels)

# Create the model
model = create_densenet121_custom(input_shape, num_classes)

# Print the model summary
model.summary()



In [None]:

optimizer = tf.keras.optimizers.Adam(learning_rate=0.003)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['categorical_accuracy', 'mae'])

# File that receives the weights of the epoch with the lowest validation loss.
weight_path = "xray_class_weights.best.weights.h5"

checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min', save_weights_only=True)
# Stop once validation loss has not improved for 5 consecutive epochs.
early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks_list = [checkpoint, early]

# Training the model; keeping the History object gives access to the
# per-epoch metrics and avoids echoing its repr as the cell output.
history = model.fit(train_gen, validation_data=valid_gen, epochs=50, callbacks=callbacks_list)

# Training ends on the last epoch's weights, which may be several epochs past
# the best one. Reload the checkpointed best weights so that the evaluation
# cells below score the model that was actually selected.
model.load_weights(weight_path)

Epoch 1/50


  self._warn_if_super_not_called()


[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - categorical_accuracy: 0.3717 - loss: 1.6779 - mae: 0.2455
Epoch 1: val_loss improved from inf to 1.89699, saving model to xray_class_weights.best.weights.h5
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1565s[0m 1s/step - categorical_accuracy: 0.3717 - loss: 1.6778 - mae: 0.2455 - val_categorical_accuracy: 0.3860 - val_loss: 1.8970 - val_mae: 0.2449
Epoch 2/50
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - categorical_accuracy: 0.4298 - loss: 1.4964 - mae: 0.2344
Epoch 2: val_loss improved from 1.89699 to 1.47184, saving model to xray_class_weights.best.weights.h5
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1598s[0m 2s/step - categorical_accuracy: 0.4298 - loss: 1.4964 - mae: 0.2344 - val_categorical_accuracy: 0.4401 - val_loss: 1.4718 - val_mae: 0.2291
Epoch 3/50
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - cate

In [None]:
# Class probabilities for the held-out batch.
pred_Y = model.predict(test_X, batch_size=32, verbose=True)

# Collapse one-hot targets and probability rows to class indices.
true_labels = test_Y.argmax(axis=1)
pred_labels = pred_Y.argmax(axis=1)

# Support-weighted averages across the classes.
precision, recall, f1 = (
    scorer(true_labels, pred_labels, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

for caption, value in (('Precision:', precision), ('Recall:', recall), ('F1 Score:', f1)):
    print(caption, value)

In [None]:
# One-vs-rest ROC curve per label, all drawn on a single axis.
fig, c_ax = plt.subplots(1, 1, figsize=(6, 6))
for idx, c_label in enumerate(all_labels):
    y_true = test_Y[:, idx].astype(int)
    y_score = pred_Y[:, idx]
    fpr, tpr, thresholds = roc_curve(y_true, y_score)
    curve_label = '%s (AUC:%0.2f)' % (c_label, auc(fpr, tpr))
    c_ax.plot(fpr, tpr, label=curve_label)
c_ax.legend()
c_ax.set_xlabel('False Positive Rate')
c_ax.set_ylabel('True Positive Rate')
#fig.savefig('barely_trained_net.png')