In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras import layers, models, optimizers

print("TensorFlow version:", tf.__version__)


TensorFlow version: 2.20.0


In [5]:
BASE_DIR = Path("..").resolve()

# This should point to: D:\HealthAI-Project\datasets\CheXpertSmall
DATA_ROOT = BASE_DIR / "datasets" / "CheXpertSmall"

# CheXpert folder inside it
CHEXPERT_DIR = DATA_ROOT / "CheXpert-v1.0-small"

TRAIN_CSV = CHEXPERT_DIR / "train.csv"
VALID_CSV = CHEXPERT_DIR / "valid.csv"

print("BASE_DIR:", BASE_DIR)
print("DATA_ROOT:", DATA_ROOT)
print("CHEXPERT_DIR:", CHEXPERT_DIR)
print("TRAIN_CSV exists:", TRAIN_CSV.exists())
print("VALID_CSV exists:", VALID_CSV.exists())

train_df = pd.read_csv(TRAIN_CSV)
valid_df = pd.read_csv(VALID_CSV)

print("Raw train shape:", train_df.shape)
print("Raw valid shape:", valid_df.shape)
print(train_df.head())

BASE_DIR: D:\HealthAI-Project
DATA_ROOT: D:\HealthAI-Project\datasets\CheXpertSmall
CHEXPERT_DIR: D:\HealthAI-Project\datasets\CheXpertSmall\CheXpert-v1.0-small
TRAIN_CSV exists: True
VALID_CSV exists: True
Raw train shape: (223414, 19)
Raw valid shape: (234, 19)
                                                Path     Sex  Age  \
0  CheXpert-v1.0-small/train/patient00001/study1/...  Female   68   
1  CheXpert-v1.0-small/train/patient00002/study2/...  Female   87   
2  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
3  CheXpert-v1.0-small/train/patient00002/study1/...  Female   83   
4  CheXpert-v1.0-small/train/patient00003/study1/...    Male   41   

  Frontal/Lateral AP/PA  No Finding  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP         1.0                         NaN           NaN   
1         Frontal    AP         NaN                         NaN          -1.0   
2         Frontal    AP         NaN                         NaN           NaN 

In [6]:
print(train_df["Path"].head(3))

first_rel = train_df["Path"].iloc[0]
full_path = DATA_ROOT / first_rel

print("Relative:", first_rel)
print("Full path:", full_path)
print("File exists?", full_path.exists())


0    CheXpert-v1.0-small/train/patient00001/study1/...
1    CheXpert-v1.0-small/train/patient00002/study2/...
2    CheXpert-v1.0-small/train/patient00002/study1/...
Name: Path, dtype: object
Relative: CheXpert-v1.0-small/train/patient00001/study1/view1_frontal.jpg
Full path: D:\HealthAI-Project\datasets\CheXpertSmall\CheXpert-v1.0-small\train\patient00001\study1\view1_frontal.jpg
File exists? True


In [7]:
DISEASES = [
    "Atelectasis",
    "Cardiomegaly",
    "Consolidation",
    "Edema",
    "Pleural Effusion",
    "Pneumonia",
    "Pneumothorax",
    "No Finding"
]

# Keep only needed columns
train_df = train_df[["Path"] + DISEASES].copy()
valid_df = valid_df[["Path"] + DISEASES].copy()

def process_labels(df, disease_cols):
    df = df.copy()
    for col in disease_cols:
        df[col] = df[col].fillna(0)
        df[col] = df[col].replace(-1, 1)  # uncertain -> positive
    return df

train_df = process_labels(train_df, DISEASES)
valid_df = process_labels(valid_df, DISEASES)

# Build full file paths using DATA_ROOT now
train_df["filepath"] = train_df["Path"].apply(lambda p: str(DATA_ROOT / p))
valid_df["filepath"] = valid_df["Path"].apply(lambda p: str(DATA_ROOT / p))

# Drop missing files (now should keep MANY)
train_df = train_df[train_df["filepath"].apply(os.path.exists)].reset_index(drop=True)
valid_df = valid_df[valid_df["filepath"].apply(os.path.exists)].reset_index(drop=True)

print("Train images:", len(train_df))
print("Valid images:", len(valid_df))
train_df[["Path", "filepath"] + DISEASES].head()


Train images: 223414
Valid images: 234


Unnamed: 0,Path,filepath,Atelectasis,Cardiomegaly,Consolidation,Edema,Pleural Effusion,Pneumonia,Pneumothorax,No Finding
0,CheXpert-v1.0-small/train/patient00001/study1/...,D:\HealthAI-Project\datasets\CheXpertSmall\Che...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,CheXpert-v1.0-small/train/patient00002/study2/...,D:\HealthAI-Project\datasets\CheXpertSmall\Che...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
2,CheXpert-v1.0-small/train/patient00002/study1/...,D:\HealthAI-Project\datasets\CheXpertSmall\Che...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,CheXpert-v1.0-small/train/patient00002/study1/...,D:\HealthAI-Project\datasets\CheXpertSmall\Che...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,CheXpert-v1.0-small/train/patient00003/study1/...,D:\HealthAI-Project\datasets\CheXpertSmall\Che...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
train_df = train_df.sample(min(25000, len(train_df)), random_state=42)
valid_df = valid_df.sample(min(5000, len(valid_df)), random_state=42)

print("After subsample - Train:", len(train_df), "Valid:", len(valid_df))


After subsample - Train: 25000 Valid: 234


In [9]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

IMG_SIZE = (224, 224)
BATCH_SIZE = 32

train_datagen = ImageDataGenerator(
    rescale=1.0/255.0,
    horizontal_flip=True,
    rotation_range=10,
    width_shift_range=0.05,
    height_shift_range=0.05,
    zoom_range=0.1,
)

valid_datagen = ImageDataGenerator(
    rescale=1.0/255.0
)

train_gen = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    x_col="filepath",
    y_col=DISEASES,              # list of label columns
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="raw",            # multi-label
    shuffle=True
)

val_gen = valid_datagen.flow_from_dataframe(
    dataframe=valid_df,
    x_col="filepath",
    y_col=DISEASES,
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode="raw",
    shuffle=False
)


Found 25000 validated image filenames.
Found 234 validated image filenames.


In [10]:
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras import layers, models, optimizers
import tensorflow as tf

base_model = DenseNet121(
    include_top=False,
    weights="imagenet",
    input_shape=IMG_SIZE + (3,)
)

base_model.trainable = False  # first stage: freeze base

model_md = models.Sequential([
    base_model,
    layers.GlobalAveragePooling2D(),
    layers.Dropout(0.3),
    layers.Dense(len(DISEASES), activation="sigmoid")  # one prob per disease
])

model_md.compile(
    optimizer=optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",  # multi-label loss
    metrics=["accuracy"]
)

model_md.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m29084464/29084464[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 0us/step


In [11]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from pathlib import Path

MODELS_DIR = (BASE_DIR / "models")
MODELS_DIR.mkdir(exist_ok=True)

md_model_path = MODELS_DIR / "xray_chexpert_multidisease_model.h5"

early_stop = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    md_model_path,
    monitor="val_loss",
    save_best_only=True
)

EPOCHS = 5  # start small – we can fine-tune more later

history_md = model_md.fit(
    train_gen,
    validation_data=val_gen,
    epochs=EPOCHS,
    callbacks=[early_stop, checkpoint]
)


Epoch 1/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.1981 - loss: 0.5181



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1452s[0m 2s/step - accuracy: 0.2054 - loss: 0.4828 - val_accuracy: 0.1068 - val_loss: 0.4171
Epoch 2/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.2122 - loss: 0.4570



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1218s[0m 2s/step - accuracy: 0.2152 - loss: 0.4547 - val_accuracy: 0.2607 - val_loss: 0.4118
Epoch 3/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.2197 - loss: 0.4517



[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1214s[0m 2s/step - accuracy: 0.2190 - loss: 0.4518 - val_accuracy: 0.2906 - val_loss: 0.3939
Epoch 4/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1213s[0m 2s/step - accuracy: 0.2213 - loss: 0.4496 - val_accuracy: 0.2906 - val_loss: 0.3982
Epoch 5/5
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1214s[0m 2s/step - accuracy: 0.2192 - loss: 0.4496 - val_accuracy: 0.1410 - val_loss: 0.4173


In [12]:
val_loss, val_acc = model_md.evaluate(val_gen)
print("Multi-disease Val loss:", val_loss)
print("Multi-disease Val accuracy:", val_acc)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2s/step - accuracy: 0.2906 - loss: 0.3939
Multi-disease Val loss: 0.3938594460487366
Multi-disease Val accuracy: 0.2905983030796051


In [13]:
import json

labels_path = MODELS_DIR / "xray_chexpert_labels.json"

with open(labels_path, "w") as f:
    json.dump(DISEASES, f, indent=2)

print("Saved labels to:", labels_path)
print("Saved model to:", md_model_path)


Saved labels to: D:\HealthAI-Project\models\xray_chexpert_labels.json
Saved model to: D:\HealthAI-Project\models\xray_chexpert_multidisease_model.h5


In [14]:
from tensorflow.keras.utils import load_img, img_to_array
import numpy as np

def predict_chexpert_image(img_path, model, diseases, img_size=(224, 224)):
    img = load_img(img_path, target_size=img_size, color_mode="rgb")
    arr = img_to_array(img) / 255.0
    arr = np.expand_dims(arr, axis=0)

    probs = model.predict(arr)[0]  # shape: (len(diseases),)

    return {diseases[i]: float(probs[i]) for i in range(len(diseases))}


In [15]:
sample_path = valid_df["filepath"].iloc[0]
print("Sample image:", sample_path)

preds = predict_chexpert_image(sample_path, model_md, DISEASES, IMG_SIZE)
preds


Sample image: D:\HealthAI-Project\datasets\CheXpertSmall\CheXpert-v1.0-small\valid\patient64592\study1\view1_frontal.jpg
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


{'Atelectasis': 0.3581162393093109,
 'Cardiomegaly': 0.12076950818300247,
 'Consolidation': 0.12704865634441376,
 'Edema': 0.21756716072559357,
 'Pleural Effusion': 0.34631186723709106,
 'Pneumonia': 0.028416648507118225,
 'Pneumothorax': 0.04238026589155197,
 'No Finding': 0.07330987602472305}