In [11]:
#Just to reset prometheus_client registry

from prometheus_client import REGISTRY

# Empty the default Prometheus registry so that metric definitions can be
# (re-)created in this kernel without clashing with collectors registered by
# an earlier execution.
# NOTE(review): `_names_to_collectors` is a private attribute of the registry;
# confirm it still exists when upgrading prometheus_client.
stale_collectors = list(REGISTRY._names_to_collectors.values())
for stale in stale_collectors:
    try:
        REGISTRY.unregister(stale)
    except KeyError:
        # Presumably one collector is listed under several metric names, so a
        # second unregister of an already-removed collector is expected here.
        continue


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from sklearn.metrics import roc_auc_score

import pandas as pd
from PIL import Image
from torchvision import models
from torch.utils.data import Dataset
import mlflow
import mlflow.pytorch


from prometheus_client import start_http_server, Gauge, Summary

# Point the MLflow client at the local tracking server and select the
# experiment that runs from this notebook are filed under.
mlflow.set_tracking_uri(uri="http://localhost:5000")
mlflow.set_experiment(experiment_name="chexpert-jupyter")

# Prometheus collectors used by the training cell: a gauge holding the most
# recent per-epoch training loss, and a summary timing the whole training block.
train_loss_metric = Gauge(
    name="train_loss",
    documentation="Training loss per epoch",
)
training_duration = Summary(
    name="training_duration_seconds",
    documentation="Total training time in seconds",
)


In [21]:
# Environment consumed by MLflow and its S3 client: where the tracking server
# lives, which S3-compatible endpoint stores artifacts, and the keys used to
# authenticate against it.
# NOTE(review): the access key and secret are hard-coded (they look like
# MinIO's stock defaults — confirm). Load them from outside the notebook
# before this is shared or pointed at anything other than a local instance.
os.environ.update(
    {
        "MLFLOW_TRACKING_URI": "http://localhost:5000",
        "MLFLOW_S3_ENDPOINT_URL": "http://localhost:9000",
        "AWS_ACCESS_KEY_ID": "minioadmin",
        "AWS_SECRET_ACCESS_KEY": "minioadmin",
    }
)


In [13]:
# -----------------------
# Config
# -----------------------
# Target columns read from the CSVs; their order defines the order of the
# entries in every label vector.
LABELS = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
    "No Finding",
]

# Training hyperparameters.
BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 1e-4
NUM_CLASSES = len(LABELS)  # 14: one model output per entry in LABELS

# Run on the GPU when torch can see one, otherwise on the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Input locations, relative to the notebook's working directory.
TRAIN_CSV = "sample_train.csv"
TEST_CSV = "sample_test.csv"
# Joined with each CSV "Path" value to locate an image.
# NOTE(review): the dataset directory is often spelled "CheXpert-v1.0" —
# confirm the on-disk casing matches this value on case-sensitive filesystems.
IMG_ROOT = "Chexpert-v1.0"

In [14]:
# -----------------------
# Dataset
# -----------------------
class CheXpertDataset(Dataset):
    """Image/label dataset backed by a CSV listing.

    Each CSV row supplies a relative image location in its ``Path`` column and
    one column per entry in ``LABELS``. Missing label values and values equal
    to ``-1`` are both mapped to ``0`` when the CSV is loaded.

    Args:
        csv_file: CSV file to read with ``pd.read_csv``.
        root_dir: Directory that each row's ``Path`` value is joined onto.
        transform: Optional callable applied to the RGB image before it is
            returned.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        frame = pd.read_csv(csv_file)
        # Collapse "no value" and -1 into 0 for every label column.
        frame[LABELS] = frame[LABELS].fillna(0).replace(-1, 0)
        self.df = frame
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of CSV rows."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for row ``idx``.

        ``labels`` is a float32 tensor with one entry per name in ``LABELS``.
        """
        record = self.df.iloc[idx]
        image = Image.open(os.path.join(self.root_dir, record["Path"])).convert("RGB")
        if self.transform:
            image = self.transform(image)
        target = record[LABELS].values.astype("float32")
        return image, torch.tensor(target)


In [15]:
# -----------------------
# Model
# -----------------------
class CheXpertModel(nn.Module):
    """DenseNet-121 with its classifier swapped for a ``num_classes``-wide linear head.

    The backbone is initialised from torchvision's pretrained weights. The new
    head is a plain ``nn.Linear`` with no activation, so ``forward`` returns
    raw logits.

    Args:
        num_classes: Number of outputs of the replacement classifier.
    """

    def __init__(self, num_classes=14):
        super().__init__()
        # `pretrained=True` is deprecated in favour of the `weights=` enum API.
        # Fall back to the legacy flag only when the installed torchvision
        # predates that API.
        weights_enum = getattr(models, "DenseNet121_Weights", None)
        if weights_enum is not None:
            base = models.densenet121(weights=weights_enum.IMAGENET1K_V1)
        else:
            base = models.densenet121(pretrained=True)
        in_features = base.classifier.in_features
        base.classifier = nn.Linear(in_features, num_classes)
        self.model = base

    def forward(self, x):
        """Run ``x`` through the wrapped DenseNet and return its output."""
        return self.model(x)

In [16]:
# -----------------------
# AUC Evaluation
# -----------------------
# def compute_auc(y_true, y_pred):
#     try:
#         aucs = [roc_auc_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])]
#         return sum(aucs) / len(aucs)
#     except Exception:
#         return None

In [17]:
# -----------------------
# Transforms
# -----------------------
# Preprocessing applied to every image: resize to a fixed 224x224, convert to a
# tensor, then normalise each channel.
# NOTE(review): the mean/std values look like the usual ImageNet statistics —
# confirm they match what the pretrained backbone expects.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [18]:
# -----------------------
# Load Data
# -----------------------
# Build the train/test datasets from their CSV listings; both share the same
# image root and preprocessing.
train_ds = CheXpertDataset(csv_file=TRAIN_CSV, root_dir=IMG_ROOT, transform=transform)
test_ds = CheXpertDataset(csv_file=TEST_CSV, root_dir=IMG_ROOT, transform=transform)

# Only the training loader shuffles; the test loader keeps CSV order.
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
#prometheus

In [23]:
# Train the model inside a single MLflow run, timing the whole block with the
# `training_duration` Prometheus summary.
with training_duration.time():

    with mlflow.start_run():

        # Expose the Prometheus metrics endpoint. The port can be bound only
        # once, so re-running this cell in the same kernel must not fail here.
        try:
            start_http_server(8001)
        except OSError:
            print("Prometheus exporter port 8001 is already in use; not starting another server.")

        # Log hyperparameters
        mlflow.log_params(
            {
                "batch_size": BATCH_SIZE,
                "epochs": EPOCHS,
                "learning_rate": LEARNING_RATE,
            }
        )

        model = CheXpertModel(NUM_CLASSES).to(DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        # The model returns raw logits; this loss applies the sigmoid itself.
        criterion = nn.BCEWithLogitsLoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

        best_loss = float("inf")
        best_model_path = "best_model.pth"

        for epoch in range(EPOCHS):
            model.train()
            total_loss = 0

            for imgs, labels in train_loader:
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                optimizer.zero_grad()
                outputs = model(imgs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # Mean of the per-batch losses for this epoch.
            avg_loss = total_loss / len(train_loader)

            # Log to MLflow
            print(f"[Epoch {epoch+1}] Train Loss: {avg_loss:.4f}")
            mlflow.log_metric("train_loss", avg_loss, step=epoch)

            # Log to Prometheus
            train_loss_metric.set(avg_loss)

            # The scheduler and the checkpoint choice below are both driven by
            # the training loss; no validation split is used in this cell.
            scheduler.step(avg_loss)

            if avg_loss < best_loss:
                best_loss = avg_loss
                torch.save(model.state_dict(), best_model_path)
                print(f"✅ Saved new best model at epoch {epoch+1} (Loss: {avg_loss:.4f})")

        # Log final artifacts: the best checkpoint file, plus the model as it
        # stands after the last epoch.
        mlflow.log_artifact(best_model_path)
        mlflow.pytorch.log_model(model, "final_model")




[Epoch 1] Train Loss: 0.7224
✅ Saved new best model at epoch 1 (Loss: 0.7224)
[Epoch 2] Train Loss: 0.5972
✅ Saved new best model at epoch 2 (Loss: 0.5972)
[Epoch 3] Train Loss: 0.4800
✅ Saved new best model at epoch 3 (Loss: 0.4800)
[Epoch 4] Train Loss: 0.4055
✅ Saved new best model at epoch 4 (Loss: 0.4055)
[Epoch 5] Train Loss: 0.3480
✅ Saved new best model at epoch 5 (Loss: 0.3480)




üèÉ View run auspicious-chimp-742 at: http://localhost:5000/#/experiments/1/runs/26c8da488ee5473ebd787fce6eda55c2
üß™ View experiment at: http://localhost:5000/#/experiments/1


In [32]:
import mlflow
# Diagnostic: show which tracking URI the MLflow client in this kernel is
# currently using.
print("Tracking URI:", mlflow.get_tracking_uri())


Tracking URI: file:///c:/Users/shaba/Documents/Collage/mlops/project/MedAI-Scalable-Diagnosis-with-Machine-Learning/mlruns


In [9]:
# -----------------------
# Evaluation
# -----------------------
# Restore the best checkpoint written by the training cell.
# `map_location` lets a checkpoint saved on a GPU load on a CPU-only machine;
# `weights_only=True` limits unpickling to tensors and plain containers, which
# is all a state_dict contains.
model.load_state_dict(
    torch.load(best_model_path, map_location=DEVICE, weights_only=True)
)
model.eval()

all_preds = []
all_labels = []

with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(DEVICE)
        # The model emits logits; sigmoid turns them into per-label probabilities.
        outputs = torch.sigmoid(model(imgs)).cpu()
        all_preds.append(outputs)
        all_labels.append(labels)

all_preds = torch.cat(all_preds).numpy()
all_labels = torch.cat(all_labels).numpy()

# Convert probabilities to binary predictions (threshold = 0.5)
preds_binary = (all_preds >= 0.5).astype(int)

# Element-wise (micro) accuracy: the fraction of all (image, label) cells that
# match, not a per-label average.
correct = (preds_binary == all_labels).sum()
total = preds_binary.size
accuracy = correct / total

print(f"\n✅ Test Accuracy (avg over all labels): {accuracy:.4f}")


# Show ground truth vs prediction for a few samples; the banner reports the
# number of images actually printed.
num_shown = min(6, len(all_preds))
print(f"\n📊 Ground Truth vs Predictions (first {num_shown} images):")
for i in range(num_shown):
    print(f"\nImage {i+1}:")
    print("Labels (GT):     ", all_labels[i].astype(int).tolist())
    print("Predictions (bin):", preds_binary[i].tolist())


  model.load_state_dict(torch.load(best_model_path))



✅ Test Accuracy (avg over all labels): 0.8286

📊 Ground Truth vs Predictions (first 3 images):

Image 1:
Labels (GT):      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Predictions (bin): [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image 2:
Labels (GT):      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Predictions (bin): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image 3:
Labels (GT):      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Predictions (bin): [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image 4:
Labels (GT):      [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
Predictions (bin): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

Image 5:
Labels (GT):      [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
Predictions (bin): [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# Close whatever MLflow run is still active in this kernel.
# NOTE(review): the training cell opens its run with `with mlflow.start_run():`,
# so this is presumably only a safety net for a run left open by an
# interrupted execution — confirm it is still needed.
mlflow.end_run()
