In [1]:
# Reset the prometheus_client default registry.

from prometheus_client import REGISTRY

# Remove every collector currently held by the default registry.
# NOTE(review): `_names_to_collectors` is a private attribute of the registry
# object and may change between prometheus_client releases.
registered_collectors = list(REGISTRY._names_to_collectors.values())
for registered in registered_collectors:
    try:
        REGISTRY.unregister(registered)
    except KeyError:
        # Already removed -- presumably the same collector was listed under
        # more than one metric name. Nothing left to do for this entry.
        continue


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms
from torch.utils.data import random_split, Dataset
import pandas as pd
from PIL import Image
from torchvision import models
import mlflow
import mlflow.pytorch
import json
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune import Tuner, TuneConfig
from ray.train import Checkpoint, session

from prometheus_client import start_http_server, Gauge, Summary

In [None]:
# Point the MLflow client at the local tracking server and pick the experiment
# under which runs from this notebook are recorded.
TRACKING_URI = "http://localhost:5000"
mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment("chexpert-jupyter")  # name your experiment

# Mirror the tracking URI into the environment and set the S3 endpoint
# (presumably the local MinIO instance -- confirm).
os.environ.update({
    "MLFLOW_TRACKING_URI": TRACKING_URI,
    "MLFLOW_S3_ENDPOINT_URL": "http://localhost:9000",
})

# Training metrics updated by the training loop.
# NOTE(review): registry=None keeps these collectors out of the default
# REGISTRY, so an exporter serving the default registry would not expose
# them -- confirm this is intended.
train_loss_metric = Gauge(
    "train_loss",
    "Training loss per epoch",
    registry=None,
)
val_loss_metric = Gauge(
    "val_loss",
    "Validation loss",
    registry=None,
)
training_duration = Summary(
    "training_duration_seconds",
    "Total training time",
    registry=None,
)

In [None]:
# MinIO credentials for the S3-compatible artifact store.
# Values already present in the environment take precedence, so real secrets
# can be injected from outside the notebook. The literals below are only
# local-development fallbacks and must not be used for a shared deployment.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "minioadmin")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "minioadmin")

In [None]:
# -----------------------
# Config
# -----------------------
# Names of the CSV label columns, in the order used for the model outputs.
# Must stay a list: it is used to select several DataFrame columns at once.
LABELS = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
    "No Finding",
]

# Training hyperparameters.
BATCH_SIZE = 1
EPOCHS = 5
LEARNING_RATE = 1e-4
# Derived from LABELS so the classifier width cannot drift from the label list.
NUM_CLASSES = len(LABELS)
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data locations; image paths stored in the CSVs are joined onto IMG_ROOT.
TRAIN_CSV = "sample_train.csv"
TEST_CSV = "sample_test.csv"
IMG_ROOT = "."  # Current dir includes CheXpert-v1.0/

In [None]:
# -----------------------
# Dataset
# -----------------------
class CheXpertDataset(Dataset):
    """Image / multi-label dataset backed by a CSV file.

    The CSV must contain a ``Path`` column (image path relative to
    ``root_dir``) and one column per label name. Missing label values and
    the value ``-1`` are both mapped to ``0`` when the dataset is built.

    Args:
        csv_file: Path of the CSV file to read.
        root_dir: Directory that the ``Path`` column is joined onto.
        transform: Optional callable applied to the RGB PIL image.
        labels: Optional list of label column names. When omitted, the
            notebook-level ``LABELS`` list is used.
    """

    def __init__(self, csv_file, root_dir, transform=None, labels=None):
        self.df = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        # LABELS is only evaluated when no explicit column list is supplied.
        self.label_names = list(LABELS if labels is None else labels)
        self.df[self.label_names] = (
            self.df[self.label_names].fillna(0).replace(-1, 0)
        )
        # Convert the label columns once instead of casting an object-dtype
        # row to float32 on every __getitem__ call.
        self.targets = self.df[self.label_names].to_numpy(dtype="float32")

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, labels)`` for row ``idx``.

        ``image`` is the RGB image after ``transform`` (if any); ``labels``
        is a float32 tensor with one entry per label column.
        """
        img_path = os.path.join(self.root_dir, self.df["Path"].iloc[idx])
        # The context manager closes the underlying file as soon as the
        # pixel data has been copied into the converted image.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        if self.transform:
            image = self.transform(image)
        labels = torch.tensor(self.targets[idx])
        return image, labels


In [None]:
# -----------------------
# Model
# -----------------------
class CheXpertModel(nn.Module):
    """DenseNet-121 backbone with its classifier replaced for multi-label output.

    Args:
        num_classes: Width of the replacement linear classifier.

    The forward pass returns the raw outputs of that linear layer (no
    sigmoid is applied inside the model).
    """

    def __init__(self, num_classes=14):
        super().__init__()
        # Newer torchvision releases deprecate `pretrained=True` in favour of
        # an explicit weights enum; fall back to the old flag when the enum
        # is not available in the installed version.
        weights_enum = getattr(models, "DenseNet121_Weights", None)
        if weights_enum is not None:
            base = models.densenet121(weights=weights_enum.IMAGENET1K_V1)
        else:
            base = models.densenet121(pretrained=True)
        in_features = base.classifier.in_features
        base.classifier = nn.Linear(in_features, num_classes)
        # Attribute name is part of the state_dict key prefix ("model.").
        self.model = base

    def forward(self, x):
        """Run the backbone on ``x`` and return its classifier outputs."""
        return self.model(x)

In [None]:
# -----------------------
# Transforms
# -----------------------
# Per-channel mean / std used for normalisation (the commonly used ImageNet
# statistics).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Resize to 224x224, convert to a tensor, then normalise each channel.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
]
transform = transforms.Compose(preprocessing_steps)

In [None]:
# -----------------------
# Load Data
# -----------------------
# Seed for the train/validation split so the same rows land in each subset
# every time the notebook is re-run.
SPLIT_SEED = 42

train_ds = CheXpertDataset(TRAIN_CSV, IMG_ROOT, transform)
test_ds = CheXpertDataset(TEST_CSV, IMG_ROOT, transform)

# Hold out the last 5% (after rounding the train share down) for validation.
train_size = int(0.95 * len(train_ds))
val_size = len(train_ds) - train_size
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_subset, val_subset = random_split(
    train_ds, [train_size, val_size], generator=split_generator
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_subset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_subset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [None]:
# -----------------------
# Training Loop
# -----------------------
# Train and validate for EPOCHS epochs inside a single MLflow run; the
# Prometheus Summary observes the duration of the whole block.
with training_duration.time():
    with mlflow.start_run():

        # 🔹 Start Prometheus server only ONCE (uncomment if needed)
        # start_http_server(8004)

        # 🔹 Log hyperparameters
        mlflow.log_param("batch_size", BATCH_SIZE)
        mlflow.log_param("epochs", EPOCHS)
        mlflow.log_param("learning_rate", LEARNING_RATE)

        model = CheXpertModel(NUM_CLASSES).to(DEVICE)
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
        # The model emits raw scores, so the sigmoid lives inside the loss.
        criterion = nn.BCEWithLogitsLoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2)

        best_val_loss = float("inf")
        best_model_path = "best_model.pth"

        for epoch in range(EPOCHS):
            # ---- Training Phase ----
            model.train()
            total_loss = 0

            for imgs, labels in train_loader:
                imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                optimizer.zero_grad()
                outputs = model(imgs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            # Mean of the per-batch losses for this epoch.
            avg_train_loss = total_loss / len(train_loader)
            print(f"[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f}")
            mlflow.log_metric("train_loss", avg_train_loss, step=epoch)
            train_loss_metric.set(avg_train_loss)

            # ---- Validation Phase ----
            model.eval()
            val_loss = 0
            correct = 0
            total = 0

            with torch.no_grad():
                for imgs, labels in val_loader:
                    imgs, labels = imgs.to(DEVICE), labels.to(DEVICE)
                    outputs = model(imgs)
                    loss = criterion(outputs, labels)
                    val_loss += loss.item()

                    # Element-wise accuracy over every (sample, label) pair
                    # at a 0.5 probability threshold.
                    preds = torch.sigmoid(outputs) >= 0.5
                    correct += (preds == labels).sum().item()
                    total += preds.numel()

            avg_val_loss = val_loss / len(val_loader)
            val_accuracy = correct / total
            print(f"[Epoch {epoch+1}] Val Loss: {avg_val_loss:.4f}, Val Acc: {val_accuracy:.4f}")

            mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
            mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)
            val_loss_metric.set(avg_val_loss)

            # Drive the plateau scheduler with the validation loss, after
            # validation has run, so the learning rate drops when held-out
            # performance stalls rather than when the training loss does.
            scheduler.step(avg_val_loss)

            # Save best model
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), best_model_path)
                print(f"✅ Saved best model at epoch {epoch+1} (Val Loss: {avg_val_loss:.4f}, Acc: {val_accuracy:.4f})")

        # 🔹 Log final artifacts
        # The .pth artifact holds the best-validation weights; "final_model"
        # is the model object as it stands after the last epoch.
        mlflow.log_artifact(best_model_path)
        mlflow.pytorch.log_model(model, "final_model")

In [None]:
import mlflow

# Show which tracking server the MLflow client is currently pointed at.
active_tracking_uri = mlflow.get_tracking_uri()
print("Tracking URI:", active_tracking_uri)


In [None]:
# -----------------------
# Evaluation
# -----------------------
# Restore the checkpoint written by the training loop. map_location keeps the
# load working when the checkpoint was saved on a different device than the
# one currently selected in DEVICE.
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
model.eval()

all_preds = []
all_labels = []

# Collect per-label probabilities and ground truth for the whole test set.
with torch.no_grad():
    for imgs, labels in test_loader:
        imgs = imgs.to(DEVICE)
        outputs = torch.sigmoid(model(imgs)).cpu()
        all_preds.append(outputs)
        all_labels.append(labels)

all_preds = torch.cat(all_preds).numpy()
all_labels = torch.cat(all_labels).numpy()

# Convert probabilities to binary predictions (threshold = 0.5)
preds_binary = (all_preds >= 0.5).astype(int)

# Element-wise accuracy: the share of all (image, label) entries predicted
# correctly. This pools every entry together rather than averaging
# per-label accuracies.
correct = (preds_binary == all_labels).sum()
total = preds_binary.size
accuracy = correct / total

print(f"\n✅ Test Accuracy (avg over all labels): {accuracy:.4f}")

# -----------------------
# Display GT & Predictions as Disease Names
# -----------------------
def decode_labels(binary_labels):
    """Return the LABELS names at every position where ``binary_labels`` is 1."""
    positives = []
    for position, flag in enumerate(binary_labels):
        if flag == 1:
            positives.append(LABELS[position])
    return positives

print("\n📊 Ground Truth vs Predictions (first 6 images):")
# Show at most six samples; fewer when the test set is smaller.
num_shown = min(6, len(all_preds))
for i in range(num_shown):
    # An empty decoded list is displayed as ["No Finding"].
    gt_diseases = decode_labels(all_labels[i].astype(int)) or ["No Finding"]
    pred_diseases = decode_labels(preds_binary[i]) or ["No Finding"]

    print(f"\n🖼️ Image {i+1}:")
    print("🔹 Ground Truth Labels: ", gt_diseases)
    print("🔸 Predicted Labels:    ", pred_diseases)


In [None]:
# End the MLflow run that is currently active in this kernel, if any.
# NOTE(review): the training cell opens its run with `with mlflow.start_run():`,
# which should already close it -- this looks like a safety net; confirm.
mlflow.end_run()
