In [2]:
!pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl.metadata (17 kB)
Downloading lightgbm-4.6.0-py3-none-macosx_12_0_arm64.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m58.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:05[0m
[?25hInstalling collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [3]:
# ===============================
# 1. Imports
# ===============================
import os
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import mlflow
import mlflow.lightgbm



In [4]:
# ===============================
# 2. Constants & Config
# ===============================
data_dir = "chest_xray/chest_xray/"  # dataset path
labels = ["PNEUMONIA", "NORMAL"]  # position in this list is used as the integer class id
img_size = 64    # images are resized to img_size x img_size pixels
batch_size = 32  # DataLoader batch size
seed = 42        # seed for torch's global random number generator

# The training DataLoader shuffles and the training transforms are random, so
# without a fixed seed the extracted training features change on every run.
# NOTE(review): assumes the installed torchvision draws its randomness from
# torch's global RNG (older releases used Python's `random`) - confirm.
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [5]:
# ===============================
# 3. Dataset Preparation
# ===============================
def get_data(data_dir):
    """Load the images of one dataset split as resized grayscale arrays.

    For every class name in the module-level ``labels`` list, the files in
    ``data_dir/<class name>`` are read in grayscale, resized to
    ``img_size x img_size`` and paired with the class index (the position of
    the class name in ``labels``).

    Files are visited in sorted name order so the sample order is the same on
    every run; ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        data_dir: Directory that contains one sub-directory per class name.

    Returns:
        An object-dtype NumPy array with one ``[image, class_index]`` entry
        per successfully read file.

    Raises:
        OSError: If a class sub-directory cannot be listed.
    """
    data = []
    for class_num, label in enumerate(labels):
        path = os.path.join(data_dir, label)
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)
            try:
                img_arr = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img_arr is None:
                    # Nothing could be decoded from this file; skip it.
                    continue
                resized_arr = cv2.resize(img_arr, (img_size, img_size))
                data.append([resized_arr, class_num])
            except Exception as e:
                # Best effort: report the offending file and keep loading.
                print(f"Error loading image {img_path}:", e)
    return np.array(data, dtype=object)

# Resolve the three split directories below the dataset root, then load each
# split in the order train -> val -> test.
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ("train", "val", "test")
)

train_data, val_data, test_data = (
    get_data(split_dir) for split_dir in (train_dir, val_dir, test_dir)
)

# Report how many samples each split holds.
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")


Training data size: 5216
Validation data size: 16
Test data size: 624


In [6]:
# ===============================
# 4. Torch Dataset & DataLoader
# ===============================
class ChestXrayDataset(Dataset):
    """Map-style dataset over a sequence of ``(image, label)`` pairs.

    Each image is converted to a ``uint8`` array; 2-D images get a trailing
    channel axis. When a transform is supplied it is applied to that array.
    """

    def __init__(self, data, transform=None):
        """Keep the indexable ``data`` collection and the optional ``transform`` callable."""
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of samples in ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at position ``idx``."""
        raw_image, target = self.data[idx]
        pixels = np.array(raw_image, dtype=np.uint8)

        # A 2-D (H, W) image becomes (H, W, 1).
        if pixels.ndim == 2:
            pixels = pixels[..., np.newaxis]

        if not self.transform:
            return pixels, target
        return self.transform(pixels), target

# Transforms
# Random augmentations applied to training images only.
_train_augmentations = [
    transforms.RandomRotation(30, fill=0),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
]

# Training pipeline: array -> PIL image -> augmentations -> tensor -> (x - 0.5) / 0.5.
train_transforms = transforms.Compose(
    [transforms.ToPILImage()]
    + _train_augmentations
    + [transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5])]
)

# Evaluation pipeline: no random steps, only resize, tensor conversion and the
# same normalization as the training pipeline.
val_transforms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Wrap each split in a dataset; only the training split gets the random
# augmentation pipeline, validation and test share the deterministic one.
# NOTE(review): features are later extracted in a single pass over these
# loaders, so the training features are one random augmentation of each image
# in shuffled order - confirm that this is intended for a tree-based model.
train_dataset, val_dataset, test_dataset = (
    ChestXrayDataset(train_data, transform=train_transforms),
    ChestXrayDataset(val_data, transform=val_transforms),
    ChestXrayDataset(test_data, transform=val_transforms),
)

# Batch loaders: training batches are shuffled, evaluation batches keep the
# dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)



In [7]:
# ===============================
# 5. Feature Extraction for LightGBM
# ===============================
def extract_features(loader):
    """Flatten every batch of a loader into a 2-D feature matrix.

    Args:
        loader: Iterable that yields ``(images, labels)`` tensor batches.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays: the features hold one
        flattened row per image, the labels are the horizontally stacked
        label batches.
    """
    flat_batches = []
    label_batches = []
    progress = tqdm(loader, desc="Extracting features")
    for batch_images, batch_labels in progress:
        # Collapse everything after the batch dimension into one axis.
        n_in_batch = batch_images.size(0)
        flattened = batch_images.view(n_in_batch, -1)
        flat_batches.append(flattened.cpu().numpy())
        label_batches.append(batch_labels.cpu().numpy())
    stacked_features = np.vstack(flat_batches)
    stacked_labels = np.hstack(label_batches)
    return stacked_features, stacked_labels

# Run the extraction once per split, in the order train -> val -> test.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    extract_features(split_loader)
    for split_loader in (train_loader, val_loader, test_loader)
)

# Show the resulting feature-matrix shapes.
print("Train features:", X_train.shape)
print("Validation features:", X_val.shape)
print("Test features:", X_test.shape)


Extracting features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 163/163 [00:01<00:00, 124.09it/s]
Extracting features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 442.48it/s]
Extracting features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 492.31it/s]

Train features: (5216, 4096)
Validation features: (16, 4096)
Test features: (624, 4096)





In [10]:
from lightgbm import early_stopping, log_evaluation

# ===============================
# 6. Train LightGBM Model
# ===============================
# Hyperparameters of the gradient-boosted binary classifier.
lgb_hyperparams = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    n_estimators=500,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
)
lgb_model = lgb.LGBMClassifier(**lgb_hyperparams)

# Stop once the validation log-loss has not improved for 50 rounds and print
# the evaluation result every 10 rounds.
# NOTE(review): the recorded run reports a validation split of only 16
# samples, so the early-stopping signal is probably noisy - confirm whether a
# larger hold-out set should be used.
training_callbacks = [
    early_stopping(stopping_rounds=50),
    log_evaluation(period=10),
]

lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='binary_logloss',
    callbacks=training_callbacks,
)


[LightGBM] [Info] Number of positive: 1341, number of negative: 3875
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.139486 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 822683
[LightGBM] [Info] Number of data points in the train set: 5216, number of used features: 4096
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 50 rounds
[10]	valid_0's binary_logloss: 0.652997
[20]	valid_0's binary_logloss: 0.626333
[30]	valid_0's binary_logloss: 0.61703
[40]	valid_0's binary_logloss: 0.623258
[50]	valid_0's binary_logloss: 0.647262
[60]	valid_0's binary_logloss: 0.686006
[70]	valid_0's binary_logloss: 0.697545
Early stopping, best iteration is:
[26]	valid_0's binary_logloss: 0.614134


In [11]:
# ===============================
# 7. Evaluation
# ===============================
def _print_scores(header, y_true, y_pred):
    """Print the accuracy after ``header``, followed by the per-class report."""
    print(header, accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, target_names=labels))


# Validation split
y_val_pred = lgb_model.predict(X_val)
_print_scores("Validation Accuracy:", y_val, y_val_pred)

# Test split, including the confusion matrix
y_test_pred = lgb_model.predict(X_test)
_print_scores("Test Accuracy:", y_test, y_test_pred)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.625
              precision    recall  f1-score   support

   PNEUMONIA       0.58      0.88      0.70         8
      NORMAL       0.75      0.38      0.50         8

    accuracy                           0.62        16
   macro avg       0.67      0.62      0.60        16
weighted avg       0.67      0.62      0.60        16

Test Accuracy: 0.7980769230769231
              precision    recall  f1-score   support

   PNEUMONIA       0.87      0.79      0.83       390
      NORMAL       0.70      0.80      0.75       234

    accuracy                           0.80       624
   macro avg       0.79      0.80      0.79       624
weighted avg       0.81      0.80      0.80       624

Confusion Matrix:
 [[310  80]
 [ 46 188]]


In [12]:
# ===============================
# 8. MLflow Logging
# ===============================
# Test-set metrics as a nested dict: one entry per class plus the aggregate
# entries "accuracy", "macro avg" and "weighted avg".
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# The tracking server can be overridden with the MLFLOW_TRACKING_URI
# environment variable; the local server stays the default.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8000/"))
mlflow.set_experiment("LightGBM_Pneumonia_Detection")

# Read the hyperparameters from the estimator itself so the logged values
# cannot drift from the ones the model was actually built with.
model_params = lgb_model.get_params()
params = {
    name: model_params[name]
    for name in (
        "boosting_type",
        "n_estimators",
        "learning_rate",
        "class_weight",
        "objective",
        "random_state",
    )
}

with mlflow.start_run():
    mlflow.log_params(params)

    # Aggregate test metrics.
    mlflow.log_metric("accuracy", report_dict["accuracy"])
    mlflow.log_metric("f1_macro", report_dict["macro avg"]["f1-score"])
    mlflow.log_metric("precision_macro", report_dict["macro avg"]["precision"])
    mlflow.log_metric("recall_macro", report_dict["macro avg"]["recall"])

    # n_estimators is only the upper bound when early stopping is used, so
    # also record the boosting round that was actually selected.
    best_iteration = lgb_model.best_iteration_
    if best_iteration is not None:
        mlflow.log_metric("best_iteration", best_iteration)

    # Per-class metrics; the aggregate entries were logged above.
    for cls, metrics in report_dict.items():
        if cls not in ["accuracy", "macro avg", "weighted avg"]:
            mlflow.log_metric(f"precision_class_{cls}", metrics["precision"])
            mlflow.log_metric(f"recall_class_{cls}", metrics["recall"])
            mlflow.log_metric(f"f1_class_{cls}", metrics["f1-score"])

    # Store the fitted model as a run artifact.
    mlflow.lightgbm.log_model(lgb_model, "LightGBM_model")

print("✅ MLflow logging complete for LightGBM")


2025/09/23 20:03:06 INFO mlflow.tracking.fluent: Experiment with name 'LightGBM_Pneumonia_Detection' does not exist. Creating a new experiment.


🏃 View run thoughtful-perch-337 at: http://127.0.0.1:8000/#/experiments/675635134002793963/runs/a362e9a790b84a9ea82c85d8ff2f9bfc
🧪 View experiment at: http://127.0.0.1:8000/#/experiments/675635134002793963
✅ MLflow logging complete for LightGBM
