In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [5]:
# ===============================
# 1. Imports
# ===============================
import os
import cv2
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import mlflow
import mlflow.sklearn


In [6]:
# ===============================
# 2. Constants & Config
# ===============================
data_dir = "chest_xray/chest_xray/"  # dataset root; train/, val/ and test/ are joined onto it later
labels = ["PNEUMONIA", "NORMAL"]     # class names; a name's list position is its integer class id
img_size = 64                        # images are resized to img_size x img_size pixels
batch_size = 32
seed = 42                            # fixed seed so a fresh Restart & Run All repeats the same run

# The notebook uses a shuffled DataLoader and random torchvision augmentations.
# Seeding the global NumPy and torch RNGs up front keeps the extracted features,
# and therefore the fitted model and its metrics, repeatable between runs.
np.random.seed(seed)
torch.manual_seed(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [7]:
# ===============================
# 3. Dataset Preparation
# ===============================
def get_data(data_dir, class_names=None, size=None):
    """Load the images under ``data_dir/<class name>/`` as resized grayscale arrays.

    Args:
        data_dir: Directory holding one sub-directory per class name.
        class_names: Sub-directory names to read. A name's position in the
            sequence becomes its integer class id. Defaults to the
            module-level ``labels``.
        size: Side length, in pixels, of the square output images. Defaults
            to the module-level ``img_size``.

    Returns:
        An object-dtype NumPy array built from the list of
        ``[resized_image, class_id]`` pairs (empty when nothing was loaded).
    """
    class_names = labels if class_names is None else class_names
    size = img_size if size is None else size

    data = []
    for class_num, label in enumerate(class_names):
        path = os.path.join(data_dir, label)
        skipped = 0
        # os.listdir gives no ordering guarantee; sorting keeps the sample
        # order identical from run to run and machine to machine.
        for img in sorted(os.listdir(path)):
            img_path = os.path.join(path, img)
            try:
                img_arr = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                if img_arr is None:
                    # Nothing could be decoded from this entry (e.g. a stray
                    # non-image file); count it instead of dropping it silently.
                    skipped += 1
                    continue
                resized_arr = cv2.resize(img_arr, (size, size))
                data.append([resized_arr, class_num])
            except Exception as e:
                # Best effort: one bad file must not abort the whole load.
                print("Error loading image:", e)
        if skipped:
            print(f"Skipped {skipped} unreadable file(s) in {path}")
    return np.array(data, dtype=object)


# Resolve the three split folders under the dataset root, then load each one
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ("train", "val", "test")
)

# map() is consumed in order by the unpacking: train first, then val, then test
train_data, val_data, test_data = map(get_data, (train_dir, val_dir, test_dir))

# Report how many images were loaded per split
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")


Training data size: 5216
Validation data size: 16
Test data size: 624


In [8]:
# ===============================
# 4. Torch Dataset & DataLoader
# ===============================
class ChestXrayDataset(Dataset):
    """Map-style dataset over a sequence of ``(image, label)`` records.

    Each image is converted to a ``uint8`` array and, when it is 2-D, given a
    trailing channel axis before the optional ``transform`` is applied.
    """

    def __init__(self, data, transform=None):
        """Keep references to the records and the optional per-image transform."""
        self.transform = transform
        self.data = data

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for record ``idx``; the image is transformed if a transform is set."""
        pixels, target = self.data[idx]
        pixels = np.array(pixels, dtype=np.uint8)

        # A 2-D (H, W) image gets an explicit channel axis -> (H, W, 1)
        if pixels.ndim == 2:
            pixels = pixels[..., np.newaxis]

        return (self.transform(pixels) if self.transform else pixels), target


# Transforms
# Random augmentations applied to training images only.
# NOTE(review): these are stochastic, so every pass over the training loader
# yields different pixels -- confirm that is intended for a model that is fit
# on a single pass of extracted features.
augmentation_steps = [
    transforms.RandomRotation(30, fill=0),
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(img_size, scale=(0.8, 1.0)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
]

train_transforms = transforms.Compose(
    [transforms.ToPILImage()]
    + augmentation_steps
    + [transforms.ToTensor(), transforms.Normalize(mean=[0.5], std=[0.5])]
)

# Deterministic pipeline shared by the validation and test splits
val_transforms = transforms.Compose(
    [
        transforms.ToPILImage(),
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# Create datasets: only the training split gets the augmenting pipeline
train_dataset, val_dataset, test_dataset = (
    ChestXrayDataset(split, transform=pipeline)
    for split, pipeline in (
        (train_data, train_transforms),
        (val_data, val_transforms),
        (test_data, val_transforms),
    )
)

# DataLoaders: shuffle the training split, keep evaluation splits in order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=batch_size)
    for eval_dataset in (val_dataset, test_dataset)
)



In [9]:
# ===============================
# 5. Feature Extraction for SVM
# ===============================
def extract_features(loader):
    """Flatten every batch from ``loader`` into one 2-D NumPy feature matrix.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches of tensors,
            where the first axis of ``images`` is the batch axis.

    Returns:
        ``(features, labels)``: ``features`` stacks the flattened batches
        row-wise (one row per sample); ``labels`` concatenates the label
        batches in the same order.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    features, labels_out = [], []
    for images, lbls in tqdm(loader, desc="Extracting features"):
        # Collapse everything after the batch axis: [B, ...] -> [B, prod(...)].
        # reshape() copies when needed, whereas view() raises on
        # non-contiguous tensors (e.g. transposed or permuted batches).
        images = images.reshape(images.size(0), -1)
        features.append(images.cpu().numpy())
        labels_out.append(lbls.cpu().numpy())

    if not features:
        # np.vstack([]) would fail with a message that hides the real cause.
        raise ValueError("extract_features: loader yielded no batches")

    return np.vstack(features), np.hstack(labels_out)


# Run the extractor over each loader in turn (train, then val, then test)
(X_train, y_train), (X_val, y_val), (X_test, y_test) = map(
    extract_features, (train_loader, val_loader, test_loader)
)

# Show the (samples, features) shape of every split
for caption, feature_matrix in (
    ("Train features:", X_train),
    ("Validation features:", X_val),
    ("Test features:", X_test),
):
    print(caption, feature_matrix.shape)



Extracting features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 163/163 [00:01<00:00, 122.24it/s]
Extracting features: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 311.73it/s]
Extracting features: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 464.59it/s]

Train features: (5216, 4096)
Validation features: (16, 4096)
Test features: (624, 4096)





In [10]:
# ===============================
# 6. Train SVM
# ===============================
# Hyperparameters for the SVM classifier, kept together in one place
svm_hyperparams = dict(kernel="rbf", C=1, gamma="scale", class_weight="balanced")

svm_model = SVC(**svm_hyperparams)
# Fit on the flattened training features; left as the last expression so the
# notebook still displays the fitted estimator
svm_model.fit(X_train, y_train)


In [11]:
# ===============================
# 7. Evaluation
# ===============================
# Score the fitted SVM on each held-out split: accuracy first, then the per-class report
split_predictions = []
for heading, X_split, y_split in (
    ("Validation Accuracy:", X_val, y_val),
    ("Test Accuracy:", X_test, y_test),
):
    y_split_pred = svm_model.predict(X_split)
    print(heading, accuracy_score(y_split, y_split_pred))
    print(classification_report(y_split, y_split_pred, target_names=labels))
    split_predictions.append(y_split_pred)

# Keep the per-split predictions under their usual names for later cells
y_val_pred, y_test_pred = split_predictions

# The confusion matrix is reported for the test split only
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))


Validation Accuracy: 0.75
              precision    recall  f1-score   support

   PNEUMONIA       0.75      0.75      0.75         8
      NORMAL       0.75      0.75      0.75         8

    accuracy                           0.75        16
   macro avg       0.75      0.75      0.75        16
weighted avg       0.75      0.75      0.75        16

Test Accuracy: 0.8269230769230769
              precision    recall  f1-score   support

   PNEUMONIA       0.91      0.80      0.85       390
      NORMAL       0.72      0.87      0.79       234

    accuracy                           0.83       624
   macro avg       0.82      0.84      0.82       624
weighted avg       0.84      0.83      0.83       624

Confusion Matrix:
 [[312  78]
 [ 30 204]]


In [15]:
# ===============================
# 8. MLflow Logging
# ===============================
# Test-set report as a nested dict: one entry per class plus the
# "accuracy", "macro avg" and "weighted avg" summaries.
report_dict = classification_report(y_test, y_test_pred, output_dict=True)

# Use MLFLOW_TRACKING_URI when it is set; otherwise fall back to the local
# server the notebook was developed against.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8000/"))
mlflow.set_experiment("SVM_Pneumonia_Detection")

# Read the hyperparameters from the fitted estimator instead of re-typing
# them, so the logged values cannot drift from what was actually trained.
fitted_params = svm_model.get_params()
params = {name: fitted_params[name] for name in ("kernel", "C", "gamma", "class_weight")}

# Entries of report_dict that are summaries rather than individual classes
aggregate_keys = {"accuracy", "macro avg", "weighted avg"}

with mlflow.start_run():
    mlflow.log_params(params)

    # Log overall and macro-averaged metrics
    mlflow.log_metric("accuracy", report_dict["accuracy"])
    macro_avg = report_dict["macro avg"]
    mlflow.log_metric("f1_macro", macro_avg["f1-score"])
    mlflow.log_metric("precision_macro", macro_avg["precision"])
    mlflow.log_metric("recall_macro", macro_avg["recall"])

    # Per-class metrics. No target_names is passed above, so the class keys
    # are presumably the stringified class ids ("0", "1") -- confirm in the UI.
    for cls, metrics in report_dict.items():
        if cls in aggregate_keys:
            continue
        mlflow.log_metric(f"precision_class_{cls}", metrics["precision"])
        mlflow.log_metric(f"recall_class_{cls}", metrics["recall"])
        mlflow.log_metric(f"f1_class_{cls}", metrics["f1-score"])

    # Log the trained SVM model
    mlflow.sklearn.log_model(svm_model, "SVM_model")

print("✅ MLflow logging complete for SVM")

2025/09/23 11:13:52 INFO mlflow.tracking.fluent: Experiment with name 'SVM_Pneumonia_Detection' does not exist. Creating a new experiment.


🏃 View run big-loon-681 at: http://127.0.0.1:8000/#/experiments/772446650389602807/runs/621a15928ced43c7b1213d5c3be8c83f
🧪 View experiment at: http://127.0.0.1:8000/#/experiments/772446650389602807
✅ MLflow logging complete for SVM
