
# Bones Baseline: E2E Demo (Real X-rays)

This notebook demonstrates an end‑to‑end workflow:

1. **(Optional)** Quick EDA on exported CSVs (if you already ran `datasetqa-export`).
2. **Image baseline**: a small logistic-regression classifier trained on downsampled real images (`broken_bone` vs `non_broken`).

> **Data location**: `examples/bones_real/` is created by `datasetqa-fetch-bones`.


In [None]:

# Install minimal dependencies (safe to run multiple times)
import sys, subprocess, pkgutil
def pip_install(pkg, import_name=None):
    """Install ``pkg`` with pip unless the module it provides is importable.

    Parameters
    ----------
    pkg : str
        Distribution name handed to ``pip install`` (e.g. ``"opencv-python"``).
    import_name : str, optional
        Top-level module used to decide whether the package is already
        present. When omitted, a small table of known mismatches is consulted
        and otherwise ``pkg`` with ``-`` replaced by ``_`` is used.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    # Function-scope import keeps this definition self-contained.
    import importlib.util

    # Distribution names that differ from the module they install. Comparing
    # the pip name against importable modules would never match for these,
    # so pip would be re-run on every execution of the cell.
    known_modules = {"opencv-python": "cv2", "scikit-learn": "sklearn"}
    if import_name is None:
        import_name = known_modules.get(pkg, pkg.replace("-", "_"))

    try:
        already_installed = importlib.util.find_spec(import_name) is not None
    except (ImportError, ValueError):
        # find_spec can raise for dotted names with a missing parent or for
        # modules without a spec; treat both as "not installed".
        already_installed = False

    if not already_installed:
        subprocess.check_call([sys.executable, "-m", "pip", "install", pkg])

# Ensure each third-party package used by this notebook is available,
# in the same order as before.
for requirement in ("opencv-python", "matplotlib", "scikit-learn", "pandas"):
    pip_install(requirement)


In [None]:

from pathlib import Path
import os

# Project-root-relative paths.
# Notebook kernels do not define `__file__`, so fall back to the current
# working directory when this code is not running as a script.
try:
    _start_dir = Path(__file__).resolve().parent
except NameError:
    # Assumes the kernel was started in the notebook's folder or elsewhere
    # inside the project — TODO confirm for your launcher.
    _start_dir = Path.cwd().resolve()

# Prefer the original convention (project root is the parent of the folder
# holding this file), then the folder itself, then any higher ancestor that
# contains the fetched dataset. Defaults to the original convention.
_root_candidates = (_start_dir.parent, _start_dir, *_start_dir.parents)
PROJECT_ROOT = next(
    (d for d in _root_candidates if (d / "examples" / "bones_real").is_dir()),
    _start_dir.parent,
)
DATA_ROOT = PROJECT_ROOT / "examples" / "bones_real"
BB_DIR = DATA_ROOT / "broken_bone"
NB_DIR = DATA_ROOT / "non_broken"

print("Project root:", PROJECT_ROOT)
print("Data root   :", DATA_ROOT)

# Stop early with instructions if no .jpg exists anywhere under the data root.
if not DATA_ROOT.exists() or not any(DATA_ROOT.glob("**/*.jpg")):
    raise SystemExit(
        "\nData not found. From the project root, run:\n"
        "  pip install datasets huggingface_hub\n"
        "  datasetqa-fetch-bones --out-dir ./examples/bones_real --max 60\n"
    )


## (Optional) EDA from exported CSVs

In [None]:

import pandas as pd

# Locations where this cell looks for the two optional CSV exports.
bb_csv = PROJECT_ROOT.joinpath("bones_real_bb.csv")
nb_csv = PROJECT_ROOT.joinpath("bones_real_nb.csv")

# First export: show the top three rows, or note that the file is absent.
if not bb_csv.exists():
    print("No bones_real_bb.csv found — skip if you didn’t export yet.")
else:
    bb = pd.read_csv(bb_csv)
    display(bb.head(n=3))

# Second export: same treatment.
if not nb_csv.exists():
    print("No bones_real_nb.csv found — skip if you didn’t export yet.")
else:
    nb = pd.read_csv(nb_csv)
    display(nb.head(n=3))


## Image Baseline (Logistic Regression on downsampled images)

In [None]:

import cv2, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Collect images + labels directly from folders
IMG_SIZE = 128
CLASS_NAMES = ["non_broken", "broken_bone"]  # list index == integer label
X, y, paths = [], [], []
skipped = 0

for label_name, folder in (("broken_bone", BB_DIR), ("non_broken", NB_DIR)):
    label = 1 if label_name == "broken_bone" else 0
    # glob() order depends on the filesystem; sorting keeps the seeded
    # train/test split below identical from run to run.
    for p in sorted(folder.glob("*.jpg")):
        img = cv2.imread(str(p), cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None when it cannot decode the file.
            skipped += 1
            continue
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE), interpolation=cv2.INTER_AREA)
        X.append(img.flatten())
        y.append(label)
        paths.append(p.name)

if skipped:
    print(f"Skipped {skipped} unreadable image(s).")

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.int64)
class_counts = np.bincount(y, minlength=2)
print("Dataset:", X.shape, "labels:", class_counts)

# A stratified split needs at least two samples of every class; fail here
# with an actionable message instead of deep inside scikit-learn.
if class_counts.min() < 2:
    raise ValueError(
        "Need at least two readable .jpg images in each of "
        f"{BB_DIR} and {NB_DIR}; found counts {class_counts.tolist()} "
        "(non_broken, broken_bone)."
    )

# Train/test split (file names travel along so predictions can be traced back)
X_train, X_test, y_train, y_test, p_train, p_test = train_test_split(
    X, y, paths, test_size=0.25, random_state=42, stratify=y
)

# Scale features: fit on the training split only, then apply to the test split
scaler = StandardScaler(with_mean=True)
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Train a simple logistic regression
clf = LogisticRegression(max_iter=200)
clf.fit(X_train_s, y_train)

# Evaluate. `labels` pins the row/column order so it always matches CLASS_NAMES.
y_pred = clf.predict(X_test_s)
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=CLASS_NAMES))

# Draw on an explicit Axes: without `ax`, from_predictions opens its own
# figure and the 4x4 figure created here would be left blank.
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred, labels=[0, 1], display_labels=CLASS_NAMES, ax=ax
)
fig.tight_layout()
plt.show()


### Visualize a few predictions

In [None]:

# Show a few random test samples with predictions
import random

# A local, seeded generator keeps the preview reproducible across
# Restart & Run All without touching the global `random` state.
_preview_rng = random.Random(42)
indices = _preview_rng.sample(range(len(y_test)), k=min(9, len(y_test)))

fig, axes = plt.subplots(3, 3, figsize=(8, 8))
# Hide every panel's axes first so unused cells stay blank when the test
# split has fewer than nine samples.
for ax in axes.flat:
    ax.axis("off")

for ax, idx in zip(axes.flat, indices):
    # X_test holds the unscaled flattened pixels; restore the 2-D image.
    img = X_test[idx].reshape(IMG_SIZE, IMG_SIZE)
    pred = y_pred[idx]
    true = y_test[idx]
    name = p_test[idx]
    ax.imshow(img, cmap="gray")
    ax.set_title(f"pred: {'broken' if pred==1 else 'non'} | true: {'broken' if true==1 else 'non'}\n{name}")

fig.tight_layout()
plt.show()
