Imports for the program to work, no need to include in the methods as this is secondary.

In [2]:
import pandas as pd
import numpy as np
import re
from pathlib import Path
from typing import Tuple, Dict, Any

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

Config for model training, again, safe to ignore for your method needs

In [3]:

XLSX_PATH = "PCOS_data_without_infertility.xlsx"   # or the full path if needed
SHEET_NAME = "Full_new"                            # your data sheet
TARGET_COL = "PCOS (Y/N)"                          # label column

# columns that are clearly not useful as features
DROP_COLS = ["Sl. No", "Patient File No.", "Unnamed: 44"]


Helper functions, not needed in methods as its for easier implementation of training later

In [4]:
# blood Group mapping as specified by the excel sheet
BLOOD_GROUP_MAP = {
    "A+": 11, "A-": 12, "B+": 13, "B-": 14, "O+": 15, "O-": 16, "AB+": 17, "AB-": 18,
}

def normalize_yes_no(series: pd.Series) -> pd.Series:
    """Convert Yes/No variants to {1,0}; keep NaN for blanks."""
    def to01(v):
        if pd.isna(v): 
            return np.nan
        s = str(v).strip().lower()
        if s in {"yes", "y", "1", "true"}:  return 1
        if s in {"no", "n", "0", "false"}:  return 0
        # tolerate numbers given as strings
        try:
            f = float(s)
            if f in (0.0, 1.0): 
                return int(f)
        except:
            pass
        return np.nan
    return series.map(to01)

def choose_splits(n_rows: int) -> Tuple[float, float, float]:
    """Return (train, val, test) fractions based on dataset size."""
    if n_rows < 1000:
        return 0.70, 0.15, 0.15
    elif n_rows < 5000:
        return 0.75, 0.15, 0.10
    else:
        return 0.80, 0.10, 0.10


Basic data overview, once again, not needed in method description

In [5]:
df_raw = pd.read_excel(XLSX_PATH, sheet_name=SHEET_NAME)
df = df_raw.copy()

#drop the cols we dont need, specified earlier
for c in DROP_COLS:
    if c in df.columns:
        df.drop(columns=c, inplace=True)

# strip column names of stray spaces
df.columns = [c.strip() for c in df.columns]

#map the blood group numerical values to their actual vals (double checking lang to be sure)
if "Blood Group" in df.columns and df["Blood Group"].dtype == object:
    df["Blood Group"] = df["Blood Group"].map(lambda x: BLOOD_GROUP_MAP.get(str(x).strip(), np.nan))

# normalize all (Y/N) columns
yn_cols = [c for c in df.columns if "(Y/N)" in c or re.search(r"\bY/?N\b", c, re.I)]
for c in yn_cols:
    df[c] = normalize_yes_no(df[c]).astype("float")  # float for imputation; cast to int later

# Coerce other object columns to numeric when possible
for c in df.columns:
    if df[c].dtype == object and c not in yn_cols:
        df[c] = pd.to_numeric(df[c].replace({"NaN": np.nan, "nan": np.nan, "": np.nan, "—": np.nan}), errors="ignore")

# Quick peek
display(df.head())
print("Rows, Cols:", df.shape)
print("Nulls: ", df.isna().sum().sum())

  df[c] = pd.to_numeric(df[c].replace({"NaN": np.nan, "nan": np.nan, "": np.nan, "—": np.nan}), errors="ignore")


Unnamed: 0,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),Hb(g/dl),Cycle(R/I),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,0.0,28,44.6,152.0,19.3,15,78,22,10.48,2,...,0.0,1.0,0.0,110,80,3,3,18.0,18.0,8.5
1,0.0,36,65.0,161.5,24.921163,15,74,20,11.7,2,...,0.0,0.0,0.0,120,70,3,5,15.0,14.0,3.7
2,1.0,33,68.8,165.0,25.270891,11,72,18,11.8,2,...,1.0,1.0,0.0,120,80,13,15,18.0,20.0,10.0
3,0.0,37,65.0,148.0,29.674945,13,72,20,12.0,2,...,0.0,0.0,0.0,120,70,2,2,15.0,14.0,7.5
4,0.0,25,52.0,161.0,20.060954,11,72,18,10.0,2,...,0.0,0.0,0.0,120,80,3,4,16.0,14.0,7.0


Rows, Cols: (541, 42)
Nulls:  2


build the dataset to train on, create X, y impute and splits

In [6]:
assert TARGET_COL in df.columns, f"Target '{TARGET_COL}' not found; available: {list(df.columns)}"
y = df[TARGET_COL].astype(int)
X = df.drop(columns=[TARGET_COL])

# Keep numeric columns (this dataset is numeric after our mapping)
numeric_cols = [c for c in X.columns if pd.api.types.is_numeric_dtype(X[c])]
X = X[numeric_cols].copy()

# Median imputation (robust for clinical-style data)
imputer = SimpleImputer(strategy="median")
X[numeric_cols] = imputer.fit_transform(X[numeric_cols])

n_rows = len(X)
split_train, split_val, split_test = choose_splits(n_rows)

# 1) Test split
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=split_test, stratify=y, random_state=42
)

# 2) Train/Val split (from the remaining)
val_fraction_of_temp = split_val / (1.0 - split_test)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=val_fraction_of_temp, stratify=y_temp, random_state=42
)

print({
    "n_rows": n_rows,
    "split_fractions": {"train": split_train, "val": split_val, "test": split_test},
    "counts": {"train": len(X_train), "val": len(X_val), "test": len(X_test)}
})

# Class balance check
print("Class balance (train):")
print(y_train.value_counts(normalize=True).rename("pct"))
print("Class balance (val):")
print(y_val.value_counts(normalize=True).rename("pct"))
print("Class balance (test):")
print(y_test.value_counts(normalize=True).rename("pct"))

print("Class counts in train:")
print(y_train.value_counts())


{'n_rows': 541, 'split_fractions': {'train': 0.7, 'val': 0.15, 'test': 0.15}, 'counts': {'train': 378, 'val': 81, 'test': 82}}
Class balance (train):
PCOS (Y/N)
0    0.671958
1    0.328042
Name: pct, dtype: float64
Class balance (val):
PCOS (Y/N)
0    0.679012
1    0.320988
Name: pct, dtype: float64
Class balance (test):
PCOS (Y/N)
0    0.670732
1    0.329268
Name: pct, dtype: float64
Class counts in train:
PCOS (Y/N)
0    254
1    124
Name: count, dtype: int64


Some minor class imbalance

In [7]:
from xgboost import XGBClassifier

#set a scale factor given the class imbalance
neg, pos = np.bincount(y_train)
scale = pos/neg

model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.10,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=scale,
        tree_method="hist",
        eval_metric="auc",
        random_state=42,
)

model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [8]:

from sklearn.metrics import (
    average_precision_score, precision_recall_fscore_support,
    confusion_matrix, balanced_accuracy_score, matthews_corrcoef
)

val_scores = model.predict_proba(X_val)[:, 1]
test_scores = model.predict_proba(X_test)[:, 1]

def _eval_split_with_imbalance(y_true, scores, split_name):
    # Thresholded predictions at 0.5 (keep consistent with your other eval)
    pred = (scores >= 0.5).astype(int)

    # Class distribution
    n = len(y_true)
    pos = int(np.sum(y_true))
    neg = n - pos
    base_rate = pos / max(1, n)
    pred_pos = int(np.sum(pred))
    pred_pos_rate = pred_pos / max(1, n)

    # Threshold-independent metrics
    auc = roc_auc_score(y_true, scores)
    ap  = average_precision_score(y_true, scores)  # PR-AUC

    # Threshold-dependent metrics @0.5
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true, pred, average="binary", zero_division=0
    )
    tn, fp, fn, tp = confusion_matrix(y_true, pred).ravel()
    acc   = accuracy_score(y_true, pred)
    spec  = tn / (tn + fp) if (tn + fp) > 0 else np.nan  # specificity (TNR)
    bacc  = balanced_accuracy_score(y_true, pred)
    mcc   = matthews_corrcoef(y_true, pred) if len(np.unique(y_true)) == 2 else np.nan

    # Print block (keeps it readable in console/logs)
    print(f"\n[{split_name}] class distribution: "
          f"pos={pos} ({base_rate:.1%}), neg={neg} ({1-base_rate:.1%}), n={n}")
    print(f"[{split_name}] AUC={auc:.3f} | PR-AUC(AP)={ap:.3f} | "
          f"Acc@0.5={acc:.3f} | Prec@0.5={prec:.3f} | Rec@0.5={rec:.3f} | "
          f"F1@0.5={f1:.3f} | Spec@0.5={spec:.3f} | BalAcc@0.5={bacc:.3f} | MCC@0.5={mcc:.3f}")
    print(f"[{split_name}] Predicted positive rate @0.5: {pred_pos}/{n} ({pred_pos_rate:.1%})")
    print(f"[{split_name}] Confusion matrix @0.5 (rows=true, cols=pred):\n"
          f"[[TN FP]\n [FN TP]]\n{np.array([[tn, fp], [fn, tp]])}")

    return {
        "n": n, "pos": pos, "neg": neg,
        "base_rate": float(base_rate),
        "pred_pos_rate@0.5": float(pred_pos_rate),
        "AUC": float(auc), "AP": float(ap),
        "ACC@0.5": float(acc), "Precision@0.5": float(prec),
        "Recall@0.5": float(rec), "F1@0.5": float(f1),
        "Specificity@0.5": float(spec), "BalancedAcc@0.5": float(bacc),
        "MCC@0.5": float(mcc),
        "cm@0.5": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)},
    }

# Evaluate both splits with the extra metrics
val_summary  = _eval_split_with_imbalance(y_val,  val_scores,  "Validation")
test_summary = _eval_split_with_imbalance(y_test, test_scores, "Test")

# ---- Compact paper-friendly summary footer ----
def _pct(x): return f"{100*x:.1f}%"
imbalance_flag = "(imbalance noted)" if abs(val_summary["base_rate"] - 0.5) > 0.10 else "(near-balanced)"

print("\n=== Summary (tabular) ===")
print(f"Base rate (positives):  Validation {_pct(val_summary['base_rate'])} {imbalance_flag}; "
      f"Test {_pct(test_summary['base_rate'])}")
print(f"ROC-AUC:  val {val_summary['AUC']:.3f} | test {test_summary['AUC']:.3f}  ·  "
      f"PR-AUC(AP): val {val_summary['AP']:.3f} | test {test_summary['AP']:.3f}")
print(f"@0.5 — Validation:  Acc {val_summary['ACC@0.5']:.3f} | "
      f"Prec {val_summary['Precision@0.5']:.3f} | Rec {val_summary['Recall@0.5']:.3f} | "
      f"F1 {val_summary['F1@0.5']:.3f} | Spec {val_summary['Specificity@0.5']:.3f} | "
      f"BalAcc {val_summary['BalancedAcc@0.5']:.3f} | MCC {val_summary['MCC@0.5']:.3f}")
print(f"@0.5 — Test:        Acc {test_summary['ACC@0.5']:.3f} | "
      f"Prec {test_summary['Precision@0.5']:.3f} | Rec {test_summary['Recall@0.5']:.3f} | "
      f"F1 {test_summary['F1@0.5']:.3f} | Spec {test_summary['Specificity@0.5']:.3f} | "
      f"BalAcc {test_summary['BalancedAcc@0.5']:.3f} | MCC {test_summary['MCC@0.5']:.3f}")


[Validation] class distribution: pos=26 (32.1%), neg=55 (67.9%), n=81
[Validation] AUC=0.957 | PR-AUC(AP)=0.935 | Acc@0.5=0.914 | Prec@0.5=0.952 | Rec@0.5=0.769 | F1@0.5=0.851 | Spec@0.5=0.982 | BalAcc@0.5=0.876 | MCC@0.5=0.800
[Validation] Predicted positive rate @0.5: 21/81 (25.9%)
[Validation] Confusion matrix @0.5 (rows=true, cols=pred):
[[TN FP]
 [FN TP]]
[[54  1]
 [ 6 20]]

[Test] class distribution: pos=27 (32.9%), neg=55 (67.1%), n=82
[Test] AUC=0.936 | PR-AUC(AP)=0.922 | Acc@0.5=0.878 | Prec@0.5=0.905 | Rec@0.5=0.704 | F1@0.5=0.792 | Spec@0.5=0.964 | BalAcc@0.5=0.834 | MCC@0.5=0.719
[Test] Predicted positive rate @0.5: 21/82 (25.6%)
[Test] Confusion matrix @0.5 (rows=true, cols=pred):
[[TN FP]
 [FN TP]]
[[53  2]
 [ 8 19]]

=== Summary (tabular) ===
Base rate (positives):  Validation 32.1% (imbalance noted); Test 32.9%
ROC-AUC:  val 0.957 | test 0.936  ·  PR-AUC(AP): val 0.935 | test 0.922
@0.5 — Validation:  Acc 0.914 | Prec 0.952 | Rec 0.769 | F1 0.851 | Spec 0.982 | BalAcc 

In [9]:
import pandas as pd

# ---- Make tabular summary for paper/report ----
summary_df = pd.DataFrame([
    {
        "Split": "Validation",
        "Positives": f"{val_summary['pos']} / {val_summary['n']} ({100*val_summary['base_rate']:.1f}%)",
        "AUC": f"{val_summary['AUC']:.3f}",
        "PR-AUC": f"{val_summary['AP']:.3f}",
        "Accuracy": f"{val_summary['ACC@0.5']:.3f}",
        "Precision": f"{val_summary['Precision@0.5']:.3f}",
        "Recall": f"{val_summary['Recall@0.5']:.3f}",
        "F1-Score": f"{val_summary['F1@0.5']:.3f}",
        "Specificity": f"{val_summary['Specificity@0.5']:.3f}",
        "BalAcc": f"{val_summary['BalancedAcc@0.5']:.3f}",
        "MCC": f"{val_summary['MCC@0.5']:.3f}"
    },
    {
        "Split": "Test",
        "Positives": f"{test_summary['pos']} / {test_summary['n']} ({100*test_summary['base_rate']:.1f}%)",
        "AUC": f"{test_summary['AUC']:.3f}",
        "PR-AUC": f"{test_summary['AP']:.3f}",
        "Accuracy": f"{test_summary['ACC@0.5']:.3f}",
        "Precision": f"{test_summary['Precision@0.5']:.3f}",
        "Recall": f"{test_summary['Recall@0.5']:.3f}",
        "F1-Score": f"{test_summary['F1@0.5']:.3f}",
        "Specificity": f"{test_summary['Specificity@0.5']:.3f}",
        "BalAcc": f"{test_summary['BalancedAcc@0.5']:.3f}",
        "MCC": f"{test_summary['MCC@0.5']:.3f}"
    }
])

# Display neatly in Jupyter / console
print("\n=== Model Evaluation Summary (Tabular Classifier) ===")
display(summary_df.style.set_table_styles(
    [{"selector": "th", "props": [("text-align", "center"), ("font-weight", "bold")]}]
).set_properties(**{"text-align": "center"}))



=== Model Evaluation Summary (Tabular Classifier) ===


Unnamed: 0,Split,Positives,AUC,PR-AUC,Accuracy,Precision,Recall,F1-Score,Specificity,BalAcc,MCC
0,Validation,26 / 81 (32.1%),0.957,0.935,0.914,0.952,0.769,0.851,0.982,0.876,0.8
1,Test,27 / 82 (32.9%),0.936,0.922,0.878,0.905,0.704,0.792,0.964,0.834,0.719


In [10]:
fi = (
    pd.Series(model.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .to_frame("importance")
)
display(fi)

Unnamed: 0,importance
Follicle No. (R),0.116519
Follicle No. (L),0.115012
Skin darkening (Y/N),0.07727
Weight gain(Y/N),0.06914
hair growth(Y/N),0.06648
Fast food (Y/N),0.045069
Cycle length(days),0.034867
Reg.Exercise(Y/N),0.033515
Cycle(R/I),0.03134
LH(mIU/mL),0.022689


# Export model to ONNX format
Here we'll export the trained XGBoost model to ONNX format for deployment and reuse.

In [16]:
# --- Normalize XGBoost feature names to f0, f1, ... for onnxmltools ---
booster = model.get_booster()

# Preserve ordering, just rename to f0..f{n-1}
n_feats = len(booster.feature_names or [])
booster.feature_names = [f"f{i}" for i in range(n_feats)]

# Some XGBoost versions also store feature_types; make them all numeric floats
try:
    booster.feature_types = ['float'] * n_feats
except Exception:
    pass  # not present in all versions

# Now do the conversion
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

input_shape = [None, model.n_features_in_]
initial_type = [('float_input', FloatTensorType(input_shape))]

onx = convert_xgboost(model, initial_types=initial_type, target_opset=13)


In [19]:
# --- Export XGBoost to ONNX with BOTH label + probability and test rigorously ---

from pathlib import Path
import numpy as np
import onnx, onnxruntime as rt
from onnxmltools.convert import convert_xgboost
from onnxmltools.convert.common.data_types import FloatTensorType

# 1) Prepare export
ONNX_DIR = Path("onnx_models"); ONNX_DIR.mkdir(parents=True, exist_ok=True)
onnx_path = ONNX_DIR / "pcos_tabular_model.onnx"

# 2) (Optional) Temporarily rename booster feature names to f0.. to keep onnxmltools happy
booster = model.get_booster()
orig_names = booster.feature_names[:] if booster.feature_names is not None else None
if orig_names is not None:
    booster.feature_names = [f"f{i}" for i in range(len(orig_names))]
    try:
        booster.feature_types = ['float'] * len(orig_names)  # not always present
    except Exception:
        pass

# 3) Convert with probabilities ON (disable ZipMap so we get a tensor instead of a dict)
initial_type = [('float_input', FloatTensorType([None, model.n_features_in_]))]
try:
    onx = convert_xgboost(
        model,
        initial_types=initial_type,
        target_opset=13,
        options={'zipmap': False}  # <- ask for tensor probs
    )
except TypeError:
    # Some versions don't accept 'options' here; fall back without it.
    onx = convert_xgboost(model, initial_types=initial_type, target_opset=13)

# Restore original booster names so sklearn predict_proba with pandas keeps working
if orig_names is not None:
    booster.feature_names = orig_names

# 4) Save + check model
with open(onnx_path, "wb") as f:
    f.write(onx.SerializeToString())

onnx.checker.check_model(onx)
print(f"ONNX export OK → {onnx_path}")

# 5) Inspect outputs so we know which index is label vs prob
sess = rt.InferenceSession(str(onnx_path), providers=['CPUExecutionProvider'])
print("\nONNX outputs:")
for i, o in enumerate(sess.get_outputs()):
    print(f"  [{i}] name={o.name}, shape={o.shape}, type={o.type}")

# 6) Helper to run ONNX and return (label, prob) consistently
def onnx_predict_label_prob(session, X_np: np.ndarray):
    outs = session.run(None, {session.get_inputs()[0].name: X_np.astype(np.float32)})
    # Heuristics:
    # - When options={'zipmap': False} worked: outs[0]=label, outs[1]=probs (2-class tensor)
    # - Without options: could be [label] only, or [label, probability-map/dict]
    if len(outs) == 2:
        label = outs[0].ravel()
        probs = outs[1]
        # If probs is (N,2), take positive class (index 1)
        if probs.ndim == 2 and probs.shape[1] == 2:
            p_pos = probs[:, 1]
        else:
            # Some converters emit a single-column prob of the positive class
            p_pos = probs.ravel()
        return label, p_pos
    elif len(outs) == 1:
        # Single output: it’s almost always label (0/1). Return label and None for prob.
        return outs[0].ravel(), None
    else:
        raise RuntimeError(f"Unexpected number of ONNX outputs: {len(outs)}")

# 7) Sanity checks (single sample + small batch)
# Use NumPy (not pandas) to avoid XGBoost name validation conflicts
X_np_test = X_test.to_numpy(dtype=np.float32)

# Single sample check
lbl_onx_1, prob_onx_1 = onnx_predict_label_prob(sess, X_np_test[:1])
proba_xgb_1 = model.predict_proba(X_np_test[:1])[:, 1]
pred_xgb_1 = (proba_xgb_1 >= 0.5).astype(np.int64)

print("\n[Single sample check]")
print(f"XGB proba: {proba_xgb_1[0]:.6f} | XGB label: {pred_xgb_1[0]}")
if prob_onx_1 is not None:
    print(f"ONNX proba: {prob_onx_1[0]:.6f}")
else:
    print("ONNX proba: <not provided by this export>")
print(f"ONNX label: {int(lbl_onx_1[0])}")

# Batch check (first 32 rows or fewer)
n = min(32, len(X_np_test))
lbl_onx_b, prob_onx_b = onnx_predict_label_prob(sess, X_np_test[:n])
proba_xgb_b = model.predict_proba(X_np_test[:n])[:, 1]
pred_xgb_b = (proba_xgb_b >= 0.5).astype(np.int64)

print(f"\n[Batch check: n={n}]")
# Compare labels
label_match = (pred_xgb_b == lbl_onx_b.astype(np.int64)).mean()
print(f"Label agreement (threshold 0.5): {label_match*100:.2f}%")

# Compare probabilities if available
if prob_onx_b is not None:
    abs_diffs = np.abs(prob_onx_b - proba_xgb_b)
    print(f"Prob mean abs diff: {abs_diffs.mean():.6e}")
    print(f"Prob max  abs diff: {abs_diffs.max():.6e}")
else:
    print("ONNX probabilities not present; if you want them, re-export with options={'zipmap': False}.")


ONNX export OK → onnx_models\pcos_tabular_model.onnx

ONNX outputs:
  [0] name=label, shape=[None], type=tensor(int64)
  [1] name=probabilities, shape=[None, 2], type=tensor(float)

[Single sample check]
XGB proba: 0.013741 | XGB label: 0
ONNX proba: 0.013741
ONNX label: 0

[Batch check: n=32]
Label agreement (threshold 0.5): 100.00%
Prob mean abs diff: 5.523452e-08
Prob max  abs diff: 2.384186e-07
