In [1]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [3]:
import os
from pathlib import Path

# Location of the clinical/biomarker workbook. The original machine-specific
# path is kept as the default; set the KOA_DATA_PATH environment variable to
# load the file from anywhere else without editing this cell.
DATA_PATH = Path(
    os.environ.get(
        "KOA_DATA_PATH",
        r"C:\Users\Anu\Desktop\RESEARCH\data2\clinical_biomarker_NF.xlsx",
    )
)

# Fail early with an actionable message instead of a bare reader traceback.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. Set the KOA_DATA_PATH environment "
        "variable to the location of clinical_biomarker_NF.xlsx and re-run this cell."
    )

df = pd.read_excel(DATA_PATH)

print(df.shape)
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Anu\\Desktop\\RESEARCH\\data2\\clinical_biomarker_NF.xlsx'

In [None]:
# Name of the label column the model is trained to predict.
TARGET = "koa_grade"

# Columns removed from the feature matrix: the label itself and "koa_severity".
excluded_cols = [TARGET, "koa_severity"]

y = df[TARGET]
X = df.drop(columns=excluded_cols)

print(X.shape, y.shape)


In [None]:
def _columns_with_dtype(frame, dtypes):
    """Return the names of the columns of ``frame`` whose dtype is in ``dtypes``."""
    return frame.select_dtypes(include=dtypes).columns.tolist()


# Partition the feature columns by dtype: 64-bit numerics vs. object columns.
num_cols = _columns_with_dtype(X, ["int64", "float64"])
cat_cols = _columns_with_dtype(X, ["object"])

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)


In [None]:
import warnings

# Ordinal encoding of stiffness: every label is replaced by its position in the
# ordered list below (0 = "never" ... 3 = "always").
ordinal_maps = {
    "stiffness": ["never", "occasionally", "frequently", "always"]
}

for col, order in ordinal_maps.items():
    if col not in X.columns:
        continue

    # If the cell is re-run the column already holds integer codes; encoding
    # those again would turn every value into -1, so leave it untouched.
    if pd.api.types.is_numeric_dtype(X[col]):
        continue

    # Normalise case and surrounding whitespace so variants such as "Never " are
    # still recognised. All expected labels are lowercase and trimmed, so a value
    # that matched before normalisation still matches afterwards.
    labels = X[col].astype("string").str.strip().str.lower()

    # Labels outside the expected list are encoded as -1 by pandas; surface them
    # instead of letting them pass silently.
    unexpected = sorted(set(labels.dropna().unique()) - set(order))
    if unexpected:
        warnings.warn(
            f"Column {col!r} contains labels outside {order}: {unexpected}. "
            "They are encoded as -1."
        )

    # Missing values are also encoded as -1 (pandas' code for "no category").
    X[col] = pd.Categorical(labels, categories=order, ordered=True).codes


In [None]:
# One-Hot Encoding for Nominal Categorical Variables
# cat_cols was computed before the ordinal-encoding step, so it can still list
# columns (e.g. "stiffness") that now hold integer rank codes. Passing those to
# get_dummies would one-hot encode the codes and throw away their ordering, so
# only columns that are still present and still non-numeric are expanded.
nominal_cols = [
    c for c in cat_cols
    if c in X.columns and not pd.api.types.is_numeric_dtype(X[c])
]

if nominal_cols:
    X = pd.get_dummies(X, columns=nominal_cols, drop_first=True)

In [None]:
# Feature Scaling (Standardization)
# The scaler is fitted on the training rows only, so validation/test statistics
# do not leak into the transformation (or into the scaler that is saved later).
# The training rows are identified by repeating the two stratified splits with
# the same test_size / stratify / random_state values used in the split cell;
# with identical labels and seed train_test_split selects the same rows.
# NOTE: keep these values in sync with the split cell if either one changes.
_row_pos = np.arange(len(X))

_trainval_pos, _ = train_test_split(
    _row_pos,
    test_size=0.20,
    stratify=y,
    random_state=42
)
_train_pos, _ = train_test_split(
    _trainval_pos,
    test_size=0.20,
    stratify=y.iloc[_trainval_pos],
    random_state=42
)

scaler = StandardScaler()
scaler.fit(X.iloc[_train_pos][num_cols])

# Apply the training-set mean/std to every row of the numeric columns.
X[num_cols] = scaler.transform(X[num_cols])

In [None]:
import json
import os
import joblib

# Output locations for the artifacts written by this cell.
FEATURE_NAMES_FILE = "models/xgb_feature_names.json"
SCALER_FILE = "models/xgb_scaler.pkl"

# Record the final column order of the feature matrix so the same layout can be
# rebuilt later (originally annotated as 104 columns - TODO confirm against the
# count printed below).
feature_names = [name for name in X.columns]
print("Feature count:", len(feature_names))

os.makedirs("models", exist_ok=True)

with open(FEATURE_NAMES_FILE, "w") as fh:
    json.dump(feature_names, fh)

# Persist the fitted scaler next to the feature names.
joblib.dump(scaler, SCALER_FILE)

print("Saved: models/xgb_feature_names.json")
print("Saved: models/xgb_scaler.pkl")


In [None]:
# Train / validation / test split, stratified on the label at both stages.

from sklearn.model_selection import train_test_split

# Both stages hold out the same fraction and use the same seed.
split_kwargs = {"test_size": 0.20, "random_state": 42}

# Stage 1: hold out the test set from the full data.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, stratify=y, **split_kwargs
)

# Stage 2: carve the validation set out of the remaining rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, stratify=y_trainval, **split_kwargs
)


In [None]:
# Convert Pandas DataFrames to NumPy Arrays
# The feature frame can mix float, boolean (dummy) and small-integer columns;
# without an explicit dtype pandas may fall back to an object array for such
# mixes. Requesting float64 yields a plain numeric matrix and fails loudly here
# if a non-numeric column is still present.
X_train_np = X_train.to_numpy(dtype=np.float64)
X_val_np   = X_val.to_numpy(dtype=np.float64)
X_test_np  = X_test.to_numpy(dtype=np.float64)

# Labels keep their original dtype.
y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()
y_test_np  = y_test.to_numpy()


In [None]:
# Build and fit the XGBoost classifier.

from xgboost import XGBClassifier

# Hyperparameters are collected in one mapping so they can be inspected or
# logged as a unit before the estimator is constructed.
xgb_params = {
    # ensemble size / tree shape
    "n_estimators": 200,
    "max_depth": 3,
    "learning_rate": 0.05,
    # row / column sampling
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # split constraints and regularization (L1 = reg_alpha, L2 = reg_lambda)
    "min_child_weight": 7,
    "gamma": 0.2,
    "reg_alpha": 5.0,
    "reg_lambda": 3.0,
    # multi-class objective and evaluation metric
    "objective": "multi:softmax",
    "num_class": 5,
    "eval_metric": "mlogloss",
    # reproducibility / parallelism
    "random_state": 42,
    "n_jobs": -1,
}

xgb = XGBClassifier(**xgb_params)

xgb.fit(X_train_np, y_train_np)


In [None]:
# Accuracy on the same rows the model was fitted on.
xgb_train_pred = xgb.predict(X_train_np)
xgb_train_acc = accuracy_score(y_true=y_train_np, y_pred=xgb_train_pred)

train_acc_pct = 100 * xgb_train_acc
print("XGBoost - Training Accuracy: {:.2f}%".format(train_acc_pct))


In [None]:
import numpy as np

def within_one(y_true, y_pred):
    """Return the fraction of predictions at most one grade away from the truth.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        True and predicted labels, compared position by position. Lists, tuples,
        NumPy arrays and pandas Series are accepted.

    Returns
    -------
    numpy.float64
        Mean of ``|y_true - y_pred| <= 1`` over all positions.
    """
    # Coerce to float arrays so plain Python lists can be subtracted and unsigned
    # integer inputs cannot wrap around on subtraction (0 - 1 would become 255
    # for uint8).
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_arr - pred_arr) <= 1)

from sklearn.metrics import accuracy_score, classification_report


# Predict the held-out test split, then report overall accuracy and the
# per-class metrics.
y_pred_xgb = xgb.predict(X_test_np)

test_accuracy = accuracy_score(y_test_np, y_pred_xgb)
print("XGBoost - Test Accuracy:", test_accuracy)

# zero_division=0 reports 0 for metrics whose denominator is zero.
report_text = classification_report(y_test_np, y_pred_xgb, zero_division=0)
print("\nClassification Report:")
print(report_text)



In [None]:
# Accuracy on the validation split.
xgb_val_pred = xgb.predict(X_val_np)
xgb_val_acc = accuracy_score(y_true=y_val_np, y_pred=xgb_val_pred)

val_acc_pct = 100 * xgb_val_acc
print("XGBoost - Validation Accuracy: {:.2f}%".format(val_acc_pct))
