In [7]:
pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [9]:
df = pd.read_excel(
    r"C:\Users\msi\Documents\GitHub\KneeCare\Trained Models\Anuji\ipynb\clinical_biomarker_NF.xlsx"
)

print(df.shape)
df.head()


(1000, 34)


Unnamed: 0,age,gender,height,weight,occupation,physical_activity_level,living_environment,do_you_currently_experience_knee_pain,pain_score,stiffness,...,does_the_patient_have_any_other_health_conditions_or_risk_factors_that_may_contribute_to_knee_osteoarthritis,what_are_the_suggested_or_ongoing_treatments_for_the_patients_current_condition,koa,does_the_patient_has_obesity,does_the_patient_has_diabetes,does_the_patient_has_hypertension,does_the_patient_has_vitamin_d_deficiency,does_the_patient_has_rheumatoid_arthritis,bmi,koa_grade
0,70.0,Female,1.55,60.0,No,Moderate,Urban,Yes,3.0,Frequently,...,"Obesity or overweight, Family history of osteo...","0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,Yes,No,No,No,No,24.973985,1
1,48.0,Female,1.57,70.0,No,Moderate,Rural,Yes,4.0,Frequently,...,Family history of osteoarthritis,"0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,No,No,No,No,No,28.398718,2
2,54.0,Female,1.58,76.0,No,Moderate,Urban,Yes,3.0,Never,...,"Vitamin D deficiency, Sedentary or low-activit...","0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,No,No,No,Yes,No,30.443839,3
3,52.0,Female,1.55,60.0,No,Low,Urban,Yes,2.0,Occasionally,...,None of the above,"2 = Medication,",Yes,No,No,No,No,No,24.973985,1
4,74.0,Female,1.55,63.0,No,Moderate,Urban,Yes,2.0,Occasionally,...,Vitamin D deficiency,2 = Medication,Yes,No,No,No,Yes,No,26.222685,1


In [10]:
TARGET = "koa_grade"

X = df.drop(columns=[TARGET, "koa_severity"])
y = df[TARGET]

print(X.shape, y.shape)


(1000, 32) (1000,)


In [11]:
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)


Numeric columns: ['age', 'height', 'weight', 'pain_score', 'fbs', 'wbc', 'platelets', 'cs', 'cholesterol', 'crp', 'esr', 'rf', 'fbc', 'bmi']
Categorical columns: ['gender', 'occupation', 'physical_activity_level', 'living_environment', 'do_you_currently_experience_knee_pain', 'stiffness', 'have_you_had_any_previous_knee_injuries_(acl_tear,_meniscus_tear,_fracture,_etc.)', 'do_you_experience_swelling_in_your_knees', 'do_you_find_difficulty_in_performing_these_activities_(check_all_that_apply)', 'do_you_have_a_family_history_of_osteoarthritis', 'does_the_patient_have_any_other_health_conditions_or_risk_factors_that_may_contribute_to_knee_osteoarthritis', 'what_are_the_suggested_or_ongoing_treatments_for_the_patients_current_condition', 'koa', 'does_the_patient_has_obesity', 'does_the_patient_has_diabetes', 'does_the_patient_has_hypertension', 'does_the_patient_has_vitamin_d_deficiency', 'does_the_patient_has_rheumatoid_arthritis']


In [12]:
# Ordinal Encoding of stiffness
ordinal_maps = {
    "stiffness": ["never", "occasionally", "frequently", "always"]
}

for col, order in ordinal_maps.items():
    if col in X.columns:
        X[col] = pd.Categorical(X[col], categories=order, ordered=True).codes


In [13]:
# One-Hot Encoding for Nominal Categorical Variables
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

In [14]:
# Feature Scaling (Standardization)
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])

In [15]:
import json, os, joblib

# ✅ SAVE feature names (this becomes your 104 columns)
feature_names = list(X.columns)
print("Feature count:", len(feature_names))

os.makedirs("models", exist_ok=True)

with open("models/xgb_feature_names.json", "w") as f:
    json.dump(feature_names, f)

joblib.dump(scaler, "models/xgb_scaler.pkl")

print("Saved: models/xgb_feature_names.json")
print("Saved: models/xgb_scaler.pkl")


Feature count: 104
Saved: models/xgb_feature_names.json
Saved: models/xgb_scaler.pkl


In [16]:
# Train–Test-Validation Split 

from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42
)

X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.20,
    stratify=y_trainval,
    random_state=42
)


In [17]:
# Convert Pandas DataFrames to NumPy Arrays

X_train_np = X_train.to_numpy()
X_val_np   = X_val.to_numpy()
X_test_np  = X_test.to_numpy()

y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()
y_test_np  = y_test.to_numpy()


In [18]:
# Initialize the XGBoost Classifier

from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=200,
    max_depth=3,              # reduce depth
    learning_rate=0.05,
    subsample=0.8,             # stronger randomness
    colsample_bytree=0.8,
    min_child_weight=7,        # prevent small leaf splits
    gamma=0.2,                 # penalize complex splits
    reg_alpha=5.0,             # L1 regularization
    reg_lambda=3.0,            # L2 regularization
    objective="multi:softmax",
    num_class=5,
    eval_metric="mlogloss",
    random_state=42,
    n_jobs=-1
)


xgb.fit(X_train_np, y_train_np)


In [19]:
xgb_train_pred = xgb.predict(X_train_np)
xgb_train_acc = accuracy_score(y_train_np, xgb_train_pred)

print("XGBoost - Training Accuracy: {:.2f}%".format(xgb_train_acc * 100))


XGBoost - Training Accuracy: 95.00%


In [20]:
import numpy as np

def within_one(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred) <= 1)

from sklearn.metrics import accuracy_score, classification_report


y_pred_xgb = xgb.predict(X_test_np)

print("XGBoost - Test Accuracy:",
      accuracy_score(y_test_np, y_pred_xgb))

print("\nClassification Report:")
print(classification_report(y_test_np, y_pred_xgb, zero_division=0))



XGBoost - Test Accuracy: 0.965

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        36
           1       0.97      0.93      0.95        30
           2       0.91      1.00      0.95        52
           3       0.98      0.95      0.97        65
           4       1.00      0.88      0.94        17

    accuracy                           0.96       200
   macro avg       0.97      0.95      0.96       200
weighted avg       0.97      0.96      0.96       200



In [21]:
xgb_val_pred = xgb.predict(X_val_np)
xgb_val_acc = accuracy_score(y_val_np, xgb_val_pred)

print("XGBoost - Validation Accuracy: {:.2f}%".format(xgb_val_acc * 100))


XGBoost - Validation Accuracy: 94.38%
