In [27]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

df = pd.read_excel(
    r"C:\Users\Anu\Desktop\RESEARCH\data2\clinical_biomarker_NF.xlsx"
)

print(df.shape)
df.head()


(1000, 34)


Unnamed: 0,age,gender,height,weight,occupation,physical_activity_level,living_environment,do_you_currently_experience_knee_pain,pain_score,stiffness,...,does_the_patient_have_any_other_health_conditions_or_risk_factors_that_may_contribute_to_knee_osteoarthritis,what_are_the_suggested_or_ongoing_treatments_for_the_patients_current_condition,koa,does_the_patient_has_obesity,does_the_patient_has_diabetes,does_the_patient_has_hypertension,does_the_patient_has_vitamin_d_deficiency,does_the_patient_has_rheumatoid_arthritis,bmi,koa_grade
0,70.0,Female,1.55,60.0,No,Moderate,Urban,Yes,3.0,Frequently,...,"Obesity or overweight, Family history of osteo...","0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,Yes,No,No,No,No,24.973985,1
1,48.0,Female,1.57,70.0,No,Moderate,Rural,Yes,4.0,Frequently,...,Family history of osteoarthritis,"0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,No,No,No,No,No,28.398718,2
2,54.0,Female,1.58,76.0,No,Moderate,Urban,Yes,3.0,Never,...,"Vitamin D deficiency, Sedentary or low-activit...","0 = Lifestyle modification, 1 = Physiotherapy,...",Yes,No,No,No,Yes,No,30.443839,3
3,52.0,Female,1.55,60.0,No,Low,Urban,Yes,2.0,Occasionally,...,None of the above,"2 = Medication,",Yes,No,No,No,No,No,24.973985,1
4,74.0,Female,1.55,63.0,No,Moderate,Urban,Yes,2.0,Occasionally,...,Vitamin D deficiency,2 = Medication,Yes,No,No,No,Yes,No,26.222685,1


In [28]:
TARGET = "koa_grade"

X = df.drop(columns=[TARGET, "koa_severity"])
y = df[TARGET]

print(X.shape, y.shape)


(1000, 32) (1000,)


In [29]:
num_cols = X.select_dtypes(include=["int64","float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numeric columns:", num_cols)
print("Categorical columns:", cat_cols)


Numeric columns: ['age', 'height', 'weight', 'pain_score', 'fbs', 'wbc', 'platelets', 'cs', 'cholesterol', 'crp', 'esr', 'rf', 'fbc', 'bmi']
Categorical columns: ['gender', 'occupation', 'physical_activity_level', 'living_environment', 'do_you_currently_experience_knee_pain', 'stiffness', 'have_you_had_any_previous_knee_injuries_(acl_tear,_meniscus_tear,_fracture,_etc.)', 'do_you_experience_swelling_in_your_knees', 'do_you_find_difficulty_in_performing_these_activities_(check_all_that_apply)', 'do_you_have_a_family_history_of_osteoarthritis', 'does_the_patient_have_any_other_health_conditions_or_risk_factors_that_may_contribute_to_knee_osteoarthritis', 'what_are_the_suggested_or_ongoing_treatments_for_the_patients_current_condition', 'koa', 'does_the_patient_has_obesity', 'does_the_patient_has_diabetes', 'does_the_patient_has_hypertension', 'does_the_patient_has_vitamin_d_deficiency', 'does_the_patient_has_rheumatoid_arthritis']


In [30]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_maps = {
    "stiffness": ["never", "occasionally", "frequently", "always"]
}

for col, order in ordinal_maps.items():
    if col in X.columns:
        X[col] = pd.Categorical(X[col], categories=order, ordered=True).codes


In [31]:
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)


In [32]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])


In [33]:
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42
)


X_train, X_val, y_train, y_val = train_test_split(
    X_trainval,
    y_trainval,
    test_size=0.20,
    stratify=y_trainval,
    random_state=42
)


Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_split=3,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)


In [36]:
rf_train_pred = rf.predict(X_train)
rf_train_acc = accuracy_score(y_train, rf_train_pred)

print("Random Forest - Training Accuracy: {:.2f}%".format(rf_train_acc * 100))


Random Forest - Training Accuracy: 95.94%


In [37]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Classes in y_test:", sorted(y_test.unique()))
print("All possible classes:", sorted(y.unique()))


y_pred_rf = rf.predict(X_test)

print("Random Forest - Test Accuracy:",
      accuracy_score(y_test, y_pred_rf))

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))


Classes in y_test: [0, 1, 2, 3, 4]
All possible classes: [0, 1, 2, 3, 4]
Random Forest - Test Accuracy: 0.955

Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99        36
           1       0.91      1.00      0.95        30
           2       0.94      0.94      0.94        52
           3       0.97      0.94      0.95        65
           4       1.00      0.88      0.94        17

    accuracy                           0.95       200
   macro avg       0.96      0.95      0.95       200
weighted avg       0.96      0.95      0.95       200



In [38]:
rf_val_pred = rf.predict(X_val)
rf_val_acc = accuracy_score(y_val, rf_val_pred)

print("Random Forest - Validation Accuracy: {:.2f}%".format(rf_val_acc * 100))


Random Forest - Validation Accuracy: 95.00%
