In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

np.random.seed(42)

# BMI classification function
def classify_bmi(bmi):
    if bmi < 18.5:
        return "underweight"
    elif 18.5 <= bmi <= 24.99:
        return "normal"
    elif 25 <= bmi <= 29.99:
        return "overweight"
    else:
        return "obese"

def classify_bmi_asian(bmi):
    if bmi < 17.5:
        return "underweight"
    elif 17.5 <= bmi <= 22.99:
        return "normal"
    elif 23 <= bmi <= 27.99:
        return "overweight"
    else:
        return "obese"

# Enhanced data generation
def generate_synthetic_data(n=50000):
    data = []
    for _ in range(n):
        age = np.random.randint(30, 85)
        bmi = np.round(np.random.uniform(15, 45), 1)
        oa_severity = np.random.randint(1, 5)
        activity = np.random.randint(0, 3)
        smoking = np.random.choice([0, 1])
        pain_score = np.round(np.random.uniform(1, 10), 1)

        bmi_class_who = classify_bmi(bmi)
        bmi_class_asian = classify_bmi_asian(bmi)

        # Adjusted logic for more diversity
        if age > 65 and bmi_class_asian == "obese" and smoking == 1 and pain_score > 7:
            comorbidity = 'multiple'
        elif age > 60 and smoking == 1:
            comorbidity = 'cardiovascular'
        elif bmi_class_asian in ["overweight", "obese"] and pain_score > 5:
            comorbidity = 'diabetes'
        else:
            comorbidity = 'none'

        # Doctor flags
        clinical_suggestion = []
        if bmi_class_asian in ["overweight", "obese"] and oa_severity >= 3:
            clinical_suggestion.append("Recommend weight management for OA relief")
        if pain_score > 7:
            clinical_suggestion.append("High pain, consider advanced imaging")
        if smoking == 1:
            clinical_suggestion.append("Advise smoking cessation")

        data.append([
            age, bmi, oa_severity, activity, smoking, pain_score,
            bmi_class_who, bmi_class_asian, comorbidity, "; ".join(clinical_suggestion)
        ])

    df = pd.DataFrame(data, columns=[
        'age', 'bmi', 'oa_severity', 'activity', 'smoking', 'pain_score',
        'bmi_class_who', 'bmi_class_asian', 'comorbidity', 'clinical_note'
    ])
    return df

# Generate the full dataset
df = generate_synthetic_data(50000)

# Preview and encode
df['comorbidity'] = df['comorbidity'].astype('category')
df['comorbidity_code'] = df['comorbidity'].cat.codes

# Save full dataset for clinical use
df.to_csv('synthetic_osteoarthritis_data.csv', index=False)

# Prepare for ML
X = df[['age', 'bmi', 'oa_severity', 'activity', 'smoking', 'pain_score']]
y = df['comorbidity_code']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train and save model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

joblib.dump(model, 'osteoarthritis_comorbidity_model.pkl')
joblib.dump(dict(enumerate(df['comorbidity'].cat.categories)), 'label_mapping.pkl')

print("✅ Model trained on 50,000 records.")
print("📁 Saved: model + label mapping + full dataset.")

✅ Model trained on 50,000 records.
📁 Saved: model + label mapping + full dataset.


In [5]:
# Prepare for ML
X = df[['age', 'bmi', 'oa_severity', 'activity', 'smoking', 'pain_score']]
y = df['comorbidity_code']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Train and save model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy Score: {accuracy:.2f}")

Accuracy Score: 1.00
