In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib

# Seed NumPy's global random generator so that the np.random.* draws made
# in generate_synthetic_data() yield the same synthetic dataset on every
# fresh run. The scikit-learn calls further down do not rely on this seed;
# they each pass their own random_state=42.
np.random.seed(42)

# Create synthetic data
def generate_synthetic_data(n=1000):
    """Build a DataFrame of ``n`` randomly generated patient records.

    Every record is drawn from NumPy's global random state (``np.random``),
    so the result is reproducible only if the caller seeds that state first.

    Columns, in order:
        age          -- integer from ``np.random.randint(30, 85)``
        bmi          -- ``np.random.uniform(18, 40)`` rounded to one decimal
        oa_severity  -- integer from ``np.random.randint(1, 5)`` (1-4)
        activity     -- integer from ``np.random.randint(0, 3)``
                        (0=low, 1=medium, 2=high)
        smoking      -- 0 (no) or 1 (yes), via ``np.random.choice``
        pain_score   -- ``np.random.uniform(1, 10)`` rounded to one decimal
        comorbidity  -- rule-based label: 'multiple', 'cardiovascular',
                        'diabetes' or 'none'

    Args:
        n: Number of records (rows) to generate.

    Returns:
        A pandas DataFrame with one row per generated record.
    """
    column_names = [
        'age', 'bmi', 'oa_severity', 'activity', 'smoking', 'pain_score',
        'comorbidity',
    ]

    def assign_comorbidity(age, bmi, smoking, pain_score):
        # Rules are checked from most to least specific; the first match wins.
        smoker = smoking == 1
        if smoker and age > 65 and bmi > 32 and pain_score > 7:
            return 'multiple'
        if smoker and age > 60:
            return 'cardiovascular'
        if bmi > 30 and pain_score > 5:
            return 'diabetes'
        return 'none'

    def draw_record():
        # The order of these draws is significant: all of them consume the
        # same global random stream, so reordering would change the data
        # produced for a given seed.
        age = np.random.randint(30, 85)
        bmi = np.round(np.random.uniform(18, 40), 1)
        oa_severity = np.random.randint(1, 5)
        activity = np.random.randint(0, 3)
        smoking = np.random.choice([0, 1])
        pain_score = np.round(np.random.uniform(1, 10), 1)
        label = assign_comorbidity(age, bmi, smoking, pain_score)
        return [age, bmi, oa_severity, activity, smoking, pain_score, label]

    records = [draw_record() for _ in range(n)]
    return pd.DataFrame(records, columns=column_names)

# Build the 1000-row synthetic dataset and print how many rows carry each
# comorbidity label.
df = generate_synthetic_data(1000)
print(df['comorbidity'].value_counts())

# Convert the string labels to a pandas categorical column, then store the
# integer category codes alongside it; the codes are the training target.
df['comorbidity'] = df['comorbidity'].astype('category')
df['comorbidity_code'] = df['comorbidity'].cat.codes

# Separate the six predictor columns from the encoded target and hold out
# 20% of the rows. The held-out X_test / y_test are not used again in this
# section.
feature_columns = [
    'age', 'bmi', 'oa_severity', 'activity', 'smoking', 'pain_score',
]
X = df[feature_columns]
y = df['comorbidity_code']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training portion (fit() returns the
# fitted estimator itself, so the call can be chained).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)

# Persist the fitted model to disk.
joblib.dump(model, 'osteoarthritis_comorbidity_model.pkl')

# Persist the {integer code: label string} lookup so that predicted codes can
# be translated back into comorbidity names after the model is reloaded.
label_mapping = {
    code: label
    for code, label in enumerate(df['comorbidity'].cat.categories)
}
joblib.dump(label_mapping, 'label_mapping.pkl')

print("✅ Model and label mapping saved.")


comorbidity
none              583
cardiovascular    205
diabetes          188
multiple           24
Name: count, dtype: int64
✅ Model and label mapping saved.
