In [1]:
# Use the %pip magic rather than a `!pip` shell escape: %pip installs into the
# environment of the running kernel, while `!pip` runs whichever pip is first
# on PATH, which may belong to a different interpreter.
%pip install pandas scikit-learn joblib




[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import random

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [13]:
# Vocabularies the synthetic-record generator draws from.

# Each entry is one comma-separated symptom description; random_entry() maps
# them to a diagnosis category by keyword.
symptoms = [
    "chest pain, fatigue",
    "shortness of breath, dizziness",
    "forgetfulness, disorientation",
    "memory loss, headache",
    "back pain, stiffness",
    "joint pain, inflammation",
]

# "None" is the literal string used for patients without a family history.
family_history = [
    "Heart Disease",
    "Alzheimer’s",
    "Arthritis",
    "None",
]

genders = ["M", "F"]

# The three insurance tiers that random_entry() can assign.
insurance_types = [
    "Basic",
    "Premium",
    "Comprehensive",
]


def random_entry(rng=None):
    """Build one synthetic patient record.

    Args:
        rng: Optional source of randomness exposing ``choice`` and ``randint``
            (for example a ``random.Random`` instance). When omitted, the
            module-level ``random`` generator is used, so ``random.seed()``
            still controls the default behaviour.

    Returns:
        A list ``[symptom, age, gender, family_history, billing,
        diagnosis_category, insurance_type]``.
    """
    if rng is None:
        rng = random

    # Draw order is kept fixed (symptom, age, gender, history, billing) so a
    # given seed always yields the same record.
    symptom = rng.choice(symptoms)
    age = rng.randint(30, 80)
    gender = rng.choice(genders)
    fh = rng.choice(family_history)
    billing = rng.randint(1000, 3000)

    # Determine diagnosis based on keywords in the symptom text
    if "chest" in symptom or "breath" in symptom:
        category = "Cardiology"
    elif "memory" in symptom or "forgetfulness" in symptom:
        category = "Neurology"
    else:
        category = "Orthopedic"

    # Determine insurance type based on age and billing
    if age > 60 or billing > 2500:
        insurance = "Comprehensive"
    elif billing <= 2000:
        # age <= 60 is already guaranteed here by the branch above
        insurance = "Basic"
    else:
        # age <= 60 and 2000 < billing <= 2500
        insurance = "Premium"

    return [symptom, age, gender, fh, billing, category, insurance]

# Generate 200 synthetic patient records.
# Seed the module-level generator first so that a fresh-kernel "Run All"
# rebuilds the same dataset (and therefore the same downstream metrics);
# re-running this cell alone is also idempotent.
random.seed(42)
data = [random_entry() for _ in range(200)]
df = pd.DataFrame(
    data,
    columns=[
        "Symptoms",
        "Age",
        "Gender",
        "Family_History",
        "Billing",
        "Diagnosis_Category",
        "Insurance_Type",
    ],
)
df.head()


Unnamed: 0,Symptoms,Age,Gender,Family_History,Billing,Diagnosis_Category,Insurance_Type
0,"shortness of breath, dizziness",69,F,,1701,Cardiology,Comprehensive
1,"memory loss, headache",58,F,Arthritis,1676,Neurology,Basic
2,"forgetfulness, disorientation",79,M,,1875,Neurology,Comprehensive
3,"shortness of breath, dizziness",79,F,Heart Disease,1430,Cardiology,Comprehensive
4,"forgetfulness, disorientation",40,F,Heart Disease,1123,Neurology,Basic


In [14]:
# Integer-encode each categorical column with its own LabelEncoder; the fitted
# encoders are kept under individual names because they are saved later on.
le_gender, le_fh, le_symptoms, le_diagnosis, le_insurance = (
    LabelEncoder() for _ in range(5)
)

# (raw column, encoded column, encoder) — processed in this order so the new
# columns are appended to df in the same sequence as before.
encoding_plan = [
    ("Gender", "Gender_enc", le_gender),
    ("Family_History", "Family_History_enc", le_fh),
    ("Symptoms", "Symptoms_enc", le_symptoms),
    ("Diagnosis_Category", "Diagnosis_Category_enc", le_diagnosis),
    ("Insurance_Type", "Insurance_Type_enc", le_insurance),
]
for raw_column, encoded_column, encoder in encoding_plan:
    df[encoded_column] = encoder.fit_transform(df[raw_column])

# Model inputs: encoded categoricals plus the raw numeric columns
X = df[["Symptoms_enc", "Age", "Gender_enc", "Family_History_enc", "Billing"]]

# One target per classifier
y_diagnosis = df["Diagnosis_Category_enc"]
y_insurance = df["Insurance_Type_enc"]

In [15]:
# Split the features and both targets in ONE call. train_test_split applies the
# same row partition to every array it is given, so X, y_diagnosis and
# y_insurance are guaranteed to stay aligned. Two separate calls would stay
# aligned only while their random_state values happen to match.
(
    X_train, X_test,
    y_train_diagnosis, y_test_diagnosis,
    y_train_insurance, y_test_insurance,
) = train_test_split(X, y_diagnosis, y_insurance, test_size=0.2, random_state=42)

In [6]:
# NOTE(review): this cell carries an older execution count than its neighbours
# and referenced `y_train`, which is not defined anywhere in the visible
# notebook, so it raised NameError on a fresh-kernel run. It now fits on the
# diagnosis target — presumably what the single-model version predicted;
# confirm, or delete this cell if `clf` is unused, since `clf_diagnosis`
# trains the same model.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train_diagnosis)


In [16]:
# Diagnosis classifier: a 100-tree random forest with a fixed seed.
# fit() returns the estimator itself, so construction and training can be chained.
clf_diagnosis = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train_diagnosis)
clf_diagnosis

In [17]:
# Insurance-type classifier: same forest configuration, fitted on the
# insurance target using the same training rows.
clf_insurance = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train_insurance)
clf_insurance

In [18]:
# Score the held-out rows with each fitted forest (diagnosis first, then insurance)
y_pred_diagnosis, y_pred_insurance = (
    fitted_model.predict(X_test)
    for fitted_model in (clf_diagnosis, clf_insurance)
)

In [19]:
# `labels` is passed explicitly alongside `target_names`: without it,
# classification_report infers the label set from y_true/y_pred and raises a
# ValueError when a class is absent from the (small, random) test split,
# because the number of names no longer matches the number of labels.
print("Diagnosis Model Evaluation:")
print(classification_report(
    y_test_diagnosis,
    y_pred_diagnosis,
    labels=le_diagnosis.transform(le_diagnosis.classes_),  # encoded id of every known class
    target_names=le_diagnosis.classes_,
))

print("Insurance Model Evaluation:")
print(classification_report(
    y_test_insurance,
    y_pred_insurance,
    labels=le_insurance.transform(le_insurance.classes_),  # encoded id of every known class
    target_names=le_insurance.classes_,
))

Diagnosis Model Evaluation:
              precision    recall  f1-score   support

  Cardiology       1.00      0.91      0.95        11
   Neurology       0.83      0.94      0.88        16
  Orthopedic       0.83      0.77      0.80        13

    accuracy                           0.88        40
   macro avg       0.89      0.87      0.88        40
weighted avg       0.88      0.88      0.87        40

Insurance Model Evaluation:
               precision    recall  f1-score   support

        Basic       1.00      1.00      1.00        12
Comprehensive       1.00      1.00      1.00        20
      Premium       1.00      1.00      1.00         8

     accuracy                           1.00        40
    macro avg       1.00      1.00      1.00        40
 weighted avg       1.00      1.00      1.00        40



In [21]:
# Create the 'models' directory (relative to the working directory) if it
# doesn't exist. exist_ok=True makes this safe to re-run. `os` comes from the
# notebook's import cell rather than being imported mid-notebook.
os.makedirs('models', exist_ok=True)


In [22]:
# Persist both classifiers together with every fitted encoder, so the exact
# training-time encodings can be reloaded alongside the models.
# `os` comes from the notebook's import cell.
MODEL_DIR = 'models'  # relative to the current working directory

# Create directory (no-op when it already exists)
os.makedirs(MODEL_DIR, exist_ok=True)

# File name -> object to serialise; a single table replaces seven copy-pasted
# dump calls and keeps the target directory in one place.
artifacts = {
    'diagnosis_model.pkl': clf_diagnosis,
    'insurance_model.pkl': clf_insurance,
    'le_gender.pkl': le_gender,
    'le_fh.pkl': le_fh,
    'le_symptoms.pkl': le_symptoms,
    'le_diagnosis.pkl': le_diagnosis,
    'le_insurance.pkl': le_insurance,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(MODEL_DIR, filename))

print("✅ Models and encoders saved in /models directory.")


✅ Models and encoders saved in /models directory.
