In [1]:
import pandas as pd

# Load symptom-based dataset
symptom_data = pd.read_excel(r"D:\Z\medbot_2\data\data.xlsx")

# Load heart disease dataset
heart_data = pd.read_excel(r"D:\Z\medbot_2\data\Heart_disease_statlog.xlsx")

# Preview
print("Symptom Dataset Sample:")
print(symptom_data.head())

print("\nHeart Disease Dataset Sample:")
print(heart_data.head())

Symptom Dataset Sample:
   fever  cough  fatigue  headache  sore_throat  chest_pain  \
0      1      1        1         0            0           0   
1      1      0        1         1            1           0   
2      0      0        0         0            0           1   
3      1      1        1         1            0           1   
4      0      0        0         1            0           0   

   shortness_of_breath      disease  
0                    0  Common Cold  
1                    0          Flu  
2                    1    Pneumonia  
3                    1     COVID-19  
4                    0     Migraine  

Heart Disease Dataset Sample:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   70    1   3       130   322    0        2      109      0      2.4      1   
1   67    0   2       115   564    0        2      160      0      1.6      1   
2   57    1   1       124   261    0        0      141      0      0.3      0   
3   64    1   

In [2]:
# Symptom dataset
X_symptom = symptom_data.drop(columns=["disease"])
y_symptom = symptom_data["disease"]

# Heart disease dataset
X_heart = heart_data.drop(columns=["target"])
y_heart = heart_data["target"]

In [3]:
from sklearn.model_selection import train_test_split

# Symptom data split
X_sym_train, X_sym_test, y_sym_train, y_sym_test = train_test_split(X_symptom, y_symptom, test_size=0.2, random_state=42)

# Heart disease data split
X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(X_heart, y_heart, test_size=0.2, random_state=42)

In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Models for symptom-based prediction
models_symptom = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB()
}

# Models for heart disease prediction
models_heart = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier()
}

In [5]:
import joblib, os

# Train symptom models
print("  Symptom-Based Disease Prediction:")
for name, model in models_symptom.items():
    model.fit(X_sym_train, y_sym_train)
    acc = model.score(X_sym_test, y_sym_test)
    print(f"{name} Accuracy: {acc * 100:.2f}%")
    joblib.dump(model, f"model/{name.replace(' ', '_').lower()}_symptom_model.pkl")
    print(f"  {name} model saved.\n")

# Train heart disease models
print(" Heart Disease Prediction:")
for name, model in models_heart.items():
    model.fit(X_heart_train, y_heart_train)
    acc = model.score(X_heart_test, y_heart_test)
    print(f"{name} Accuracy: {acc * 100:.2f}%")
    joblib.dump(model, f"model/{name.replace(' ', '_').lower()}_heart_model.pkl")
    print(f"  {name} model saved.\n")

  Symptom-Based Disease Prediction:
Decision Tree Accuracy: 0.00%
  Decision Tree model saved.

Random Forest Accuracy: 0.00%
  Random Forest model saved.

Naive Bayes Accuracy: 0.00%
  Naive Bayes model saved.

 Heart Disease Prediction:
Logistic Regression Accuracy: 92.59%
  Logistic Regression model saved.

Random Forest Accuracy: 87.04%
  Random Forest model saved.

SVM Accuracy: 62.96%
  SVM model saved.

K-Nearest Neighbors Accuracy: 62.96%
  K-Nearest Neighbors model saved.



In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load datasets
symptom_data = pd.read_excel(r"D:\Z\medbot_2\data\data.xlsx")
heart_data = pd.read_excel(r"D:\Z\medbot_2\data\Heart_disease_statlog.xlsx")

print("✅ Data Loaded Successfully\n")
print("Symptom Dataset Sample:")
display(symptom_data.head())

print("\nHeart Disease Dataset Sample:")
display(heart_data.head())

# Split datasets
X_symptom = symptom_data.drop(columns=["disease"])
y_symptom = symptom_data["disease"]

X_heart = heart_data.drop(columns=["target"])
y_heart = heart_data["target"]

# Standardize heart dataset (helps SVM, LogisticRegression)
scaler = StandardScaler()
X_heart_scaled = scaler.fit_transform(X_heart)

# Split data
X_sym_train, X_sym_test, y_sym_train, y_sym_test = train_test_split(
    X_symptom, y_symptom, test_size=0.2, random_state=42)

X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(
    X_heart_scaled, y_heart, test_size=0.2, random_state=42)


✅ Data Loaded Successfully

Symptom Dataset Sample:


Unnamed: 0,fever,cough,fatigue,headache,sore_throat,chest_pain,shortness_of_breath,disease
0,1,1,1,0,0,0,0,Common Cold
1,1,0,1,1,1,0,0,Flu
2,0,0,0,0,0,1,1,Pneumonia
3,1,1,1,1,0,1,1,COVID-19
4,0,0,0,1,0,0,0,Migraine



Heart Disease Dataset Sample:


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,70,1,3,130,322,0,2,109,0,2.4,1,3,1,1
1,67,0,2,115,564,0,2,160,0,1.6,1,0,3,0
2,57,1,1,124,261,0,0,141,0,0.3,0,0,3,1
3,64,1,3,128,263,0,0,105,1,0.2,1,1,3,0
4,74,0,1,120,269,0,2,121,1,0.2,0,1,1,0


In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib, os

os.makedirs("model", exist_ok=True)

# Models
models_symptom = {
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Naive Bayes": GaussianNB()
}

models_heart = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

# Training
print("🔹 Training Symptom-Based Disease Models:\n")
for name, model in models_symptom.items():
    model.fit(X_sym_train, y_sym_train)
    preds = model.predict(X_sym_test)
    acc = accuracy_score(y_sym_test, preds)
    print(f"{name}: {acc*100:.2f}%")
    print(classification_report(y_sym_test, preds))
    joblib.dump(model, f"model/{name.replace(' ', '_').lower()}_symptom_model.pkl")

print("\n🔹 Training Heart Disease Models:\n")
for name, model in models_heart.items():
    model.fit(X_heart_train, y_heart_train)
    preds = model.predict(X_heart_test)
    acc = accuracy_score(y_heart_test, preds)
    print(f"{name}: {acc*100:.2f}%")
    print(classification_report(y_heart_test, preds))
    joblib.dump(model, f"model/{name.replace(' ', '_').lower()}_heart_model.pkl")

# Save the scaler too
joblib.dump(scaler, "model/heart_scaler.pkl")


🔹 Training Symptom-Based Disease Models:

Decision Tree: 0.00%
                  precision    recall  f1-score   support

     Common Cold       0.00      0.00      0.00       0.0
          Dengue       0.00      0.00      0.00       1.0
             Flu       0.00      0.00      0.00       1.0
Throat Infection       0.00      0.00      0.00       0.0

        accuracy                           0.00       2.0
       macro avg       0.00      0.00      0.00       2.0
    weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Random Forest: 0.00%
                  precision    recall  f1-score   support

        COVID-19       0.00      0.00      0.00       0.0
          Dengue       0.00      0.00      0.00       1.0
             Flu       0.00      0.00      0.00       1.0
Throat Infection       0.00      0.00      0.00       0.0

        accuracy                           0.00       2.0
       macro avg       0.00      0.00      0.00       2.0
    weighted avg       0.00      0.00      0.00       2.0

Naive Bayes: 0.00%
                  precision    recall  f1-score   support

        COVID-19       0.00      0.00      0.00       0.0
          Dengue       0.00      0.00      0.00       1.0
             Flu       0.00      0.00      0.00       1.0
Throat Infection       0.00      0.00      0.00       0.0

        accuracy                           0.00       2.0
       macro avg       0.00      0.00      0.00       2.0
    weighted avg       0.00      0.00      0.00       2.0


🔹 Training Heart Disease 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Random Forest: 85.19%
              precision    recall  f1-score   support

           0       0.84      0.94      0.89        33
           1       0.88      0.71      0.79        21

    accuracy                           0.85        54
   macro avg       0.86      0.83      0.84        54
weighted avg       0.86      0.85      0.85        54

SVM: 88.89%
              precision    recall  f1-score   support

           0       0.91      0.91      0.91        33
           1       0.86      0.86      0.86        21

    accuracy                           0.89        54
   macro avg       0.88      0.88      0.88        54
weighted avg       0.89      0.89      0.89        54

KNN: 79.63%
              precision    recall  f1-score   support

           0       0.79      0.91      0.85        33
           1       0.81      0.62      0.70        21

    accuracy                           0.80        54
   macro avg       0.80      0.76      0.77        54
weighted avg       0.80     

['model/heart_scaler.pkl']

In [None]:
import numpy as np

def load_models():
    models = {
        "symptom_model": joblib.load("model/random_forest_symptom_model.pkl"),
        "heart_model": joblib.load("model/logistic_regression_heart_model.pkl"),
        "heart_scaler": joblib.load("model/heart_scaler.pkl")
    }
    return models

def predict_disease_from_symptoms(model, symptom_input):
    """
    symptom_input: list or numpy array with symptom indicators (same order as training features)
    """
    input_data = np.array(symptom_input).reshape(1, -1)
    prediction = model.predict(input_data)[0]
    return prediction

def predict_heart_disease(model, scaler, patient_data):
    """
    patient_data: list like [age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak, slope, ca, thal]
    """
    input_scaled = scaler.transform(np.array(patient_data).reshape(1, -1))
    prediction = model.predict(input_scaled)[0]
    return "Heart Disease Detected" if prediction == 1 else "No Heart Disease"


In [None]:
import joblib
model = ...  # Train your model here
joblib.dump(model, 'models/medbot_model.pkl')