In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib  # to save the model later


In [2]:
df_model = pd.read_csv("my_data.csv")
df_model.head()

Unnamed: 0,symptoms_text,diseases
0,"anxiety and nervousness, shortness of breath, ...",panic_disorder
1,"shortness of breath, depressive or psychotic s...",panic_disorder
2,"anxiety and nervousness, depression, shortness...",panic_disorder
3,"anxiety and nervousness, depressive or psychot...",panic_disorder
4,"anxiety and nervousness, depression, insomnia,...",panic_disorder


In [3]:
df_model.shape

(114312, 2)

In [4]:
# Assuming df_model has "symptoms_text" and "diseases" columns
X = df_model["symptoms_text"]
y = df_model["diseases"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [5]:
vectorizer = TfidfVectorizer(
    max_features=3000,      # limits features for speed
    ngram_range=(1, 2),     # captures phrases like "chest pain"
    stop_words="english"    # remove common words
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [6]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [7]:
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


                                             precision    recall  f1-score   support

                          actinic_keratosis       0.92      0.68      0.78       182
                        acute_bronchiolitis       0.92      0.93      0.92       241
                           acute_bronchitis       0.82      0.71      0.76       243
                         acute_bronchospasm       0.64      0.73      0.68       181
                        acute_kidney_injury       0.95      0.98      0.97       182
                         acute_pancreatitis       0.95      0.88      0.92       241
                            acute_sinusitis       0.83      0.90      0.86       181
                      acute_stress_reaction       0.90      0.96      0.93       182
                                    allergy       0.99      0.97      0.98       181
                                     angina       0.98      0.96      0.97       181
                                    anxiety       0.96      0.92

In [8]:
from sklearn.metrics import accuracy_score

# Train accuracy
y_train_pred = model.predict(X_train_tfidf)
train_acc = accuracy_score(y_train, y_train_pred)
print(f"Training Accuracy: {train_acc:.4f}")

# Test accuracy
y_test_pred = model.predict(X_test_tfidf)
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {test_acc:.4f}")


Training Accuracy: 0.8940
Test Accuracy: 0.8836


In [10]:
def predict_disease(symptoms_text):
    text_tfidf = vectorizer.transform([symptoms_text])
    prediction = model.predict(text_tfidf)[0]
    return prediction

# Example:
user_input = " anxiety and nervousness, shortness of breath,depression ,insomnia"
print("Predicted disease:", predict_disease(user_input))


Predicted disease: acute_stress_reaction


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Load your data
# df_model has columns: 'symptoms_text' and 'diseases'

# 2. Vectorize symptoms
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2), stop_words='english')
X_tfidf = vectorizer.fit_transform(df_model['symptoms_text'])

# 3. Chatbot function
def medical_chatbot(user_input):
    # Convert user input to TF-IDF
    input_vec = vectorizer.transform([user_input])
    
    # Compute similarity with all dataset symptoms
    similarities = cosine_similarity(input_vec, X_tfidf).flatten()
    
    # Get the most similar symptom index
    idx = similarities.argmax()
    
    # Return the corresponding disease
    return df_model.iloc[idx]['diseases']




In [14]:
# 4. Try it
user_input = "dizziness ,insomnia"
predicted_disease = medical_chatbot(user_input)
print("Predicted disease:", predicted_disease)

Predicted disease: acute_stress_reaction


In [15]:
from sklearn.metrics import accuracy_score, classification_report

# Assuming your dataset is already split
# For demonstration, let's use a simple 80/20 split
from sklearn.model_selection import train_test_split

X = df_model['symptoms_text']
y = df_model['diseases']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Refit vectorizer only on training data
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

# Update chatbot function to use training TF-IDF
def medical_chatbot(user_input):
    input_vec = vectorizer.transform([user_input])
    similarities = cosine_similarity(input_vec, X_train_tfidf).flatten()
    idx = similarities.argmax()
    return X_train.iloc[idx], y_train.iloc[idx]  # return matched symptoms + disease

# Evaluate on test set
y_pred = [medical_chatbot(text)[1] for text in X_test]  # only take predicted disease

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Test Accuracy: 0.8299

Classification Report:
                                             precision    recall  f1-score   support

                          actinic_keratosis       0.72      0.64      0.68       182
                        acute_bronchiolitis       0.89      0.91      0.90       241
                           acute_bronchitis       0.66      0.71      0.68       243
                         acute_bronchospasm       0.56      0.61      0.58       181
                        acute_kidney_injury       0.94      0.99      0.97       182
                         acute_pancreatitis       0.90      0.89      0.89       241
                            acute_sinusitis       0.81      0.81      0.81       181
                      acute_stress_reaction       0.87      0.88      0.88       182
                                    allergy       0.93      0.96      0.94       181
                                     angina       0.95      0.93      0.94       181
                  

In [16]:
import random


In [17]:

X = df_model['symptoms_text']
y = df_model['diseases']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -------------------------------
# 4️⃣ TF-IDF vectorizer (fit only on train)
# -------------------------------
vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)

# -------------------------------
# 5️⃣ Define chatbot function
# -------------------------------
# Rule-based greetings and fallbacks
greetings = ["hello", "hi", "hey", "good morning", "good evening"]
greeting_responses = [
    "Hello! How can I help you with your symptoms today?", 
    "Hi there! Tell me your symptoms so I can assist.",
    "Hey! What symptoms are you experiencing?"
]

fallback_responses = [
    "I'm here to help with medical symptoms and possible diseases.",
    "Please tell me your symptoms so I can assist you."
]

In [18]:
# Main chatbot function
def medical_chatbot(user_input):
    user_input_lower = user_input.lower()
    
    # 1. Greetings
    if any(word in user_input_lower for word in greetings):
        return random.choice(greeting_responses)
    
    # 2. Non-medical queries 
    symptom_keywords = ["pain", "fever", "cough", "dizzy", "headache", "shortness", "nausea"]
    if not any(word in user_input_lower for word in symptom_keywords):
        return random.choice(fallback_responses)
    
    # 3. TF-IDF retrieval
    input_vec = vectorizer.transform([user_input])
    similarities = cosine_similarity(input_vec, X_train_tfidf).flatten()
    idx = similarities.argmax()
    
    return f"Based on your symptoms, you might have: {y_train.iloc[idx]}"

#Evaluate chatbot performan
y_pred = [medical_chatbot(text) for text in X_test]

# Remove "Based on your symptoms, you might have: " prefix to match labels
y_pred_clean = [pred.replace("Based on your symptoms, you might have: ", "") for pred in y_pred]


In [28]:

y_pred_clean = [pred.replace("Based on your symptoms, you might have: ", "") for pred in y_pred]

accuracy = accuracy_score(y_test, y_pred_clean)
print(f"Test Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred_clean))
      
# Optional: interactive chat loop
print("\nMedical Chatbot: type 'exit' to quit")
while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    response = medical_chatbot(user_input)
    print("Chatbot:", response)

Test Accuracy: 0.4953

Classification Report:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


                                                               precision    recall  f1-score   support

          Hello! How can I help you with your symptoms today?       0.00      0.00      0.00         0
                     Hey! What symptoms are you experiencing?       0.00      0.00      0.00         0
             Hi there! Tell me your symptoms so I can assist.       0.00      0.00      0.00         0
I'm here to help with medical symptoms and possible diseases.       0.00      0.00      0.00         0
            Please tell me your symptoms so I can assist you.       0.00      0.00      0.00         0
                                            actinic_keratosis       0.00      0.00      0.00       182
                                          acute_bronchiolitis       0.87      0.41      0.56       241
                                             acute_bronchitis       0.50      0.14      0.22       243
                                           acute_bronchospasm       0.52

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


You:  hello


Chatbot: Hello! How can I help you with your symptoms today?


You:  i have a cough,headache,sneezing


Chatbot: Based on your symptoms, you might have: seasonal_allergies_hay_fever_


You:  exit


Chatbot: Goodbye!


In [19]:
import pickle
import pandas as pd

# Save TF-IDF vectorizer
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# Save training data
train_data = pd.DataFrame({
    "symptoms_text": X_train,   # or df_model['symptoms_text']
    "diseases": y_train         # or df_model['diseases']
})
train_data.to_csv("train_data.csv", index=False)


In [20]:
import pickle
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import random

# Load vectorizer and training data
with open("tfidf_vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

train_data = pd.read_csv("train_data.csv")
X_train = train_data['symptoms_text']
y_train = train_data['diseases']

greetings = ["hello", "hi", "hey", "good morning", "good evening"]
greeting_responses = [
    "Hello! How can I help you with your symptoms today?", 
    "Hi there! Tell me your symptoms so I can assist.",
    "Hey! What symptoms are you experiencing?"
]
fallback_responses = [
    "I'm here to help with medical symptoms and possible diseases.",
    "Please tell me your symptoms so I can assist you."
]

def medical_chatbot(user_input):
    user_input_lower = user_input.lower()
    if any(word in user_input_lower for word in greetings):
        return random.choice(greeting_responses)
    
    symptom_keywords = ["pain", "fever", "cough", "dizzy", "headache", "shortness", "nausea"]
    if not any(word in user_input_lower for word in symptom_keywords):
        return random.choice(fallback_responses)
    
    input_vec = vectorizer.transform([user_input])
    similarities = cosine_similarity(input_vec, vectorizer.transform(X_train)).flatten()
    idx = similarities.argmax()
    
    return f"Based on your symptoms, you might have: {y_train.iloc[idx]}"


In [21]:
from medical_chatbot import medical_chatbot

while True:
    user_input = input("You: ")
    if user_input.lower() == "exit":
        print("Chatbot: Goodbye!")
        break
    print("Chatbot:", medical_chatbot(user_input))


ModuleNotFoundError: No module named 'medical_chatbot'