In [1]:
!pip install -q sentence-transformers

In [2]:
import pandas as pd
import pickle
import os
from sentence_transformers import SentenceTransformer

# Create model directory if it doesn't exist
os.makedirs("../model", exist_ok=True)

In [3]:
# Load the CSV file
df = pd.read_csv("../data/Final_Augmented_dataset_Diseases_and_Symptoms.csv")

# Extract all symptom column names (excluding 'diseases')
all_symptoms = df.columns.tolist()
all_symptoms.remove("diseases")

# Save the list of symptoms for use in app
with open("../model/model_all_symptoms.pkl", "wb") as f:
    pickle.dump(all_symptoms, f)

In [4]:
# Load pretrained SentenceTransformer model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Create a dictionary mapping symptom → vector
symptom_embeddings = {symptom: model.encode(symptom) for symptom in all_symptoms}

# Save the embeddings
with open("../model/model_symptom_embeddings.pkl", "wb") as f:
    pickle.dump(symptom_embeddings, f)

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from sentence_transformers import util, SentenceTransformer
import pickle

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load models
with open("../model/model_symptom_embeddings.pkl", "rb") as f:
    symptom_embeddings = pickle.load(f)

symptom_list = list(symptom_embeddings.keys())
symptom_vectors = list(symptom_embeddings.values())

model = SentenceTransformer("all-MiniLM-L6-v2")

# Clean and extract keywords from user message
def clean_input(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # remove punctuation
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return " ".join(filtered)

# Match to top symptoms
def extract_symptoms_cleaned(user_input, top_k=5):
    cleaned = clean_input(user_input)
    input_vec = model.encode(cleaned, convert_to_tensor=True)
    input_vec = input_vec.cpu()  # ✅ force to CPU to match stored embeddings

    scores = [(symptom, float(util.cos_sim(input_vec, vec))) for symptom, vec in symptom_embeddings.items()]
    top_matches = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]
    return top_matches

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeetshah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
test_input = "I have a sore throat and a really bad chest pain"
print("Cleaned:", clean_input(test_input))
for symptom, score in extract_symptoms_cleaned(test_input):
    print(f"{symptom} → score: {score:.4f}")

Cleaned: sore throat really bad chest pain


RuntimeError: Tensor for argument #2 'mat2' is on CPU, but expected it to be on GPU (while checking arguments for mm)