<a href="https://colab.research.google.com/github/Vishesh-Goyal7/NeuroMechs/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
!pip install transformers accelerate sentencepiece pandas



In [None]:
# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Never store an access token in the notebook itself: read it from the
# HF_TOKEN environment variable, or prompt for it (input hidden) when unset.
# NOTE(review): a token was previously committed in this cell; revoke it in
# the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

In [None]:
import warnings

# Load the tokenizer and the causal LM; device_map="auto" lets accelerate
# decide where the weights are placed.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Without an explicit pad token, generate() logs
# "Setting `pad_token_id` to `eos_token_id`" on every call; set it once here.
if model.generation_config.pad_token_id is None:
    model.generation_config.pad_token_id = tokenizer.eos_token_id

# With no GPU the model ends up on the CPU, where generation is very slow.
if not torch.cuda.is_available():
    warnings.warn(
        "No CUDA GPU detected: the model will run on the CPU and text "
        "generation will be very slow. Switch to a GPU runtime if possible."
    )

llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so files stored there can be
# opened by path.
mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Step 4: Load symptom-disease dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/symptomMap.csv")  # Or upload your CSV

# Select the symptom columns by name instead of by position, so the list is
# correct wherever the 'Disease' label column sits in the CSV.
all_symptoms = [col for col in df.columns if col != "Disease"]

In [None]:
# Step 5: LLM helper function
def ask_llm(user_symptom_input, known_symptoms):
    """Ask the LLM which of the known symptoms the user's message mentions.

    Args:
        user_symptom_input: Free-text message typed by the user.
        known_symptoms: Iterable of symptom names the model may choose from.

    Returns:
        A list of stripped, non-empty symptom strings parsed from the last
        bracketed list in the model's reply, or an empty list when the reply
        contains no parseable list.
    """
    import ast
    import re

    symptom_list_str = ", ".join(known_symptoms)
    # The prompt must carry both the candidate symptoms and the user's text;
    # otherwise the model has nothing to extract from.
    prompt = (
        "You are a medical chatbot. When a user tells you a symptom, "
        "identify and acknowledge it. If the user says 'done', end the list.\n\n"
        f"Known symptoms: {symptom_list_str}\n"
        f"User: {user_symptom_input}\n"
        "Reply with a Python list of the known symptoms the user mentioned, "
        "for example ['fever', 'cough'].\n"
    )
    # return_full_text=False keeps the prompt (and its example list) out of
    # the text that is parsed below.
    response = llm_pipeline(
        prompt,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        return_full_text=False,
    )[0]['generated_text']

    # Extract only the list from the response (the last bracketed span wins).
    bracketed = re.findall(r"\[[^\[\]]*\]", response)
    if not bracketed:
        return []
    try:
        # Model output is untrusted text: literal_eval only accepts Python
        # literals, whereas eval would execute arbitrary expressions.
        extracted = ast.literal_eval(bracketed[-1])
    except (ValueError, SyntaxError, TypeError):
        return []
    return [sym.strip() for sym in extracted if isinstance(sym, str) and sym.strip()]

In [None]:
# Step 6: Function to get matching diseases
def get_matching_diseases(symptoms, disease_col='Disease'):
    """Return the diseases whose rows have every recognised symptom set to 1.

    Args:
        symptoms: Iterable of symptom column names reported by the user.
        disease_col: Name of the label column in ``df`` (default 'Disease').

    Returns:
        A list of unique disease names, in order of first appearance in
        ``df``. Empty when ``symptoms`` is empty, when none of the names is a
        symptom column of ``df``, or when no row matches all of them.
    """
    if not symptoms:
        return []

    # Keep only names that are real symptom columns (deduplicated, order
    # preserved); the label column itself is never a symptom.
    known = [s for s in dict.fromkeys(symptoms) if s in df.columns and s != disease_col]
    if not known:
        # Nothing recognised: returning the unfiltered frame here would list
        # every disease in the dataset.
        return []

    has_all_symptoms = df[known].eq(1).all(axis=1)
    return df.loc[has_all_symptoms, disease_col].unique().tolist()

In [None]:
from sentence_transformers import SentenceTransformer, util

# NOTE: this rebinds `model`, which an earlier cell used for the causal LM.
# llm_pipeline was built with its own reference to that model, but re-running
# the LLM-loading cell rebinds `model` again and this cell must then be re-run.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Select symptom columns by name rather than by position so the label column
# ('Disease') is excluded wherever it sits in the CSV.
symptom_cols = df.columns[df.columns != 'Disease']
symptom_texts = [col.replace("_", " ").lower() for col in symptom_cols]

# One embedding per symptom column, in the same order as symptom_cols.
symptom_embeddings = model.encode(symptom_texts, convert_to_tensor=True)

def get_user_symptoms(user_input, top_k=5, similarity_threshold=0.4):
    """Return the dataset symptoms most similar to ``user_input``.

    Args:
        user_input: Free-text description to embed and compare.
        top_k: Maximum number of matches to return.
        similarity_threshold: Only symptoms whose cosine similarity is
            strictly greater than this value are considered.

    Returns:
        Up to ``top_k`` ``(symptom_column, cosine_score)`` tuples sorted by
        descending score; an empty list when nothing exceeds the threshold.
    """
    input_embedding = model.encode(user_input, convert_to_tensor=True)
    cos_scores = util.cos_sim(input_embedding, symptom_embeddings)[0]

    # nonzero() yields one row per hit; flatten it so every hit is kept.
    # Taking element [0] would keep only the first hit and would raise
    # IndexError when there are no hits at all.
    indices = (cos_scores > similarity_threshold).nonzero().flatten().tolist()

    matched_symptoms = [(symptom_cols[i], float(cos_scores[i])) for i in indices]
    matched_symptoms.sort(key=lambda x: x[1], reverse=True)

    return matched_symptoms[:top_k]

In [None]:
def get_user_symptoms(user_input, top_k=5, similarity_threshold=0.4):
    """Return the dataset symptoms most similar to ``user_input``.

    Args:
        user_input: Free-text description to embed and compare.
        top_k: Maximum number of matches to return.
        similarity_threshold: Only symptoms whose cosine similarity is
            strictly greater than this value are considered.

    Returns:
        Up to ``top_k`` ``(symptom_column, cosine_score)`` tuples sorted by
        descending score; an empty list when nothing exceeds the threshold.
    """
    input_embedding = model.encode(user_input, convert_to_tensor=True)
    cos_scores = util.cos_sim(input_embedding, symptom_embeddings)[0]

    # nonzero() yields one row per hit; flatten it so every hit is kept
    # (taking element [0] would keep only the first one, making top_k
    # meaningless). An empty result flattens to an empty list.
    indices = (cos_scores > similarity_threshold).nonzero().flatten().tolist()

    matched_symptoms = [(symptom_cols[i], float(cos_scores[i])) for i in indices]
    matched_symptoms.sort(key=lambda x: x[1], reverse=True)

    return matched_symptoms[:top_k]

In [None]:
def match_symptom_semantic(user_input, similarity_threshold=0.5):
    """Map each comma-separated phrase in ``user_input`` to its closest symptom.

    Args:
        user_input: One or more symptom phrases separated by commas.
        similarity_threshold: A phrase is matched only when its best cosine
            similarity is strictly greater than this value (default 0.5).

    Returns:
        A dict keyed by the lower-cased, stripped phrase. The value is the
        best-matching entry of ``symptom_cols``, or ``None`` when the best
        score does not exceed the threshold. Blank phrases are skipped.
    """
    matched_symptoms = {}

    for fragment in user_input.split(','):
        input_symptom = fragment.strip().lower()
        if not input_symptom:
            # Blank fragments (empty input, trailing comma) carry no symptom,
            # so there is nothing meaningful to embed.
            continue

        # Move the query onto the same device as the symptom embeddings.
        input_embedding = model.encode(input_symptom, convert_to_tensor=True).to(symptom_embeddings.device)
        similarities = util.cos_sim(input_embedding, symptom_embeddings)
        best_match_idx = int(similarities.argmax())
        best_match_score = similarities[0, best_match_idx].item()

        if best_match_score > similarity_threshold:
            matched_symptoms[input_symptom] = symptom_cols[best_match_idx]
        else:
            matched_symptoms[input_symptom] = None

    return matched_symptoms

In [None]:
import re

def preprocess_symptoms(user_input):
    """Split a free-text message into cleaned candidate symptom phrases.

    The text is lower-cased, common filler expressions ("i have",
    "feeling", ...) are removed, and the remainder is split on commas and on
    " and ".

    Args:
        user_input: Raw message typed by the user.

    Returns:
        A list of non-empty phrases with surrounding spaces and periods
        removed and internal runs of whitespace collapsed to one space.
    """
    user_input = user_input.lower()
    user_input = re.sub(r"\b(i have|i'm|i am|i've got|feeling|suffering from|dealing with|experiencing|got|having)\b", "", user_input)
    user_input = user_input.replace(" and ", ";").replace(",", ";")

    phrases = []
    for piece in user_input.split(";"):
        # Collapse the gaps left by filler removal, then trim spaces and
        # periods together so "fever ." becomes "fever" rather than "fever ".
        cleaned = " ".join(piece.split()).strip(" .")
        # Filter after cleaning so a lone "." does not produce an empty phrase.
        if cleaned:
            phrases.append(cleaned)

    return phrases


In [None]:
import datetime

def chatbot_interaction():
    """Run an interactive console session that collects symptoms and lists matching diseases.

    Reads lines with ``input()`` until the user types 'done'. Each line is
    split into phrases with ``preprocess_symptoms``; each phrase is mapped to
    a dataset symptom with ``match_symptom_semantic``. Every matched symptom
    is acknowledged with a short LLM-generated follow-up. Finally the
    diseases returned by ``get_matching_diseases`` are printed.

    Returns:
        None. All results are printed.
    """
    user_symptoms = []

    print("🤖 MediBot: Hello! I'm MediBot, your personal symptom checker powered by AI.")
    print("🤖 MediBot: Please enter any symptoms you're experiencing one by one.")
    print("           Type 'done' when you're finished.\n")

    while True:
        user_input = input("🧑 User: ").strip()
        if user_input.lower() == 'done':
            break

        possible_symptoms = preprocess_symptoms(user_input)

        for raw_symptom in possible_symptoms:
            matches = match_symptom_semantic(raw_symptom)
            matched = matches.get(raw_symptom)

            if not matched:
                print(f"🤖 MediBot: Couldn't confidently match '{raw_symptom}'. Please rephrase or try another symptom.")
                continue

            # Record each symptom once even if the user repeats it.
            if matched not in user_symptoms:
                user_symptoms.append(matched)

            prompt = f"User reports: {matched}. What else might be related?"
            # return_full_text=False: by default generated_text starts with
            # the prompt, so its first line would just echo the prompt back.
            response = llm_pipeline(
                prompt,
                max_new_tokens=60,
                do_sample=True,
                temperature=0.7,
                return_full_text=False,
            )[0]['generated_text']
            # First non-empty line of the reply; the reply may be empty.
            reply_lines = [line.strip() for line in response.splitlines() if line.strip()]
            follow_up = reply_lines[0] if reply_lines else ""
            print(f"🤖 MediBot: Got it. You're experiencing '{matched}'. {follow_up}")

    if not user_symptoms:
        print("🤖 MediBot: I couldn't understand any symptoms. Please try again with more common descriptions.")
        return

    print("\n🔍 MediBot: Analyzing your symptoms...")
    print(f"📌 Symptoms considered: {', '.join(user_symptoms)}")

    # Disease prediction based on refined symptom list
    predicted_diseases = get_matching_diseases(user_symptoms)

    if predicted_diseases:
        print("\n🩺 MediBot: Based on your symptoms, here are some possible conditions:\n")
        for idx, disease in enumerate(predicted_diseases, 1):
            print(f"   {idx}. {disease}")
    else:
        print("\n🤖 MediBot: I couldn't match your symptoms to any known conditions. Please consult a medical professional.")

    print(f"\n📅 Date: {datetime.date.today().strftime('%B %d, %Y')}")
    print("✅ Thank you for using MediBot. Stay healthy!")

# Start the interactive session; it blocks on input() until the user types 'done'.
chatbot_interaction()

🤖 MediBot: Hello! I'm MediBot, your personal symptom checker powered by AI.
🤖 MediBot: Please enter any symptoms you're experiencing one by one.
           Type 'done' when you're finished.

🧑 User: cold


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 

In [None]:
def chatbot_interaction():
    """Run a simple console session: collect symptoms, then report matches and diseases.

    Reads symptoms with ``input()`` until the user types 'done', printing an
    LLM-generated reply after each one. The collected inputs are then mapped
    to dataset symptoms with ``match_symptom_semantic`` and the diseases
    returned by ``get_matching_diseases`` are printed.

    Returns:
        None. All results are printed.
    """
    user_symptoms = []
    print("MediBot: Hello! I'm here to help you understand your symptoms.")
    while True:
        # Strip so that " done " ends the session instead of being recorded
        # as a symptom.
        user_input = input("User: Enter a symptom (or type 'done' if finished): ").strip()
        if user_input.lower() == 'done':
            break
        if not user_input:
            # Nothing was typed: do not record it or spend an LLM call on it.
            continue
        user_symptoms.append(user_input)

        # Mistral for chatbot-like response:
        prompt = f"MediBot: Okay, you mentioned {user_input}. Anything else?"
        response = llm_pipeline(prompt, max_new_tokens=50, do_sample=True, temperature=0.7)[0]['generated_text']

        # Print the entire response from Mistral:
        print(response)

    matches = match_symptom_semantic(", ".join(user_symptoms))

    print("\nMediBot: Symptom Matches:")
    for k, v in matches.items():
        if v:
            print(f"Input: '{k}' ➔ Matched: '{v}'")
        else:
            print(f"Input: '{k}' ➔ No good match found.")

    # Disease prediction based on matched symptoms:
    matched_symptoms_list = [v for v in matches.values() if v is not None]
    predicted_diseases = get_matching_diseases(matched_symptoms_list)

    if predicted_diseases:
        print("\nMediBot: Possible Diseases:")
        for disease in predicted_diseases:
            print(f"- {disease}")
    else:
        print("\nMediBot: No matching diseases found in the dataset.")

# Start the interactive session; it blocks on input() until the user types 'done'.
chatbot_interaction()

MediBot: Hello! I'm here to help you understand your symptoms.
User: Enter a symptom (or type 'done' if finished): hey i have cold


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


KeyboardInterrupt: 