<a href="https://colab.research.google.com/github/Vishesh-Goyal7/NeuroMechs/blob/Chatbot/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install dependencies
!pip install transformers accelerate sentencepiece pandas

In [None]:
# Step 2: Import libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [None]:
import os

from huggingface_hub import login

model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# SECURITY: never commit an access token to the notebook. The token is read from
# the HF_TOKEN environment variable; when it is not set, login() is called with
# token=None, which asks for the token interactively instead.
# Any token that was previously committed in this cell should be revoked.
hf_token = os.environ.get("HF_TOKEN")
login(token=hf_token)

In [None]:
# Step 3: Load the tokenizer and the half-precision model, then wrap both in a
# text-generation pipeline that the chatbot cells call as `llm_pipeline`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Mount Google Drive at /content/drive (Colab only) so files stored there can be
# read by path from the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Step 4: Load symptom-disease dataset
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/symptomMap.csv")  # Or upload your CSV
# Exclude the label column by name instead of by position, so the symptom list is
# correct whether 'Disease' is the first or the last column of the CSV.
# NOTE(review): assumes the label column is called 'Disease' — confirm against the CSV.
all_symptoms = [col for col in df.columns if col != "Disease"]

In [None]:
# Step 5: LLM helper function
def _parse_symptom_list(response):
    """Return the strings found in the last ``[...]`` group of *response*.

    The bracketed text is parsed with ``ast.literal_eval`` (never ``eval``):
    model output is untrusted and must not be executed. Returns ``[]`` when
    there is no bracketed group or its content is not a list of literals.
    """
    import ast

    if "[" not in response or "]" not in response:
        return []
    inner = response.split("[")[-1].split("]")[0]
    try:
        # Re-wrapping in brackets guarantees a list, even for a single item.
        parsed = ast.literal_eval(f"[{inner}]")
    except (ValueError, SyntaxError):
        return []
    return [item.strip() for item in parsed if isinstance(item, str)]


def ask_llm(user_symptom_input, known_symptoms):
    """Ask the LLM which known symptoms appear in the user's message.

    Args:
        user_symptom_input: Free-text message from the user.
        known_symptoms: Iterable of symptom names the model may pick from.

    Returns:
        A list of symptom strings parsed from the model's reply, or an empty
        list when the reply contains no parsable list.
    """
    symptom_list_str = ", ".join(known_symptoms)
    prompt = (
        "You are a medical chatbot. When a user tells you a symptom, "
        "identify and acknowledge it. If the user says 'done', end the list.\n\n"
        f"Known symptoms: {symptom_list_str}\n"
        f"User message: {user_symptom_input}\n"
        "Reply with only a Python list of the known symptoms the user mentioned, "
        "for example ['fever', 'cough'].\n"
    )

    # return_full_text=False keeps the prompt (which itself contains a bracketed
    # example) out of the text that is parsed below.
    outputs = llm_pipeline(prompt, max_new_tokens=64, return_full_text=False)
    response = outputs[0]["generated_text"]

    # Extract only the list from the response
    return _parse_symptom_list(response)

In [None]:
# Step 6: Function to get matching diseases
def get_matching_diseases(symptoms):
    """Return the unique diseases whose rows have every given symptom set to 1.

    Symptoms that are not column names of the global ``df`` are ignored. An
    empty ``symptoms`` argument yields an empty list. Disease names are read
    from the 'Disease' column.
    """
    if not symptoms:
        return []

    remaining = df.copy()
    for name in symptoms:
        if name not in remaining.columns:
            continue
        has_symptom = remaining[name] == 1
        remaining = remaining[has_symptom]

    unique_diseases = remaining['Disease'].unique()
    return unique_diseases.tolist()

In [None]:
from sentence_transformers import SentenceTransformer, util

# NOTE(review): this rebinds the global name `model`, which the LLM-loading cell
# also assigns. `llm_pipeline` was already built from the earlier object, but any
# code that reads `model` after this cell gets the sentence encoder instead.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Every column except the first is treated as a symptom.
# NOTE(review): assumes the label column is the first column — confirm against the CSV.
symptom_cols = df.columns[1:]
# Column names become lowercase phrases with underscores replaced by spaces.
symptom_texts = [col.replace("_", " ").lower() for col in symptom_cols]

# Encode all symptom phrases once; the matching functions compare against this tensor.
symptom_embeddings = model.encode(symptom_texts, convert_to_tensor=True)

def get_user_symptoms(user_input, top_k=5, similarity_threshold=0.4):
    """Return the symptom columns most similar to *user_input*.

    NOTE: a function with this same name is defined again in a later cell; when
    the notebook is run top to bottom, that later definition is the one in effect.

    Args:
        user_input: Free-text description to match.
        top_k: Maximum number of matches to return.
        similarity_threshold: Only symptoms whose cosine similarity is strictly
            greater than this value are considered.

    Returns:
        Up to ``top_k`` ``(symptom_column, score)`` tuples sorted by descending
        score; an empty list when nothing exceeds the threshold.
    """
    input_embedding = model.encode(user_input, convert_to_tensor=True).to(symptom_embeddings.device)
    cos_scores = util.cos_sim(input_embedding, symptom_embeddings)[0]

    # nonzero() yields one row per hit; flatten() keeps ALL hit indices (indexing
    # with [0] kept only the first one and raised IndexError when there were none).
    indices = (cos_scores > similarity_threshold).nonzero().flatten().tolist()

    matched_symptoms = [(symptom_cols[i], float(cos_scores[i])) for i in indices]
    matched_symptoms.sort(key=lambda x: x[1], reverse=True)

    return matched_symptoms[:top_k]

In [None]:
# Outside chatbot_interaction()
def rank_diseases(matched_symptoms, df):
    """Score each row of *df* by the summed weights of the symptoms it has.

    Args:
        matched_symptoms: Iterable of ``(symptom_column, weight)`` pairs.
        df: DataFrame whose first column holds the disease name and whose
            symptom columns contain 1 where the symptom applies.

    Returns:
        A list of ``(disease, score)`` tuples, one per row with a score above
        zero, sorted by descending score (ties keep row order).
    """
    # Ignore symptoms that are not columns instead of raising KeyError.
    symptom_weights = {
        symptom: weight
        for symptom, weight in dict(matched_symptoms).items()
        if symptom in df.columns
    }
    if df.shape[1] == 0 or not symptom_weights:
        return []

    # Explicit positional access; integer keys on a label-indexed row are deprecated.
    diseases = df.iloc[:, 0].tolist()
    scores = [0.0] * len(diseases)

    # Walk column by column rather than row by row with iterrows().
    for symptom, weight in symptom_weights.items():
        for position, present in enumerate((df[symptom] == 1).tolist()):
            if present:
                scores[position] += weight

    disease_scores = [
        (disease, score) for disease, score in zip(diseases, scores) if score > 0
    ]
    disease_scores.sort(key=lambda x: x[1], reverse=True)
    return disease_scores

In [None]:
def get_user_symptoms(user_input, top_k=5, similarity_threshold=0.4):
    """Return the symptom columns most similar to *user_input*.

    Args:
        user_input: Free-text description to match.
        top_k: Maximum number of matches to return.
        similarity_threshold: Only symptoms whose cosine similarity is strictly
            greater than this value are considered.

    Returns:
        Up to ``top_k`` ``(symptom_column, score)`` tuples sorted by descending
        score; an empty list when nothing exceeds the threshold.
    """
    input_embedding = model.encode(user_input, convert_to_tensor=True).to(symptom_embeddings.device)
    cos_scores = util.cos_sim(input_embedding, symptom_embeddings)[0]

    # nonzero() yields one row per hit; flatten() keeps ALL hit indices. Indexing
    # with [0] kept only the lowest-index hit, so top_k=1 did not return the best
    # match. An empty result simply produces an empty list here.
    indices = (cos_scores > similarity_threshold).nonzero().flatten().tolist()

    matched_symptoms = [(symptom_cols[i], float(cos_scores[i])) for i in indices]
    matched_symptoms.sort(key=lambda x: x[1], reverse=True)

    return matched_symptoms[:top_k]

In [None]:
def match_symptom_semantic(user_input, similarity_threshold=0.0):
    """Map each comma-separated phrase in *user_input* to its closest symptom column.

    Args:
        user_input: One or more symptom phrases separated by commas.
        similarity_threshold: A phrase is mapped to its best column only when the
            cosine similarity is strictly greater than this value. The default
            of 0.0 keeps the previous cutoff, which rejects very little; raise
            it to get ``None`` for weak matches.

    Returns:
        A dict ``{phrase: symptom_column_or_None}`` keyed by the stripped,
        lowercased phrases. Blank phrases are skipped.
    """
    # Skip empty fragments (e.g. from "fever,,cough") rather than embedding "".
    inputs = [piece.strip().lower() for piece in user_input.split(',') if piece.strip()]
    matched_symptoms = {}

    for input_symptom in inputs:
        input_embedding = model.encode(input_symptom, convert_to_tensor=True).to(symptom_embeddings.device)
        # cos_sim returns a (1, num_symptoms) matrix; take its only row.
        similarities = util.cos_sim(input_embedding, symptom_embeddings)[0]
        best_match_idx = int(similarities.argmax())
        best_match_score = similarities[best_match_idx].item()

        if best_match_score > similarity_threshold:
            matched_symptoms[input_symptom] = symptom_cols[best_match_idx]
        else:
            matched_symptoms[input_symptom] = None

    return matched_symptoms

In [None]:
import re

# Lead-in phrases that carry no symptom information; compiled once for reuse.
_FILLER_PHRASES = re.compile(
    r"\b(i have|i'm|i am|i've got|feeling|suffering from|dealing with|experiencing|got|having)\b"
)


def preprocess_symptoms(user_input):
    """Split a free-text message into cleaned, lowercase symptom phrases.

    Filler phrases such as "i have" are removed, the text is split on commas,
    semicolons and " and ", and each piece is trimmed of surrounding whitespace
    and periods with inner whitespace collapsed to single spaces.

    Args:
        user_input: Raw message typed by the user.

    Returns:
        A list of non-empty phrases in their original order.
    """
    text = _FILLER_PHRASES.sub("", user_input.lower())
    text = text.replace(" and ", ";").replace(",", ";")

    phrases = []
    for fragment in text.split(";"):
        # split()/join collapses runs of whitespace; strip(". ") then removes
        # periods and any space left next to them ("fever ." -> "fever").
        cleaned = " ".join(fragment.split()).strip(". ")
        if cleaned:  # drop fragments that were only punctuation/whitespace
            phrases.append(cleaned)

    return phrases


In [None]:
import datetime
import json

def chatbot_interaction():
    """Run the interactive symptom-checker session.

    Reads symptom descriptions from ``input()`` until the user types 'done',
    maps each phrase to a dataset symptom column, prints a short LLM remark per
    new symptom, ranks diseases by symptom overlap, prints the ranking and
    writes the result to ``medibot_results.json`` in the working directory.

    Returns:
        None. Returns early, without writing a file, when no symptom was matched.
    """
    user_symptoms = []
    matched_symptom_scores = []

    print("🤖 MediBot: Hello! I'm MediBot, your personal symptom checker powered by AI.")
    print("🤖 MediBot: You can enter multiple symptoms in one go (e.g., 'fever, headache') or one by one.")
    print("           Type 'done' when you're finished.\n")

    while True:
        user_input = input("🧑 User: ").strip()
        if user_input.lower() == 'done':
            break

        for raw_symptom in preprocess_symptoms(user_input):
            if not raw_symptom:
                continue

            matched = match_symptom_semantic(raw_symptom)

            # match_symptom_semantic returns {phrase: column or None}; unwrap the
            # first real match so the checks below compare plain strings.
            if isinstance(matched, dict):
                matched_symptom = next((value for value in matched.values() if value), None)
            else:
                matched_symptom = matched or None

            if matched_symptom is None:
                print(f"🤖 MediBot: Couldn't confidently match '{raw_symptom}'. Please rephrase or try another symptom.")
                continue

            # Already recorded: skip so a symptom is neither listed nor sent to the LLM twice.
            if matched_symptom in user_symptoms:
                continue

            user_symptoms.append(matched_symptom)

            # Score the symptom (for future use)
            top_match = get_user_symptoms(raw_symptom, top_k=1)
            if top_match:
                matched_symptom_scores.append(top_match[0])

            # Get related info from LLM
            prompt = f"User reports: {matched_symptom}. What else might be related? Keep the response concise and helpful for a medical chatbot context."
            # return_full_text=False returns only the completion; with the prompt
            # included, the "first sentence" was just the symptom name echoed back.
            response = llm_pipeline(
                prompt,
                max_new_tokens=60,
                do_sample=True,
                temperature=0.7,
                return_full_text=False,
            )[0]['generated_text']
            llm_response_text = response.strip()
            if '.' in llm_response_text:
                first_sentence = llm_response_text.split('.')[0] + '.'
            else:
                first_sentence = llm_response_text
            print(f"🤖 MediBot: Got it. You're experiencing '{matched_symptom}'. {first_sentence}")

    if not user_symptoms:
        print("\n🤖 MediBot: I couldn't understand any symptoms. Please try again with more common descriptions.")
        return

    print("\n🔍 MediBot: Analyzing your symptoms...")
    print(f"📌 Symptoms considered: {', '.join(user_symptoms)}")

    # Match diseases from dataframe (df) by symptom overlap; every symptom weighs 1.0
    disease_scores = rank_diseases([(symptom, 1.0) for symptom in user_symptoms], df)

    if disease_scores:
        print("\n🩺 MediBot: Based on your symptoms, here are some possible conditions:\n")
        for idx, (disease, score) in enumerate(disease_scores, 1):
            print(f"   {idx}. {disease.replace('_', ' ')} (match score: {score:.2f})")
    else:
        print("\n🤖 MediBot: Couldn't match these symptoms to any known conditions. Please consult a medical professional.")

    # Read the date once so the saved file and the printed footer always agree.
    today = datetime.date.today()

    # Structure the result data
    result_data = {
        "date": today.strftime('%Y-%m-%d'),
        "input_symptoms": user_symptoms,
        "possible_conditions": [
            {"disease": disease.replace('_', ' '), "match_score": round(score, 1)}
            for disease, score in disease_scores
        ]
    }

    # Save to JSON file
    with open("medibot_results.json", "w", encoding="utf-8") as f:
        json.dump(result_data, f, indent=4)

    print("\n💾 Your results have been saved to 'medibot_results.json'.")

    print(f"\n📅 Date: {today.strftime('%B %d, %Y')}")
    print("✅ Thank you for using MediBot. Stay healthy!")

# Start the interactive session; it keeps reading input until the user types 'done'.
chatbot_interaction()