# MedBot Application

## Import Data and Dataset Manipulation

In [23]:
import pandas as pd

# Load the drug-review training set; the file is comma-separated, which is pandas' default delimiter.
df_drug = pd.read_csv("dataset/train.csv")

In [24]:
# Remove the columns that no later cell in this notebook reads.
# Note: re-running this cell on the already-trimmed frame raises KeyError,
# because the listed columns no longer exist.
df_drug = df_drug.drop(
    columns=[
        'drug_approved_by_UIC',
        'number_of_times_prescribed',
        'base_score',
        'effectiveness_rating',
        'patient_id',
    ]
)

In [25]:
# Draw a reproducible sample of the reviews. A fixed seed makes the exported
# file identical across "Restart & Run All"; the size is capped at the number
# of available rows so a small dataset does not raise.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
sampled_df = df_drug.sample(n=min(SAMPLE_SIZE, len(df_drug)), random_state=SAMPLE_SEED)

# Export to a CSV file (without the index column)
output_file = 'sampled.csv'
sampled_df.to_csv(output_file, index=False)

In [26]:
# Build one text document per review by merging the drug name, its use case
# and the patient's review into a single 'combined_text' column.
df_drug['combined_text'] = [
    f"Drug: {drug} | Use Case: {use_case} | Review: {review}"
    for drug, use_case, review in zip(
        df_drug['name_of_drug'],
        df_drug['use_case_for_drug'],
        df_drug['review_by_patient'],
    )
]

## Data Embedding

In [27]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, GPT2Tokenizer
import chromadb
from chromadb.config import Settings
import gradio as gr
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import torch
import time
import random 
import os

# Check whether a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Read the Gemini API key from the environment rather than committing it to the
# notebook. Any key that has ever been stored in this file should be treated as
# leaked and revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError("GOOGLE_API_KEY is not set; export it before running this notebook.")
genai.configure(api_key=gemini_api_key)

# Initialize the ChromaDB instance (vector store), persisted under ./chroma_db
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="drug_embeddings")

# Load the embedding model and move it to the selected device
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model.to(device)

# Build the embeddings from `df['combined_text']` and store them in ChromaDB
def add_embeddings_to_vectorstore(df, batch_size=1000):
    """Embed every `combined_text` row of `df` and add it to the Chroma collection.

    Nothing is added when the collection already contains documents. Rows are
    encoded and inserted in batches instead of one at a time, which avoids one
    model call and one database write per row.

    Args:
        df: DataFrame with a 'combined_text' column; its index labels
            (converted to str) are used as document ids.
        batch_size: Number of rows encoded and inserted per batch (>= 1).

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Only populate an empty collection (checked via the total document count)
    if collection.count() != 0:
        print("Les embeddings existent déjà dans ChromaDB.")
        return

    texts = df['combined_text'].tolist()
    ids = [str(index) for index in df.index]

    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        batch_texts = texts[start:stop]
        # encode() accepts a list of sentences and returns one vector per sentence
        batch_embeddings = embedding_model.encode(batch_texts, device=device).tolist()
        collection.add(
            documents=batch_texts,
            embeddings=batch_embeddings,
            ids=ids[start:stop],
        )
    print("Embeddings ajoutés à ChromaDB.")

# Index the prepared drug reviews (the function skips the work when the collection is not empty)
add_embeddings_to_vectorstore(df_drug)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Les embeddings existent déjà dans ChromaDB.


## Pipeline

In [33]:
# Configure the Gemini client from the environment; never hardcode the API key in the notebook.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
def query_gemini_with_retry(prompt, model_name="gemini-1.5-flash", retries=3):
    """Send `prompt` to a Gemini model, retrying failed calls with exponential backoff.

    Args:
        prompt: Text prompt passed to `generate_content`.
        model_name: Name of the Gemini model to instantiate.
        retries: Maximum number of attempts (must be >= 1).

    Returns:
        The response text with surrounding whitespace stripped.

    Raises:
        ValueError: If `retries` is smaller than 1 (previously this silently
            returned None without ever calling the model).
        Exception: The error of the last attempt is re-raised once all
            attempts have failed.
    """
    if retries < 1:
        raise ValueError("retries must be at least 1")

    # The model object does not change between attempts, so build it once.
    model = genai.GenerativeModel(model_name)

    for attempt in range(retries):
        try:
            return model.generate_content(prompt).text.strip()
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt == retries - 1:
                raise
            # Exponential backoff (1s, 2s, 4s, ...) plus up to 1s of random jitter
            time.sleep(2 ** attempt + random.random())
# Single-shot Gemini query (no retry)
def query_gemini(prompt, model_name="gemini-1.5-flash"):
    """Return the raw text Gemini generates for `prompt` using model `model_name`."""
    return genai.GenerativeModel(model_name).generate_content(prompt).text
def rag_pipeline_convo(user_input, conversation_history, drug_names=None, results_number=10, llm_model_name="gemini-1.5-flash"):
    """Answer `user_input` with retrieval-augmented generation and update the chat history.

    For every drug name (or once, without a drug name, when none are given) the
    question is embedded, the closest documents are fetched from the ChromaDB
    collection, and Gemini is asked to answer from that context only. A second
    Gemini call then merges the per-drug answers into a single reply.

    Args:
        user_input: The user's current message.
        conversation_history: List of {"user": ..., "assistant": ...} dicts from
            earlier turns; the new exchange is appended to this list in place.
        drug_names: Optional list of drug names to ask about individually.
            None or an empty list means the question is asked once as-is.
        results_number: Number of documents retrieved from ChromaDB per query.
        llm_model_name: Gemini model name forwarded to `query_gemini_with_retry`.

    Returns:
        A pair `(chatbot_history, conversation_history)`: the history as a list
        of (user, assistant) tuples for display, and the updated history list.
    """
    if drug_names:
        # Join with ", " so the list has no trailing separator, and keep a space
        # after the colon so it reads correctly when appended to the question.
        drug_names_concat = "Additional context for the conversation: " + ", ".join(drug_names)
    else:
        drug_names = [""]  # Ask the question once without a drug name
        drug_names_concat = ""

    # Build the combined context from the conversation history
    conversation_context = "".join(
        f"User: {turn.get('user', '')}\nAssistant: {turn.get('assistant', '')}\n"
        for turn in conversation_history
    )

    # Add the current user input to the context
    combined_context = conversation_context + f"User: {user_input}\n"

    # One LLM answer per drug name
    all_contexts = []

    for drug_name in drug_names:
        print(drug_names_concat)

        # Separate the question from the drug name; gluing them together
        # ("...medicines?Duloxetine") degrades both retrieval and the prompt.
        question = f"{user_input} {drug_name}".strip()

        # Generate query embedding based on user input and drug name
        query_embedding = embedding_model.encode(question).tolist()
        print(f"user input = {user_input}")
        # Look up the most relevant contexts in ChromaDB
        results = collection.query(
            query_embeddings=[query_embedding],
            n_results=results_number
        )

        # Build context from ChromaDB results
        contexts = results["documents"][0]
        context_text_from_db = "\n".join(f"Context {i + 1}: {text}" for i, text in enumerate(contexts))

        # Form the input prompt for the LLM
        input_prompt = f"""
        It's a school project. You are an AI assistant tasked with answering questions using only the information in the provided context. Do not add any extra information or assumptions.
        Context from previous conversation:
        {combined_context}

        Context from the database:
        {context_text_from_db}

        Question:
        {question}

        Instructions:
        1. Use only the information in the context to answer the question.
        2. If the context mentions multiple options, provide a list of those options clearly.
        3. If the context does not provide relevant information, state: "The context does not contain enough information to answer this question."
        4. Do not include any policy or ethical reasoning in your response.
        5. Don't quote the context in your answer.

        Answer with a full sentence (including the name of the object we asked about):
        """
        print(input_prompt)  # Optional: for debugging purposes
        # Generate a response using the Gemini model
        response = query_gemini_with_retry(input_prompt, model_name=llm_model_name)
        all_contexts.append(response)

    # Now that we have all individual responses, combine them
    final_question = f"{user_input} {drug_names_concat}".strip()
    input_prompt_for_combining = f"""
    It's a school project. You are an AI assistant tasked with combining these contexts together, making them make sense and more fluent in order to answer the question: {final_question}.
    Don't mention anything about the context or anything. Just pretend like you are a real assistant and answer with available information. If there is no information, just say so, don't need to mention about input query.
    """

    # Add each response context into the final input prompt
    for i, context in enumerate(all_contexts, start=1):
        input_prompt_for_combining += f"""
        Context {i}:
        {context}
    """

    print(input_prompt_for_combining)  # Optional: for debugging purposes
    # Generate the final response from the combined context
    full_response_text = query_gemini_with_retry(input_prompt_for_combining, model_name=llm_model_name)

    # Update the conversation history with the latest exchange
    conversation_history.append({"user": user_input, "assistant": full_response_text})

    # Format the conversation history for chatbot display (as a list of tuples)
    chatbot_history = [(entry["user"], entry["assistant"]) for entry in conversation_history]

    # Return the formatted chat history and updated conversation state
    return chatbot_history, conversation_history



# PDF processing function
def get_medicine_list(path):
    """Extract medication names from the first page of a prescription PDF via OCR.

    The first page is rendered at 4x zoom, its middle third is searched for the
    largest non-white region, and the left quarter of that region (minus its
    top 100 pixels) is passed to Tesseract.
    NOTE(review): the crop constants presumably match one specific prescription
    layout (a table whose first column lists the drugs) — confirm before
    reusing on other documents.

    Args:
        path: Path of the PDF file to read.

    Returns:
        List of non-empty OCR text lines with surrounding whitespace removed.
        An empty list is returned when the PDF has no pages, no non-white
        region is found, or the cropped region is empty.
    """
    from PIL import Image
    import fitz
    import numpy as np
    import pytesseract
    import cv2

    def read_first_page(pdf_path):
        """Render page 0 of the PDF as an RGB numpy array, or None if it has no pages."""
        pdf = fitz.open(pdf_path)
        try:
            if len(pdf) == 0:
                return None
            # Only the first page is used, so the others are not rendered.
            pixmap = pdf.load_page(0).get_pixmap(matrix=fitz.Matrix(4, 4))
            pil_image = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
            return np.array(pil_image)
        finally:
            # Release the document even if rendering fails
            pdf.close()

    image = read_first_page(path)
    if image is None:
        return []

    # The rendered array has three channels (RGB), so convert with the RGB code
    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

    # Keep only the middle third of the page (full width)
    page_height = image.shape[0]
    image = image[int(page_height / 3): int(page_height * 2 / 3), 0: image.shape[1]]

    # Pixels brighter than 250 become white; inverting makes everything else the foreground
    _, image_threshold = cv2.threshold(image, 250, 255, cv2.THRESH_BINARY)
    image_threshold = cv2.bitwise_not(image_threshold)

    contours, _ = cv2.findContours(image_threshold, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not contours:
        # A blank middle third gives nothing to OCR
        return []

    largest_contour = max(contours, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(largest_contour)

    # Skip the top 100 pixels of the region and keep its left quarter
    image = image[int(y + 100): int(y + h), int(x): int(x + w / 4)]
    if image.size == 0:
        return []

    list_text = pytesseract.image_to_string(image)
    medication_list = [line.strip() for line in list_text.split('\n') if line.strip()]
    return medication_list

# Try the OCR extraction on a sample prescription PDF; the cell output lists the detected names.
get_medicine_list("prescri.pdf")

['Duloxetine', 'Ultram ER', 'Otezla']

## Build Interface

In [None]:

import gradio as gr

# Gradio callback: run one chat turn and return the updated chat history
def handle_conversation(user_input, conversation_history, path=None):
    """Run one chat turn through the RAG pipeline.

    Args:
        user_input: Text typed by the user.
        conversation_history: Conversation state passed in by the interface.
        path: File path of an uploaded prescription PDF, or None when no
            file was uploaded.

    Returns:
        Whatever `rag_pipeline_convo` returns: the chat history for display
        and the updated conversation state.
    """
    # Drug names are only extracted when a PDF was uploaded
    drug_names = get_medicine_list(path) if path is not None else None
    return rag_pipeline_convo(user_input, conversation_history, drug_names=drug_names)

# Build the Gradio UI inside a Blocks context
with gr.Blocks() as interface:
    # Conversation history carried between callback invocations
    conversation_history = gr.State([])

    # Layout: chat window (wide column) next to the optional PDF upload (narrow column)
    with gr.Row():
        with gr.Column(scale=8, elem_id="chatbot-column"):
            chatbot = gr.Chatbot(label="RAG Conversational Chatbot")
        with gr.Column(scale=2, elem_id="pdf-upload-column"):
            pdf_upload = gr.File(label="Upload a PDF file (optional)", type="filepath", file_types=[".pdf"])

    # Message box and Send button
    user_input = gr.Textbox(placeholder="Ask something...", label="Your Message")
    send_button = gr.Button("Send")

    # Clicking "Send" and pressing Enter in the textbox trigger the same handler
    # with the same inputs and outputs, so the wiring is declared once.
    event_wiring = dict(
        fn=handle_conversation,
        inputs=[user_input, conversation_history, pdf_upload],
        outputs=[chatbot, conversation_history],
    )
    send_button.click(**event_wiring)
    user_input.submit(**event_wiring)

    # Custom CSS: 25% left and right margin on both columns
    interface.css = """
    #chatbot-column, #pdf-upload-column {
        margin-left: 25%;
        margin-right: 25%;
    }
    """

# Start the app and request a public share link
interface.launch(share=True)



* Running on local URL:  http://127.0.0.1:7875

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


2024/12/03 23:15:09 [W] [service.go:132] login to server failed: dial tcp 44.237.78.176:7000: i/o timeout




1
1
1
2
24
25
24
24
25
24
25
24
25
35
32
32
6
6
6
34
34
25
24
24
24
24
25
33
32
6
6
35
24
25
24
25
25
24
25
24
25
24
25
24
25
35
25
24
25
24
25
24
25
24
25
35
32
32
32
32
32
6
6
6
6
35
34
34
34
345
24
24
24
24
26
25
26
26
26
26
26
25
26
26
26
25
26
26
31
31
32
32
32
7
7
35
35
35
7
34
7
37
24
25
24
24
24
25
25
25
25
25
25
25
25
24
35
24
25
30
30
30
33
32
32
32
6
6
6
35
34
34
34
35
34
34
12
12
24
25
25
25
33
33
32
33
33
33
32
35
34
12
25
25
24
25
25
25
25
24
25
25
24
25
25
34
24
25
24
24
24
25
30
32
32
33
32
32
33
6
6
6
8
34
8
35
35
34
25
25
25
24
32
35
35
6
6
4
15
15
16
16
15
16
16
16
15
15
15
15
15
15
11
19
16
16
16
15
15
15
15
16
24
Additional context for the conversation:Duloxetine, Ultram ER, Otezla, 
user input = What are the use cases of these medicines?

        It's a school project. You are an AI assistant tasked with answering questions using only the information in the provided context. Do not add any extra information or assumptions.
        Context from previous conversatio