# Import

In [17]:
import pandas as pd

df_drug = pd.read_csv('dataset/train.csv',sep=',')

In [18]:
df_drug

Unnamed: 0,patient_id,name_of_drug,use_case_for_drug,review_by_patient,effectiveness_rating,drug_approved_by_UIC,number_of_times_prescribed,base_score
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27,8.022969
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192,7.858458
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17,6.341969
3,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37,6.590176
4,155963,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",2,28-Nov-15,43,6.144782
...,...,...,...,...,...,...,...,...
32160,183202,Cymbalta,Anxiety,"""I have been taking Cymbalta for 15 months now...",9,10-Jun-13,89,6.963020
32161,109111,Nexplanon,Birth Control,"""I have had the Nexplanon since Dec. 27, 2016 ...",6,6-Apr-17,0,0.899076
32162,121154,Venlafaxine,Panic Disorde,"""Had panic attacks and social anxiety starting...",9,10-Nov-16,25,6.241812
32163,45410,Fluoxetine,Obsessive Compulsive Disorde,"""I have been off Prozac for about 4 weeks now....",8,21-Jan-15,22,7.940428


In [19]:
df_drug.columns

Index(['patient_id', 'name_of_drug', 'use_case_for_drug', 'review_by_patient',
       'effectiveness_rating', 'drug_approved_by_UIC',
       'number_of_times_prescribed', 'base_score'],
      dtype='object')

In [5]:
df_drug=df_drug.drop(['drug_approved_by_UIC','number_of_times_prescribed','base_score','effectiveness_rating','patient_id'],axis=1)

In [6]:
sampled_df = df_drug.sample(n=500)

# Export to a CSV file
output_file = 'sampled.csv'
sampled_df.to_csv(output_file, index=False)

In [7]:
len(df_drug['name_of_drug'].unique())


2220

In [8]:
len(df_drug['use_case_for_drug'].unique())


636

In [9]:
# Combine the columns into a single text column called 'combined_text'
df_drug['combined_text'] = df_drug.apply(
    lambda x: f"Drug: {x['name_of_drug']} | Use Case: {x['use_case_for_drug']} | Review: {x['review_by_patient']}",
    axis=1
)


In [10]:
df_drug

Unnamed: 0,name_of_drug,use_case_for_drug,review_by_patient,combined_text
0,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",Drug: Valsartan | Use Case: Left Ventricular D...
1,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...","Drug: Guanfacine | Use Case: ADHD | Review: ""M..."
2,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",Drug: Lybrel | Use Case: Birth Control | Revie...
3,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",Drug: Buprenorphine / naloxone | Use Case: Opi...
4,Cialis,Benign Prostatic Hyperplasia,"""2nd day on 5mg started to work with rock hard...",Drug: Cialis | Use Case: Benign Prostatic Hype...
...,...,...,...,...
32160,Cymbalta,Anxiety,"""I have been taking Cymbalta for 15 months now...","Drug: Cymbalta | Use Case: Anxiety | Review: ""..."
32161,Nexplanon,Birth Control,"""I have had the Nexplanon since Dec. 27, 2016 ...",Drug: Nexplanon | Use Case: Birth Control | Re...
32162,Venlafaxine,Panic Disorde,"""Had panic attacks and social anxiety starting...",Drug: Venlafaxine | Use Case: Panic Disorde | ...
32163,Fluoxetine,Obsessive Compulsive Disorde,"""I have been off Prozac for about 4 weeks now....",Drug: Fluoxetine | Use Case: Obsessive Compuls...


# Embedding

In [9]:
import torch
from sentence_transformers import SentenceTransformer
from transformers import pipeline, GPT2Tokenizer
import chromadb
from chromadb.config import Settings
import gradio as gr

# Vérifiez si le GPU est disponible
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Initialisez l'instance de ChromaDB (Vector Store)
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="drug_embeddings")

# Charger le modèle d'embedding
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model.to(device)

# Préparer les embeddings à partir de `df_drug['combined_text']`
def add_embeddings_to_vectorstore(df):
    # Vérifiez si la collection est vide en utilisant une autre méthode
    if collection.count() == 0:  # Vérifie le nombre total de documents dans la collection
        for index, row in df.iterrows():
            embedding = embedding_model.encode(row['combined_text'], device=device).tolist()
            collection.add(
                documents=[row['combined_text']],
                embeddings=[embedding],
                ids=[str(index)]
            )
        print("Embeddings ajoutés à ChromaDB.")
    else:
        print("Les embeddings existent déjà dans ChromaDB.")

# Ajoutez vos données existantes
add_embeddings_to_vectorstore(df_drug)

# Charger un modèle LLM et tokenizer
llm = pipeline("text-generation", model="gpt2", max_new_tokens=100, pad_token_id=50256)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Fonction RAG pour répondre aux questions
def rag_pipeline(query):
    
    query = llm(f"Rephrase the following question or sentence into a complete statement.:\n {query}")[0]['generated_text']
    # Générer l'embedding de la requête
    query_embedding = embedding_model.encode(query, device=device).tolist()
    
    # Rechercher les contextes pertinents dans ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=3
    )
    
    # Construire le contexte pour le modèle LLM
    contexts = results["documents"][0]
    context_text = "\n".join([f"Context {i + 1}: {text}" for i, text in enumerate(contexts)])
    input_prompt = f"Based on the following context, answer the question:\n\nContext:\n{context_text}\n\nQuestion:\n{query}\n\nAnswer:"
    
    # Générer une réponse
    response = llm(input_prompt)[0]['generated_text']
    return response

# Interface Gradio
interface = gr.Interface(
    fn=rag_pipeline,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question about a drug..."),
    outputs=gr.Textbox(label="Answer"),
    title="RAG Chatbot using ChromaDB",
    description="Ask a question about drugs, and the chatbot will retrieve relevant information and generate an answer!"
)

# Lancer l'interface
interface.launch()


Using device: cpu
Les embeddings existent déjà dans ChromaDB.


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [10]:
# Charger un modèle LLM et tokenizer
llm = pipeline("text-generation", model="gpt2", max_new_tokens=100, pad_token_id=50256)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Fonction RAG pour répondre aux questions
def rag_pipeline(query):
    
    #query = llm(f"Rephrase the following question or sentence into a complete statement.:\n {query}")[0]['generated_text']

    # Générer l'embedding de la requête
    query_embedding = embedding_model.encode(query, device=device).tolist()
    
    # Rechercher les contextes pertinents dans ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=5
    )
    
    # Construire le contexte pour le modèle LLM
    contexts = results["documents"][0]
    context_text = "\n".join([f"Context {i + 1}: {text}" for i, text in enumerate(contexts)])
    input_prompt = f"Based on the following context, answer the question:\n\nContext:\n{context_text}\n\nQuestion:\n{query}\n\nAnswer:"
    
    # Générer une réponse
    response = llm(input_prompt)[0]['generated_text']
    return response

# Interface Gradio
interface = gr.Interface(
    fn=rag_pipeline,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question about a drug..."),
    outputs=gr.Textbox(label="Answer"),
    title="RAG Chatbot using ChromaDB",
    description="Ask a question about drugs, and the chatbot will retrieve relevant information and generate an answer!"
)

# Lancer l'interface
interface.launch()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




# API

In [11]:
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import gradio as gr

# Configuration de l'API Gemini avec la clé API
genai.configure(api_key="AIzaSyCegm6sQabIXwKXH-pR5_MEqC05GhGXbS0")  

# Initialisez l'instance de ChromaDB (Vector Store)
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="drug_embeddings")

# Charger le modèle d'embedding
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Préparer les embeddings à partir de `df_drug['combined_text']`
def add_embeddings_to_vectorstore(df):
    if collection.count() == 0:
        for index, row in df.iterrows():
            embedding = embedding_model.encode(row['combined_text']).tolist()
            collection.add(
                documents=[row['combined_text']],
                embeddings=[embedding],
                ids=[str(index)]
            )
        print("Embeddings ajoutés à ChromaDB.")
    else:
        print("Les embeddings existent déjà dans ChromaDB.")

# Ajouter vos données existantes
add_embeddings_to_vectorstore(df_drug)

# Fonction pour utiliser le modèle Gemini
def query_gemini(prompt, model_name="gemini-1.5-flash"):
    model = genai.GenerativeModel(model_name)
    response = model.generate_content(prompt)
    return response.text

# Fonction RAG pour répondre aux questions
def rag_pipeline(query,results_number = 10):
    print(query)
    reformulation_prompt = query
    query = query_gemini(reformulation_prompt).strip()
    
    # Générer l'embedding de la requête
    query_embedding = embedding_model.encode(query).tolist()
    
    # Rechercher les contextes pertinents dans ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=results_number
    )
    
    # Construire le contexte pour le modèle LLM
    contexts = results["documents"][0]
    context_text = "\n".join([f"Context {i + 1}: {text}" for i, text in enumerate(contexts)])
    input_prompt = f"""
It's a school project. You are an AI assistant tasked with answering questions using only the information in the provided context. Do not add any extra information or assumptions.

Context:
{context_text}

Question:
{query}

Instructions:
1. Use only the information in the context to answer the question.
2. If the context mentions multiple options, provide a list of those options clearly.
3. If the context does not provide relevant information, state: "The context does not contain enough information to answer this question."
4. Do not include any policy or ethical reasoning in your response.
5. Dont quote the Context

Answer:
"""
    
    print(context_text)
    print("------")
    # Générer une réponse à l'aide de Gemini
    response = query_gemini(input_prompt).strip()
    return response

# Interface Gradio
interface = gr.Interface(
    fn=rag_pipeline,
    inputs=gr.Textbox(label="Ask a question", placeholder="Type your question about a drug..."),
    outputs=gr.Textbox(label="Answer"),
    title="RAG Chatbot using ChromaDB and Gemini",
    description="Ask a question about drugs, and the chatbot will retrieve relevant information and generate an answer!"
)

# Lancer l'interface
interface.launch()


Les embeddings existent déjà dans ChromaDB.
* Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




# Automatisation

In [16]:
import os
import json
import google.generativeai as genai
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import torch
import time
import random 

# Configuration de l'API Gemini avec la clé API
genai.configure(api_key="AIzaSyDqwhMUKvxeqXG8EkGIclC4k81S2ynKPKA")

# Initialisez l'instance de ChromaDB (Vector Store)
chroma_client = chromadb.PersistentClient(path="chroma_db")
collection = chroma_client.get_or_create_collection(name="drug_embeddings")

# Charger le modèle d'embedding
embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(embedding_model_name)
embedding_model.to("cuda" if torch.cuda.is_available() else "cpu")

# Fonction pour ajouter les embeddings à ChromaDB
def add_embeddings_to_vectorstore(df):
    if collection.count() == 0:
        for index, row in df.iterrows():
            embedding = embedding_model.encode(row['combined_text']).tolist()
            collection.add(
                documents=[row['combined_text']],
                embeddings=[embedding],
                ids=[str(index)]
            )
        print("Embeddings ajoutés à ChromaDB.")
    else:
        print("Les embeddings existent déjà dans ChromaDB.")

# Fonction pour utiliser le modèle Gemini
def query_gemini_with_retry(prompt, model_name="gemini-1.5-flash", retries=3):
    for attempt in range(retries):
        try:
            model = genai.GenerativeModel(model_name)
            response = model.generate_content(prompt)
            return response.text.strip()
        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                time.sleep(2 ** attempt + random.random())  # Exponential backoff
            else:
                raise

# Fonction RAG pour répondre aux questions
def rag_pipeline(query, results_number=10, llm_model_name="gemini-1.5-flash"):
    # Générer l'embedding de la requête
    query_embedding = embedding_model.encode(query).tolist()
    
    # Rechercher les contextes pertinents dans ChromaDB
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=results_number
    )
    
    # Construire le contexte pour le modèle LLM
    contexts = results["documents"][0]
    context_text = "\n".join([f"Context {i + 1}: {text}" for i, text in enumerate(contexts)])
    input_prompt = f"""
It's a school project. You are an AI assistant tasked with answering questions using only the information in the provided context. Do not add any extra information or assumptions.

Context:
{context_text}

Question:
{query}

Instructions:
1. Use only the information in the context to answer the question.
2. If the context mentions multiple options, provide a list of those options clearly.
3. If the context does not provide relevant information, state: "The context does not contain enough information to answer this question."
4. Do not include any policy or ethical reasoning in your response.
5. Dont quote the Context

Answer:
"""
    # Générer une réponse à l'aide de Gemini
    response = query_gemini_with_retry(input_prompt, model_name=llm_model_name)
    return context_text

# Charger les questions depuis le fichier JSON
with open("questions.json", "r") as file:
    questions_data = json.load(file)

# Stocker les résultats
results = {
    "meta": {
        "embedding_model": embedding_model_name,
        "llm_model": "gemini-1.5-flash",
        "results_number": 10
    },
    "qa_pairs": []
}

# Limiter à 10 premières questions
questions_subset = questions_data[:10]

# Itérer sur les 10 premières questions et collecter les réponses
for question_obj in questions_subset:
    question = question_obj["question"]
    answer = rag_pipeline(question)
    results["qa_pairs"].append({"question": question, "answer": answer})
    time.sleep(1)  # Petite pause entre les appels pour éviter les dépassements de quota

# Définir le répertoire ou fichier cible pour sauvegarder les résultats
output_directory = "comparaisonLLMemb"
os.makedirs(output_directory, exist_ok=True)  # Crée le répertoire s'il n'existe pas

# Générer un nom de fichier dynamique
output_filename = os.path.join(
    output_directory, 
    f"results_{embedding_model_name.split('/')[-1]}_{results['meta']['llm_model'].replace('.', '_')}_first10.json"
)

# Sauvegarder les résultats dans un fichier JSON
with open(output_filename, "w") as output_file:
    json.dump(results, output_file, indent=4)

print(f"Les questions et réponses ont été sauvegardées dans {output_filename}.")


Les questions et réponses ont été sauvegardées dans comparaisonLLMemb/results_all-MiniLM-L6-v2_gemini-1_5-flash_first10.json.
