<a href="https://colab.research.google.com/github/ShanshanHoo/Brain-Tumour-Segmentation/blob/master/RAG_Cobenfy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai scikit-learn numpy sentence-transformers PyMuPDF langchain ragas

Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting ragas
  Downloading ragas-0.2.14-py3-none-any.whl.metadata (8.5 kB)
Collecting datasets (from ragas)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain_openai (from ragas)
  Downloading langchain_openai-0.3.12-py3-none-any.whl.metadata (2.3 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-

In [None]:
from google.colab import userdata

from openai import OpenAI
import fitz
from sentence_transformers import SentenceTransformer
import openai
import sqlite3
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.text_splitter import RecursiveCharacterTextSplitter # import the missing library


In [None]:
# Copy the OpenAI key from Colab's secret store onto the openai module;
# generate_gpt4_response builds its client from openai.api_key.
openai.api_key = userdata.get('OPENAI_API_KEY')

# Sentence encoder (multilingual E5-small) shared by chunk and query embedding.
# NOTE(review): E5 model cards describe "query: " / "passage: " input prefixes;
# this notebook encodes raw text - confirm whether prefixes should be added.
model = SentenceTransformer('intfloat/multilingual-e5-small')

# SQLite file holding each text chunk next to its embedding, stored as a BLOB
# (Binary Large Object).
conn = sqlite3.connect('embeddings.db')
cursor = conn.cursor()
with conn:  # commits once the statement has succeeded
    cursor.execute('''CREATE TABLE IF NOT EXISTS embeddings
              (chunk_id INTEGER PRIMARY KEY, text_chunk TEXT, embedding BLOB)''')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/498k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:

# Function to read PDF and split into chunks
def read_pdf(file_path, chunk_size=500, chunk_overlap=None):
    """Extract the text of a PDF and split it into overlapping chunks.

    Args:
        file_path: Path of the PDF to open.
        chunk_size: Maximum chunk length passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks. ``None`` (default)
            uses 200, capped at half of ``chunk_size`` so that a small
            ``chunk_size`` never yields an overlap larger than the chunk.

    Returns:
        list[str]: The text chunks, in document order.
    """
    if chunk_overlap is None:
        chunk_overlap = min(200, chunk_size // 2)

    # Context manager closes the document even if text extraction fails.
    with fitz.open(file_path) as doc:
        concatenated_text = "".join(page.get_text() for page in doc)

    # Split text into chunks, honouring the caller's chunk_size.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(concatenated_text)

# Function to embed chunks and store in the database
def embed_and_store_chunks(chunks):
    """Embed text chunks and insert them into the ``embeddings`` table.

    Uses the module-level ``model`` to encode and the module-level
    ``cursor`` / ``conn`` to insert and commit. Rows are appended; existing
    rows are left untouched.

    Args:
        chunks: Iterable of text chunks to embed.
    """
    chunks = list(chunks)
    if not chunks:
        return  # nothing to embed or store

    # Encode all chunks in one batched call instead of one call per chunk.
    # float32 is made explicit because retrieve_relevant_chunks decodes the
    # stored bytes with dtype=np.float32.
    embeddings = np.asarray(model.encode(chunks), dtype=np.float32)

    # Store embedding and chunk in the database (BLOB format for embedding)
    cursor.executemany(
        "INSERT INTO embeddings (text_chunk, embedding) VALUES (?, ?)",
        [(chunk, vector.tobytes()) for chunk, vector in zip(chunks, embeddings)],
    )
    conn.commit()

from sklearn.preprocessing import normalize

def retrieve_relevant_chunks(query, top_n=5):
    """Return the stored chunks most similar to ``query`` by cosine similarity.

    Reads every row of the ``embeddings`` table through the module-level
    ``cursor`` and encodes the query with the module-level ``model``.

    Args:
        query: Query text to embed.
        top_n: Number of chunks to return.

    Returns:
        list[str]: Up to ``top_n`` chunk texts, most similar first. Empty if
        the table has no rows.
    """
    # Retrieve all embeddings from the database
    cursor.execute("SELECT text_chunk, embedding FROM embeddings")
    rows = cursor.fetchall()
    if not rows:
        return []

    texts = [text_chunk for text_chunk, _ in rows]
    # Stored blobs are raw float32 bytes (see embed_and_store_chunks).
    chunk_matrix = np.vstack(
        [np.frombuffer(blob, dtype=np.float32) for _, blob in rows]
    )
    query_vector = np.asarray(model.encode(query), dtype=np.float32)

    # Cosine similarity for all rows at once: dot product over the product of
    # norms. A zero norm would divide by zero, so those entries score 0.
    norms = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(query_vector)
    norms[norms == 0] = 1.0
    similarities = (chunk_matrix @ query_vector) / norms

    # Stable sort keeps insertion order among equal scores, as list.sort did.
    ranking = np.argsort(-similarities, kind="stable")
    return [texts[i] for i in ranking[:top_n]]

# Function to generate a response using GPT-4 with the retrieved context
def generate_gpt4_response(query, relevant_chunks, model_name="gpt-4o-mini",
                           max_tokens=500, temperature=0.7):
    """Answer ``query`` with an OpenAI chat model, grounded in retrieved chunks.

    Args:
        query: The user's question.
        relevant_chunks: List of context strings; joined with newlines and
            placed ahead of the question in the prompt.
        model_name: Chat model to call. Named ``model_name`` so it does not
            shadow the notebook's global embedding ``model``.
        max_tokens: Upper bound on generated tokens.
        temperature: Sampling temperature.

    Returns:
        str: Content of the first choice's message.
    """
    client = OpenAI(api_key=openai.api_key)

    # Combine the relevant chunks to form the context
    context = "\n".join(relevant_chunks)
    prompt = f"Based on the following context:\n\n{context}\n\nAnswer the query: {query}"

    # Call the chat model with the context-augmented prompt
    response = client.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        max_tokens=max_tokens,
        temperature=temperature,
    )

    return response.choices[0].message.content



In [None]:
# Example usage
if __name__ == "__main__":
    # Step 1: Read and process the PDF file
    pdf_file_path = '/content/Cobenfy_Drug.com.pdf'  # Replace with your PDF path
    pdf_chunks = read_pdf(pdf_file_path)

    # Step 2: Embed chunks and store them in the database.
    # embeddings.db persists between runs and embed_and_store_chunks only
    # appends, so clear rows left by earlier runs first; otherwise re-running
    # this cell stores the same chunks again and retrieval returns duplicates.
    cursor.execute("DELETE FROM embeddings")
    conn.commit()
    embed_and_store_chunks(pdf_chunks)

    # Step 3: Query the system and retrieve relevant chunks
    #user_query = "Could you provide a short summary of this new drug?"  # Replace with your query
    #user_query = "Based on provided documents, what's the recommended dosage for patients just started with Cobenfy?"
    user_query = "Based on provided documents, what's the active ingredients of Cobenfy?"
    relevant_chunks = retrieve_relevant_chunks(user_query)

    # Step 4: Generate an answer from the chat model, augmented with the retrieved chunks
    gpt4_response = generate_gpt4_response(user_query, relevant_chunks)

    # Output the final response
    print("GPT-4 Response:", gpt4_response)

GPT-4 Response: The active ingredients of Cobenfy are xanomeline and trospium chloride.


In [None]:
# Baseline: send user_query on its own, without any retrieved context,
# using the same model and sampling settings as generate_gpt4_response.
client = OpenAI(api_key=openai.api_key)
response_raw = client.chat.completions.create(
    temperature=0.7,
    max_tokens=500,
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_query},
    ],
)
response_raw

ChatCompletion(id='chatcmpl-BLJT08vAvM7g4vQPUtpZbgwTR98p2', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content="I'm sorry, but I don't have access to external documents or databases to look up specific products like Cobenfy. However, I can help you find information if you provide details about the product or its intended use. Alternatively, you might consider checking the packaging or the manufacturer's website for the active ingredients.", refusal=None, role='assistant', annotations=[], audio=None, function_call=None, tool_calls=None))], created=1744419418, model='gpt-4o-mini-2024-07-18', object='chat.completion', service_tier='default', system_fingerprint='fp_44added55e', usage=CompletionUsage(completion_tokens=61, prompt_tokens=31, total_tokens=92, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(

In [None]:
# Preview the chunking result instead of dumping every chunk into the output.
print(f"{len(pdf_chunks)} chunks")
pdf_chunks[:3]



In [None]:
# Embed the query that was actually asked (user_query) rather than a retyped
# copy of the string, which would silently drift if the query changes.
query_embedding = model.encode(user_query)
query_embedding

In [None]:
# Inspect the chunks retrieved for user_query (the context passed to the model)
relevant_chunks

['Cobenfy ingredients\xa0\nActive ingredients:\u200a xanomeline and trospium chloride\xa0\nInactive ingredients: ascorbic acid, lactose monohydrate, microcrystalline cellulose, and talc Capsule\nshell: contains black iron oxide (only100mg/20mg), hypromellose, red iron oxide, titanium dioxide, and\nyellow iron oxide (only 50mg/20mg and 100mg/20mg)\nCompany\nCobenfy Bristol-Myers Squibb Company Princeton, NJ 08543 USA.\nReferences',
 "receptors. Cobenfy's active ingredient tiospium chloride's MOA is a muscarinic antagonist that blocks\nthe muscarinic receptors primarily in the peripheral tissues.\nCobenfy FDA approval is for the treatment of schizophrenia in adults. Cobenfy capsules are taken twice\ndaily on an empty stomach, at least 1 hour before a meal or at least 2 hours after a meal. During drug\ndevelopment, it was named KarXT, and once approved, Bristol Myers Squibb renamed it Cobenfy.\nWhat is the schizophrenia?",
 "chloride cannot enter the brain easily, so it works mainly aroun

## Ragas

Evaluation of RAG metrics

In [None]:
# Build a one-row evaluation set. The four fields must describe the same
# interaction, so the question is the query that actually produced
# gpt4_response and relevant_chunks (user_query), not a paraphrase of it.
from datasets import Dataset
questions = [user_query]
ground_truths = ["Cobenfy contains xanomeline and trospium chloride. Xanomeline acts on brain receptors, while trospium reduces peripheral side effects."]
answers = [gpt4_response]
contexts = [relevant_chunks]
# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "reference": ground_truths
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)


In [None]:
import os
from ragas import evaluate
from google.colab import userdata
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Copy the OpenAI key from Colab's secret store into the process environment
# (presumably where the evaluation's OpenAI-backed components look for it - TODO confirm).
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

# Score the dataset on the four imported metrics.
result = evaluate(
    dataset=dataset,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
)

# Convert the scores to a DataFrame; the last expression renders it.
df = result.to_pandas()
df

Evaluating:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall,faithfulness,answer_relevancy
0,What are the active ingredients in Cobenfy?,[Cobenfy ingredients \nActive ingredients: xa...,The active ingredients of Cobenfy are xanomeli...,Cobenfy contains xanomeline and trospium chlor...,0.8875,1.0,1.0,0.998683


In [None]:
# Try three question sets
questions = [
    "What are the active ingredients in Cobenfy?",
    "How should Cobenfy be taken?",
    "Who should not take Cobenfy?",
]

ground_truths = [
    "Cobenfy contains xanomeline and trospium chloride. Xanomeline acts on brain receptors, while trospium reduces peripheral side effects.",
    "It should be taken twice daily on an empty stomach. Capsules must not be opened.",
    "People with urinary retention, moderate/severe liver problems, or untreated narrow-angle glaucoma should avoid it. It’s also contraindicated in those allergic to trospium.",
]

# Every question needs exactly one reference answer, in the same position.
assert len(questions) == len(ground_truths), "questions and ground_truths must align"

# Step 1: Loop through questions and get answers + contexts.
# Loop-local names are used on purpose: reusing `relevant_chunks` /
# `gpt4_response` here would overwrite the single-question values that
# earlier cells display and evaluate.
answers = []
contexts = []

for question in questions:
    question_chunks = retrieve_relevant_chunks(question)
    question_answer = generate_gpt4_response(question, question_chunks)

    answers.append(question_answer)
    contexts.append(question_chunks)

# Step 2: Create dataset
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "reference": ground_truths,
}

dataset = Dataset.from_dict(data)

# Step 3: Evaluate
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# Step 4: View results
df = result.to_pandas()
df

Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,context_recall,faithfulness,answer_relevancy
0,What are the active ingredients in Cobenfy?,[Cobenfy ingredients \nActive ingredients: xa...,The active ingredients in Cobenfy are xanomeli...,Cobenfy contains xanomeline and trospium chlor...,1.0,1.0,1.0,1.0
1,How should Cobenfy be taken?,[about the health of women exposed to Cobenfy ...,Cobenfy should be taken as follows:\n\n- Take ...,It should be taken twice daily on an empty sto...,1.0,0.5,0.75,1.0
2,Who should not take Cobenfy?,[Who should not take this medicine?\nCobenfy s...,Cobenfy should not be taken by individuals who...,"People with urinary retention, moderate/severe...",1.0,1.0,1.0,1.0
