In [None]:
import getpass
import os

from langchain.document_loaders import DirectoryLoader, PyPDFLoader

def load_pdf(path):
    """Load the PDF files found in a directory.

    Args:
        path: Directory handed to ``DirectoryLoader``; entries are selected
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(path, loader_cls=PyPDFLoader, glob='*.pdf')
    return pdf_loader.load()

In [2]:
# Directory that holds the source PDFs. Set the BRAINAI_DATA_DIR environment
# variable to run on another machine; the default is the original author's path.
DATA_DIR = os.environ.get("BRAINAI_DATA_DIR", r"C:\Users\basha.r_isteer\Desktop\BrainAI\data")
extracted_text = load_pdf(DATA_DIR)  # original note: 144 (presumably the number of loaded documents)

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(extracted_text, chunk_size=500, chunk_overlap=100):
    """Split loaded documents into overlapping chunks.

    Args:
        extracted_text: Documents to split, as returned by ``load_pdf``.
        chunk_size: Value passed as ``chunk_size`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 500.
        chunk_overlap: Value passed as ``chunk_overlap`` to
            ``RecursiveCharacterTextSplitter``. Defaults to 100.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_text``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_text)


In [4]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = split_text(extracted_text)
print(len(text_chunks))

671


In [5]:
# Text of every chunk, in chunk order; this is the input sent to the embedding model.
page_content = [document.page_content for document in text_chunks]

In [6]:
# Never commit credentials: read the Google Generative AI key from the
# GOOGLE_API_KEY environment variable, prompting for it when it is not set.
# NOTE: the key that was previously hard-coded here is exposed and should be revoked.
API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google Generative AI API key: ")

In [7]:
import google.generativeai as genai

# Embedding model name; ask_question reuses it so queries and documents share a model.
embedding_model = "models/text-embedding-004"

genai.configure(api_key=API_KEY)

# Embed all chunk texts with the document-side retrieval task type.
embeddings = genai.embed_content(
    content=page_content,
    model=embedding_model,
    task_type="RETRIEVAL_DOCUMENT",
)

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Extract the embedding vectors from the embed_content response.
doc_embeddings = embeddings['embedding']  # 671 in the recorded run, matching the chunk count printed earlier

In [9]:
from pinecone import Pinecone, ServerlessSpec

# Never commit credentials: read the Pinecone key from the PINECONE_API_KEY
# environment variable, prompting for it when it is not set.
# NOTE: the key that was previously hard-coded here is exposed and should be revoked.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY") or getpass.getpass("Pinecone API key: "))

In [10]:
index_name = "medical-chatbot2"
# Index dimension is taken from the first document embedding (768 in the recorded run).
embedding_dimension = len(doc_embeddings[0])

print(f"Connecting to '{index_name}' index...")
if index_name in pc.list_indexes().names():
    # Nothing to create; the existing index is reused as-is.
    print("Index already exists.")
else:
    print(f"Index '{index_name}' does not exist. Creating it now...")
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
    print("Index created successfully.")

Connecting to 'medical-chatbot2' index...
Index already exists.


In [11]:
# Get a handle to the index; the upsert loop and ask_question's query both go through it.
index = pc.Index(index_name)

In [12]:
print("Preparing data for upload...")

# zip() stops at the shorter input, so a count mismatch would silently drop
# the surplus chunks or embeddings. Fail loudly instead.
if len(text_chunks) != len(doc_embeddings):
    raise ValueError(
        f"Chunk/embedding count mismatch: {len(text_chunks)} chunks vs "
        f"{len(doc_embeddings)} embeddings"
    )

# One record per chunk: id, embedding values, and metadata for retrieval.
vectors_to_upsert = []
for i, (chunk, vec) in enumerate(zip(text_chunks, doc_embeddings)):
    vectors_to_upsert.append({
        "id": str(i),  # position of the chunk in text_chunks, as a string
        "values": vec,
        "metadata": {
            "text": chunk.page_content,  # read back by ask_question to build the context
            "source": chunk.metadata.get('source', 'Unknown'),
            "page": chunk.metadata.get('page', 'Unknown')
        }
    })

Preparing data for upload...


In [13]:
batch_size = 100
print(f"Upserting {len(vectors_to_upsert)} vectors in batches of {batch_size}...")

# Walk the records in consecutive slices of at most batch_size items.
batch_starts = range(0, len(vectors_to_upsert), batch_size)
for batch_number, start in enumerate(batch_starts, start=1):
    print(f"Upserting batch {batch_number}...")
    index.upsert(vectors=vectors_to_upsert[start : start + batch_size])

print("Upload complete.")

Upserting 671 vectors in batches of 100...
Upserting batch 1...
Upserting batch 2...
Upserting batch 3...
Upserting batch 4...
Upserting batch 5...
Upserting batch 6...
Upserting batch 7...
Upload complete.


In [14]:
# Generative model that ask_question calls to produce the final answer.
llm = genai.GenerativeModel("gemini-1.5-flash-latest")

In [15]:
def ask_question(query: str, top_k: int = 3):
    """Answer a question from the indexed chunks (retrieve, then generate).

    Args:
        query: The user's question.
        top_k: Number of matches requested from the Pinecone index.
            Defaults to 3, the value that was previously hard-coded.

    Returns:
        The ``text`` attribute of the response from ``llm.generate_content``.
    """
    # Step 1: Embed the query with the same model used for the documents.
    print("Embedding the query...")
    query_embedding = genai.embed_content(
        model=embedding_model,
        content=query,
        task_type="RETRIEVAL_QUERY"
    )['embedding']

    # Step 2: Search Pinecone for relevant documents.
    print("Searching for relevant documents in Pinecone...")
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True
    )

    # Step 3: Concatenate the stored chunk texts, each followed by a separator.
    print("Creating context from search results...")
    context = "".join(
        match['metadata']['text'] + "\n---\n" for match in results['matches']
    )

    # Step 4: Build the prompt that restricts the model to the retrieved context.
    prompt = f"""
    You are a helpful assistant who answers questions based ONLY on the provided context.
    If the answer is not in the context, say "I don't have enough information to answer that question."

    CONTEXT:
    {context}

    QUESTION:
    {query}

    ANSWER:
    """

    # Step 5: Call the Gemini LLM to generate a final answer.
    print("Generating final answer with Gemini...\n")
    response = llm.generate_content(prompt)

    return response.text


In [16]:
# Demo: run one question through the retrieve-and-generate pipeline.
user_query = "what is cardiovascular system?"
final_answer = ask_question(user_query)

# Heading, a blank line, then the answer.
print("Final Answer:\n", final_answer, sep="\n")

Embedding the query...
Searching for relevant documents in Pinecone...
Creating context from search results...
Generating final answer with Gemini...

Final Answer:

The cardiovascular system is made up of blood, the heart, and blood vessels.  It also includes the Coronary Circulation, Pulmonary Circulation, Portal Circulation, and Systemic Circulation.

