In [None]:
import os
from typing import List

from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec


In [None]:
# Pull variables from a local .env file (if present) into os.environ.
load_dotenv()

# Fail fast with a readable message when a key is absent or empty. Without this
# check a missing key surfaces later as an opaque error (assigning None into
# os.environ raises TypeError, and the API clients fail on first use).
_REQUIRED_ENV_VARS = ("PINECONE_API_KEY", "GOOGLE_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the shell or in a .env file."
    )

# The values are read straight from os.environ, so there is nothing to write
# back: libraries that look the keys up in the environment already see them.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
GOOGLE_API_KEY = os.environ["GOOGLE_API_KEY"]

In [None]:
# Build the Pinecone client from the API key read in the environment cell.
pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
def load_pdf_files(data_path):
    """Load the PDF files found in a directory.

    Args:
        data_path: Directory handed to ``DirectoryLoader``; files are selected
            with the ``*.pdf`` glob and parsed with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    pdf_loader = DirectoryLoader(data_path, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load the PDFs from the "data" directory (a relative path, so it is resolved
# against the notebook's working directory).
extracted_data = load_pdf_files("data")

In [None]:
def filter_docs(docs: List[Document]) -> List[Document]:
    """Return copies of ``docs`` that keep only the text and the source path.

    All metadata other than ``"source"`` is dropped, so only that single field
    is carried along with each chunk into the vector store.

    Args:
        docs: Documents as returned by the loader.

    Returns:
        A new list with one ``Document`` per input document, in the same order.
        ``metadata["source"]`` is ``None`` when the input had no ``"source"``
        entry. An empty input yields an empty list.
    """
    # list.append() takes a single positional item; each entry has to be built
    # as a Document first rather than passed as keyword arguments.
    return [
        Document(
            page_content=doc.page_content,
            metadata={"source": doc.metadata.get("source")},
        )
        for doc in docs
    ]

In [None]:
# Pass the loaded documents through filter_docs before chunking.
filtered_data = filter_docs(extracted_data)

In [None]:
def chunk_documents(filtered_docs, chunk_size=1000, chunk_overlap=20):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        filtered_docs: Documents to split.
        chunk_size: Maximum chunk length passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 1000, the value
            this notebook has always used.
        chunk_overlap: Amount of text shared between consecutive chunks.
            Defaults to 20.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(filtered_docs)

In [None]:
# Split the filtered documents into chunks for embedding.
chunked_data = chunk_documents(filtered_data)

In [None]:
def download_embeddings():
    """Return the sentence-embedding model used for indexing and retrieval.

    Uses the generic ``HuggingFaceEmbeddings`` wrapper rather than
    ``HuggingFaceBgeEmbeddings``. The BGE wrapper prepends a BGE-specific
    instruction to every query; all-MiniLM-L6-v2 is not a BGE model, so that
    prefix only skews query vectors relative to the indexed passages.

    Returns:
        An embeddings object for ``sentence-transformers/all-MiniLM-L6-v2``.
        Its vector size must match the ``dimension`` the Pinecone index is
        created with (384 in this notebook).
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Instantiate the embedding model once; the same object is passed to the vector store below.
embedding = download_embeddings()

In [None]:
# Name of the Pinecone index that backs the chatbot's vector store.
index_name = "medicine-chatbot"

# Create the index on first run only; later runs find it and skip this step.
# The dimension has to equal the embedding model's vector size.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name, dimension=384, metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [None]:
# Handle to the index named above.
index = pc.Index(index_name)

In [None]:
# Ingest only when the index is still empty. from_documents embeds and upserts
# every chunk each time it runs, so re-running the notebook would otherwise add
# the same chunks again and crowd the top-k results with duplicates.
# To re-index after changing the PDFs, clear or delete the index first.
if index.describe_index_stats().total_vector_count == 0:
    doc_search = PineconeVectorStore.from_documents(
        documents=chunked_data,
        embedding=embedding,
        index_name=index_name,
    )
else:
    # Index already populated: attach to it without embedding anything.
    doc_search = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [None]:
# Similarity-search retriever over the vector store; search_kwargs requests k=3 results per query.
retriever = doc_search.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
# Gemini chat model used to generate the answers.
# NOTE(review): confirm "gemini-pro" is still a served model id for the
# installed langchain_google_genai / Gemini API version.
# Presumably the API key is picked up from GOOGLE_API_KEY in the environment — confirm.
chat_model = ChatGoogleGenerativeAI(model="gemini-pro")

In [None]:
# System message for the answering model. The pieces below are adjacent string
# literals that Python joins with no separator, so every sentence carries its
# own trailing space. "{context}" is the template slot for the retrieved text.
system_prompt = (
    "You are a helpful medical research assistant. "
    "Use the provided context to answer the user's question accurately and concisely. "
    "If the context does not contain the answer, respond with 'I don't know.' "
    "Keep your answers brief and to the point."
    "\n\n"
    "{context}"
)

In [None]:
# Chat prompt: system instructions (with the retrieved context) followed by the
# user's question. The user slot is named "input" because create_retrieval_chain
# passes the query under that key; any other variable name is left unfilled and
# the chain fails with a missing-variable error.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}"),
])

In [None]:
# Answering step: hands the retrieved documents and the prompt to the chat model.
question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
# Full pipeline: retrieve documents first, then run the answering step on them.
# NOTE(review): create_retrieval_chain is documented to read the query from the
# "input" key — confirm the prompt's variables match.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)