In [7]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [9]:
%pwd

'e:\\chatbot\\medical-assistance-chatbot\\research'

In [10]:
extracted_data = load_pdf_files("../data")

In [11]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src},
            )
        )
    return minimal_docs

In [12]:
minimal_docs = filter_to_minimal_docs(extracted_data)
len(minimal_docs)

637

## Chunking 

In [13]:
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len,
    )
    texts_chunks = text_splitter.split_documents(minimal_docs)
    return texts_chunks

In [14]:
texts_chunks = text_split(minimal_docs)
len(texts_chunks)

5859

## Embeddings

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
    )
    return embeddings

In [16]:
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [17]:
# vector = embedding.embed_query("This is a test")
# len(vector)

## Pinecone

In [18]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [19]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")


In [20]:
from pinecone import Pinecone
pinecone_client = Pinecone(api_key=PINECONE_API_KEY)

In [21]:
pinecone_client

<pinecone.pinecone.Pinecone at 0x1ccd43b38c0>

In [26]:
from pinecone import ServerlessSpec

index_name = "medical-assistance-chatbot"

if not pinecone_client.has_index(index_name):
    pinecone_client.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud = "aws",
            region = "us-east-1",
        ),
    )

index = pinecone_client.Index(index_name)

<pinecone.db_data.index.Index at 0x1ccd58042f0>

In [29]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents = texts_chunks,
    embedding=embedding,
    index_name=index_name,
)


In [31]:
## Load from existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name,
)

In [30]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [32]:
retriever_docs = retriever.invoke("What are the symptoms of diabetes?")

In [33]:
retriever_docs

[Document(id='b741ffff-5a44-4b39-8e54-5c743d8ec2fc', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='• Type I diabetes mellitus. Characterized by fatigue and\nan abnormally high level of glucose in the blood\n(hyperglycemia).\n• Amyotrophic lateral schlerosis. First signs are stum-\nbling and difficulty climbing stairs. Later, muscle\ncramps and twitching may be observed as well as\nweakness in the hands making fastening buttons or\nturning a key difficult. Speech may become slowed or\nslurred. There may also be difficluty swallowing. As\nrespiratory muscles atrophy, there is increased danger'),
 Document(id='74d234d5-d350-4bad-98d5-71a7ea0e99d2', metadata={'source': '..\\data\\Medical_book.pdf'}, page_content='begin to fall. A person with diabetes mellitus either does\nnot make enough insulin, or makes insulin that does not\nwork properly. The result is blood sugar that remains\nhigh, a condition called hyperglycemia.\nDiabetes must be diagnosed as early as possible. 

## LLM

In [34]:
## 1:51