# Loading and Splitting

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Extract text fom PDF files
def load_pdf_files(data):
    loader = DirectoryLoader(
        path=data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    docs = loader.load()
    return docs

In [None]:
import os
os.chdir("../")
extracted_data = load_pdf_files("./data")
extracted_data

In [4]:
len(extracted_data)

637

In [5]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Given a list of Document objects, return a new list of Document objects containing only 'source' in metadata 
    and the original page_content 
    """
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [None]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs

In [7]:
def text_split(minimal_docs):
    """
    Given a list of documents, split them into smaller chunks of text.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    text_chunks = text_splitter.split_documents(minimal_docs)
    return text_chunks

In [None]:
text_chunks = text_split(minimal_docs)
print(f"Number of chunks: {len(text_chunks)}")
text_chunks

# Embedding and Vector Store

In [1]:
import os
os.environ['HF_HOME'] = "E:/ohokin/.cache/"

In [2]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

def download_embeddings_model():
    """
    Download and return the HuggingFace embeddings model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings_model = HuggingFaceBgeEmbeddings(
        model_name=model_name
    )
    return embeddings_model

embeddings_model = download_embeddings_model()

  embeddings_model = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
embeddings_model

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [4]:
vector = embeddings_model.embed_query("Hello World")
len(vector)

384

In [5]:
from dotenv import load_dotenv
load_dotenv()

True

In [6]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN

In [7]:
from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY
pc = Pinecone(api_key=pinecone_api_key)
pc

<pinecone.pinecone.Pinecone at 0x29f033edae0>

In [16]:
from pinecone import ServerlessSpec

index_name = "medibot"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,      # Dimension of the embeddings
        metric="cosine",    # Cosine Similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

index = pc.Index(index_name)

In [None]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings_model,
    index_name=index_name
)

In [8]:
# Load existing index
from langchain_pinecone import PineconeVectorStore

index_name = "medibot"
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings_model
)

In [None]:
# Add more data to existing Pinecone Index

dswith = Document(
    page_content="random stuff",
    metadata={"source": "N/A"}
)
docsearch.add_documents(documents=[dswith])

In [9]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [10]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='1e9504b4-fc83-43d6-a51f-1e652ba5878a', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='9e1ec9f9-dd0f-4fcc-bb74-051c3f961a59', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='d7ca7342-c64f-44bc-87d5-103c6f3d7bab', metadata={'source': 'data\\Medical_book.pdf'}, page_content='Cliffs, NJ: Prentice Hall, 1995.\nGoldstein, Sanford M., and Richard B. Odom. “Skin &\nAppendages: Pustular Disorders.” In Current Medical\nDiagnosis and Treatment, 1996.35th ed. Ed. Stephen\nMcPhee, et al. Stamford: Appleton & Lange, 1995

# RAG Chain

In [11]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Llama-3.2-1B-Instruct",
    task="text-generation",
    max_new_tokens=512,
    do_sample=False,
    repetition_penalty=1.03,
    provider="auto",  # let Hugging Face choose the best provider for you
)

chat_model = ChatHuggingFace(llm=llm)

In [12]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [13]:
system_prompt = (
    "You are a medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [14]:
question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [15]:
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly and gigantism are two disorders that occur when the pituitary gland releases too much of a hormone called growth hormone (GH). Growth hormone is produced by the pituitary gland and plays a role in growth and development during childhood and adolescence. 

In acromegaly, the excess growth hormone causes excessive growth and enlargement of body parts, such as the hands, feet, fingers, toes, and face, as well as other tissues. In gigantism, the excess growth hormone causes excessive growth and enlargement of body parts during childhood and adolescence, often before the age of 15.

Both conditions are rare and can cause a range of symptoms, including:

* Enlarged hands, feet, and face
* Excessive sweating
* High blood pressure
* Heart problems
* Vision problems
* Bone fractures
* Decreased fertility in men

Acromegaly is a hormone disorder caused by a non-cancerous (benign) tumor on the pituitary gland, while gigantism is a rare congenital disorder caused by a genetic mutation t