In [1]:
# Kernel smoke test: prints a fixed greeting.
print("Hello world")

Hello world


In [2]:
# Show the kernel's current working directory (the notebook starts in `research/`).
%pwd

'c:\\Users\\KIIT\\Desktop\\GENAIprojects\\end-to-end-medical-bot\\research'

In [3]:
import os

In [10]:
os.chdir("../")
%pwd

'c:\\Users\\KIIT\\Desktop\\GENAIprojects\\end-to-end-medical-bot'

'c:\\Users\\KIIT\\Desktop\\GENAIprojects\\end-to-end-medical-bot\\research'

In [11]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [12]:
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``*.pdf`` glob and read with ``PyPDFLoader``.

    Returns:
        The documents returned by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [13]:
# Load every PDF under data/ (relative to the current working directory).
extracted_data = load_pdf("data/")

In [14]:
def text_split(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        data: Documents to split, e.g. the result of ``load_pdf``.
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the previously hard-coded value.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the previously hard-coded value.

    Returns:
        The chunks produced by ``split_documents`` for ``data``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)


text_chunks = text_split(extracted_data)
print("Length of text chunks: ", len(text_chunks))

Length of text chunks:  39994


In [16]:
# Peek at one of the last chunks. A negative index replaces the absolute
# position 39990 so the cell keeps working when the number of chunks changes;
# with the 39994 chunks reported by the split it selects the same element.
text_chunks[-4]

Document(metadata={'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf', 'page': 4504, 'page_label': '4475'}, page_content='in vegetarian diet, 5:3898\nvitamin E and, 4:2651\nZinc acetate, for Wilson disease,\n5:3999\nZinc deficiency, 3:2439–2443\naging and, 1:77\nblood test for, 2:1149\nwith chelation therapy, 2:817\nZinc oxide\nfor cutaneous T-cell lymphoma,\n2:1079\nfor diaper rash, 2:1171\nfor enterobiasis, 2:1342\nfor hemorrhoids, 1:310\nfor pityriasis rosea, 4:2915\nfor sunburn prevention, 1:688\nsunscreens with, 4:3597\nin topical corticosteroids, 2:1026\nZinc picolinate, for smelling\ndisorders, 4:3462')

In [19]:
# Embedding model from Hugging Face
from langchain.embeddings import HuggingFaceEmbeddings


In [23]:
def download_hugging_face_embeddings():
    """Build the Hugging Face embedding model used by the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_id = 'sentence-transformers/all-MiniLM-L6-v2'
    return HuggingFaceEmbeddings(model_name=model_id)


embeddings = download_hugging_face_embeddings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [24]:
# Embed a sample string and print the length of the resulting vector; this
# length is the dimension the vector index has to be created with.
query_result=embeddings.embed_query("Hello world")
print("Length ",len(query_result))

Length  384


In [44]:
from dotenv import load_dotenv
# Load the variables defined in a .env file into the process environment so
# the API keys can be read from os.environ instead of being hard-coded here.
load_dotenv()

True

In [45]:
# Read the service credentials from the environment; a name is None when its
# variable is not set.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
MISTRAL_API_KEY = os.getenv('MISTRAL_API_KEY')

In [27]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot"

# Create the serverless index only when it is missing: create_index fails for
# a name that already exists, which would break every re-run of this cell.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (384, see the embed_query check)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [55]:
import os

# Re-export the keys as environment variables (presumably where the Pinecone
# and Mistral integrations look them up — confirm). Fail with a readable
# message when a key is missing: assigning None to os.environ raises an
# opaque "str expected, not NoneType" TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("MISTRAL_API_KEY", MISTRAL_API_KEY),
):
    if _key_value is None:
        raise RuntimeError(
            f"{_key_name} is not set; add it to the .env file and re-run "
            "the cell that calls load_dotenv()."
        )
    os.environ[_key_name] = _key_value

In [29]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# This is the ingestion step: every element of `text_chunks` is sent through
# `embeddings` and written to the index named by `index_name`.
# NOTE(review): no explicit ids are passed, so re-running this cell presumably
# adds the same chunks again instead of overwriting them — confirm before
# re-running. The following cell reconnects to the populated index instead.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [30]:
from langchain_pinecone import PineconeVectorStore
# Connect to the index that already exists and reuse its stored vectors;
# `embeddings` is supplied so that incoming queries can be embedded.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embeddings,
    index_name=index_name,
)

In [31]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1e2bb037a00>

In [32]:
# Similarity retriever returning the 3 best matches, tried on a sample query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
retrieved_docs = retriever.invoke("What is Acne?")

In [33]:
# Inspect the documents returned for the sample query.
retrieved_docs

[Document(id='341cced3-fbb8-4bf8-93b6-e28b0c8a5f29', metadata={'page': 55.0, 'page_label': '26', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='9fce1ba2-f1bd-40b0-b7ec-ccc8d6ff22e3', metadata={'page': 55.0, 'page_label': '26', 'source': 'data\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition-staibabussalamsula.ac_.id_.pdf'}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover (death and replacement) of skin cells.\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous glands\nbecome inflamed.(Photograph by Biophoto Associates, Photo'),
 Document(id='5c8

In [63]:
from langchain_mistralai import ChatMistralAI

# Chat model that writes the final answers. No API key is passed here, so it
# is presumably taken from the MISTRAL_API_KEY environment variable — confirm.
llm = ChatMistralAI(
    model="mistral-large-latest",
    max_tokens=500,
    temperature=0.4,
    max_retries=2,
)

In [62]:
# Installs/upgrades the Mistral integration for LangChain into the kernel's environment.
# NOTE(review): this cell sits below the cell that imports langchain_mistralai,
# so on a fresh environment it has to be run first. The install is also
# unpinned (-U takes the latest release); consider pinning a version.
%pip install -qU langchain_mistralai

Note: you may need to restart the kernel to use updated packages.


In [64]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message: the answering instructions, a blank line, then the
# "{context}" placeholder that receives the retrieved documents.
system_prompt = "\n\n".join(
    (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer "
        "the question. If you don't know the answer, say that you "
        "don't know. Use three sentences maximum and keep the "
        "answer concise.",
        "{context}",
    )
)


# Two-message chat template: the system instructions followed by the user's
# question, which is filled in through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [65]:
# Chain that feeds the documents into the prompt and calls the LLM ...
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# ... wrapped so that the documents come from the retriever for each input.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [68]:
# Run the retrieval chain on a sample question and print only the "answer"
# entry of the response.
# NOTE(review): "stats" looks like a typo — the recorded reply says the model
# could not interpret the question; confirm the intended query.
response = rag_chain.invoke({"input": "How to cure stats?"})
print(response["answer"])

I don't know how to "cure stats" as it seems like there is a misunderstanding or typo. If you meant to ask about treating a health condition or something specific, please provide more context.
