In [1]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader 
# from langchain_community.document_loaders import 

In [2]:
def load_file(data):
    loader= DirectoryLoader(data,
                            glob="**/*.pdf", 
                            loader_cls=PyPDFLoader)
    documents = loader.load()
    
    return documents

In [4]:
extracted_data= load_file(data = 'data/')

In [None]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_splitter(data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20,
        length_function=len
    )
    texts_chunks = text_splitter.split_documents(data)
    
    return texts_chunks

In [None]:



texts_chunks = text_splitter(data=extracted_data)
print("length of texts: ", len(texts_chunks))

length of texts:  13854


In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        # model_kwargs={"device": "cuda"}
    )
    return embeddings

In [12]:
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
query_results = embeddings.embed_query("What is the capital of France?")
print(len(query_results))

384


In [14]:
load_dotenv()

True

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


pcsk_6U7i9B_NYnc2J7NXnGzTB7eT99m9PFkCU8HptZRG6qVHFRdkCAYjnxnYfHCsNbUCmNYG1p
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


In [16]:

from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key= PINECONE_API_KEY)


index_name =  "diabetesbot2"

pc.create_index(
    name=index_name,
    dimension=384, 
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)


{
    "name": "diabetesbot2",
    "metric": "cosine",
    "host": "diabetesbot2-0i3kp4g.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [17]:
from langchain_pinecone import PineconeVectorStore
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
# #Load existing index

# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embeddings,
# )

In [18]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2bef0624550>

In [19]:
retriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k": 3})

In [20]:
retrieved_docs = retriever.invoke("What are the first signs of diabetes?")
retrieved_docs

[Document(id='658625e1-254e-450c-8ee0-5314cfd931fa', metadata={'author': 'RICHARD I.G. HOLT', 'creationdate': '2010-04-29T09:54:13+08:00', 'creator': 'Adobe InDesign CS4 (6.0.4)', 'moddate': '2010-04-29T17:22:50+08:00', 'page': 46.0, 'page_label': '25', 'producer': 'Acrobat Distiller 7.0.5 (Windows)', 'source': 'data\\allchapters.pdf', 'title': 'Textbook of Diabetes, FOURTH EDITION', 'total_pages': 1141.0}, page_content='with an increased risk of macrovascular disease. \n The characteristic clinical presentation is with thirst, polyuria, \nblurring of vision and weight loss. This can lead to ketoacidosis \nor hyperosmolar non - ketotic coma (see Chapter  19 ). Often, \nsymptoms are mild or absent and mild hyperglycemia can persist \nfor years with tissue damage developing, although the person \nmay be totally asymptomatic.  \n  Classiﬁ cation \n There was awareness of different grades of severity of diabetes for'),
 Document(id='439b5560-68ec-4b8f-8b52-ba5376519d39', metadata={'author'

In [21]:
from langchain_openai import OpenAI
llm = OpenAI(temperature=0.4,max_tokens=500)

In [22]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

system_prompt = (
    "You are a knowledgeable and friendly diabetes educator."
    "Use the following pieces of retrieved context to answer"
    "Use clear and supportive language. Avoid medical jargon unless explained."
    "You are not a doctor and do not give personal medical advice."
    "If you do not know the answer, say you do not" 
    "know and keep it to 3 sentences"
    "\n\n"
    "{context}"
)


prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [23]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [24]:
response = rag_chain.invoke({"input": "What are the major causes of diabetes?"})
print(response["answer"])


System: The main factors that contribute to diabetes include genetics, viral infections, diet, high birth weight and growth rate, stress, and exposure to toxins. Additionally, the combination of islet autoimmunity and factors that increase insulin resistance, such as obesity and rapid growth, can accelerate the destruction of insulin-producing cells.


In [27]:
response = rag_chain.invoke({"input": "What is climate?"})
print(response["answer"])



System: Climate refers to the long-term patterns of weather in a particular region. It includes factors such as temperature, precipitation, and wind patterns. These factors can greatly impact an individual's health, especially for those with chronic conditions like diabetes. However, the specific ways in which climate affects individuals can vary based on their genetic background and other environmental factors.
