In [1]:
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader 
# from langchain_community.document_loaders import 

In [2]:
def load_file(data):
    """Load every PDF found beneath a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader`` as the search root.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        recursive ``**/*.pdf`` pattern.
    """
    # loader_cls selects the per-file loader that DirectoryLoader applies to each match.
    pdf_loader = DirectoryLoader(data, glob="**/*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [3]:
# Load all PDFs under the relative 'data/' directory using load_file.
extracted_data= load_file(data = 'data/')

In [4]:

from langchain.text_splitter import RecursiveCharacterTextSplitter

def text_splitter(data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length as measured by ``len``.
            Defaults to 500.
        chunk_overlap: Amount of overlap between consecutive chunks, in the
            same units as ``chunk_size``. Defaults to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    # The splitter is deliberately not named `text_splitter`, so it does not
    # shadow this function's own name inside the body.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return splitter.split_documents(data)

In [5]:



# Split the loaded documents into chunks and report how many were produced.
texts_chunks = text_splitter(data=extracted_data)
print("length of texts: ", len(texts_chunks))

length of texts:  14650


In [6]:
from langchain.embeddings import HuggingFaceEmbeddings

In [7]:
def download_embeddings():
    """Build the sentence-embedding model used by the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    # No model_kwargs are passed, so device selection is left to the library.
    # To pin a device, pass e.g. model_kwargs={"device": "cuda"}.
    return HuggingFaceEmbeddings(model_name=model_id)

In [8]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Sanity check: embed one query and print the vector length.
# This length must equal the dimension the Pinecone index was created with.
query_results = embeddings.embed_query("What is the capital of France?")
print(len(query_results))

384


In [10]:
# Load variables from a .env file into the process environment so the
# os.getenv calls in the next cell can see them.
load_dotenv()

True

In [11]:
# os.getenv returns None (no error) when a variable is unset, so a missing key
# only surfaces later, when a client is constructed or first used.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible cells;
# presumably langchain_openai reads it straight from the environment — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


In [13]:

from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "diabetesbot"

# Create the index only when it is missing. This keeps the cell safe to re-run
# and lets a fresh kernel / fresh Pinecone project run the notebook end to end
# instead of failing when the index does not exist yet.
# dimension=384 must match the embedding length produced by the embedding model
# (the sanity-check cell prints 384 for all-MiniLM-L6-v2).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

# Load the (now guaranteed to exist) index as a LangChain vector store.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [14]:
from langchain_pinecone import PineconeVectorStore

# Ingest the chunks only while the index is still empty. from_documents embeds
# and upserts every chunk on each call, so running this cell unguarded a second
# time would re-embed all chunks and store them again as duplicates.
# NOTE: if the source PDFs change, clear the index (or switch index_name) to
# force a fresh ingestion.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    # Index already populated: attach to it without uploading anything.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [None]:
# #Load existing index

# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embeddings,
# )

In [21]:
# Display the vector store object to confirm it exists in this kernel.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x153c5a82ee0>

In [23]:
# Similarity-search retriever over the vector store; k=10 asks for ten results per query.
retriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k": 10})

In [24]:
# Try the retriever on a sample question and display the returned documents.
retrieved_docs = retriever.invoke("What are the first signs of diabetes?")
retrieved_docs

[Document(id='baa8d693-d2ef-4e75-a897-8e3567f57ced', metadata={'author': 'RICHARD I.G. HOLT', 'creationdate': '2010-04-29T09:54:13+08:00', 'creator': 'Adobe InDesign CS4 (6.0.4)', 'moddate': '2010-04-29T17:22:50+08:00', 'page': 46.0, 'page_label': '25', 'producer': 'Acrobat Distiller 7.0.5 (Windows)', 'source': 'data\\allchapters.pdf', 'title': 'Textbook of Diabetes, FOURTH EDITION', 'total_pages': 1141.0}, page_content='with an increased risk of macrovascular disease. \n The characteristic clinical presentation is with thirst, polyuria, \nblurring of vision and weight loss. This can lead to ketoacidosis \nor hyperosmolar non - ketotic coma (see Chapter  19 ). Often, \nsymptoms are mild or absent and mild hyperglycemia can persist \nfor years with tissue damage developing, although the person \nmay be totally asymptomatic.  \n  Classiﬁ cation \n There was awareness of different grades of severity of diabetes for'),
 Document(id='5d6b5f35-4899-4c28-8f70-844476334e6c', metadata={'author'

In [25]:
from langchain_openai import OpenAI
# LLM used to generate answers; max_tokens=500 is passed as the completion length limit.
# NOTE(review): no model name is given, so the library's default model is used — confirm which one.
llm = OpenAI(temperature=0.4,max_tokens=500)

In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

# System instructions for the RAG assistant. The adjacent string literals are
# concatenated into one string, so every sentence must end with a space (or
# newline) of its own; "{context}" is the placeholder filled with retrieved text.
system_prompt = (
    "You are a helpful assistant restricted to answering only from the provided context below. "
    "The topic is diabetes and related health conditions only. Do NOT answer any question unrelated to this topic. "
    "Do not use any outside knowledge. If the answer is not in the context, say: 'I'm sorry, I don't have that information in the provided material.' "
    "Be clear, concise, and avoid medical jargon unless it's explained. "
    "Use bullet points if necessary. "
    "Use references and sources available from the context when possible. For example, 'According to [source], ...' "
    "If the question is a yes/no question, provide a short answer and then elaborate with context. "
    "If the question is not clear, ask for clarification. "
    "Use no more than 10 sentences.\n\n"
    "{context}"
)


# Two-message chat template: the system message carries the instructions
# (including the {context} placeholder) and the human message carries {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [33]:
# Combine the LLM and prompt into a document-answering chain, then put the
# retriever in front of it. The resulting chain is invoked with an "input" key
# and its result is read via the "answer" key in the cells below.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [34]:
# Ask an in-domain question and print only the generated answer text.
response = rag_chain.invoke({"input": "can diabetic people have babies?"})
print(response["answer"])



Yes, diabetic women can have babies, but the risk of complications and birth defects is higher compared to non-diabetic women. It is important for diabetic women to closely monitor their blood sugar levels and maintain good glycemic control during pregnancy to reduce the risk of complications for both the mother and the baby. It is also recommended for diabetic women to receive preconception care and work closely with their healthcare team during pregnancy. 


In [35]:
# Probe how the chain handles a borderline-topic question that asks for sources.
# NOTE(review): the system prompt forbids outside knowledge, so any source cited
# in the answer should be checked against the retrieved chunks — presumably
# available under response["context"]; confirm.
response = rag_chain.invoke({"input": "does climate change affect diabetes? just say yes or no and give sources"})
print(response["answer"])



Yes, climate change can affect diabetes. According to the International Diabetes Federation, changes in climate can lead to disruptions in diet and fluid intake, which can impact diabetes control. Additionally, climate change can also lead to an increase in infectious diseases, which can worsen diabetes. Source: [Affairs (Project Hope) 2006; 25: 1053 – 1060]
