In [1]:
# Show the kernel's current working directory before changing it.
%pwd

'/home/mukesh/pikachu/aws/medical_chat-bot_rag/research'

In [2]:
import os

# The notebook is started from the research/ folder; step up to the project
# root exactly once. Guarding on the current folder name keeps this cell
# idempotent, so re-running it does not keep climbing the directory tree.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
# Confirm the working directory after the chdir in the previous cell.
%pwd

'/home/mukesh/pikachu/aws/medical_chat-bot_rag'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
def load_pdf_file(data):
    """Load the PDF files found directly inside a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``; files are matched
            with the glob ``*.pdf`` and parsed with ``PyPDFLoader``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [6]:
# Load the PDFs from the data/ directory (path is relative to the current working directory).
extracted_data=load_pdf_file(data='data')


In [7]:
# extracted_data


In [8]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=1000, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split (e.g. the result of ``load_pdf_file``).
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: ``chunk_overlap`` passed to the splitter. Defaults to 50,
            the value previously hard-coded here.

    Returns:
        The chunks returned by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [9]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of Text Chunks {len(text_chunks)}")

Length of Text Chunks 3027


In [10]:
from langchain.embeddings import HuggingFaceEmbeddings


In [11]:
def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the HuggingFace embedding model used by this notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to ``sentence-transformers/all-MiniLM-L6-v2``, which this
            notebook shows producing 384-dimensional vectors. The Pinecone
            index in this notebook is created with ``dimension=384``, so a
            model with a different output size needs a matching index.

    Returns:
        The ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [12]:
# Instantiate the embedding model once; the same object is reused for the
# sanity check and for both vector-store cells.
embeddings = download_hugging_face_embeddings()


  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Sanity check: embed a sample string and show the length of the resulting vector.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [14]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment so the API keys
# can be read with os.environ / os.getenv instead of being hard-coded.
load_dotenv()

True

In [15]:
# Read the Pinecone key from this project's variable name, falling back to the
# conventional PINECONE_API_KEY so a standard environment also works instead of
# silently yielding None.
# NOTE(review): 'pincone_api' looks like a misspelling of 'pinecone_api'; it is
# kept because it must match whatever name the environment actually defines.
PINECONE_API_KEY = os.environ.get('pincone_api') or os.environ.get('PINECONE_API_KEY')
# OPENAI_API_KEY=os.environ.get('OPENAI_API_KEY')

In [16]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client used by the index-creation cell below.
pc = Pinecone(api_key=PINECONE_API_KEY)

In [17]:
index_name = "medicalbot"

# Creating an index that already exists is rejected by Pinecone, which would
# break "Restart Kernel -> Run All" after the first successful run. Only create
# the index when it is missing.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must match the embedding length (384 for the model used above)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [18]:
# Export the key under the conventional name; presumably langchain_pinecone reads it
# from the environment, since no key is passed to PineconeVectorStore — TODO confirm.
# Note: this assignment raises TypeError if PINECONE_API_KEY is None.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY


In [19]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# NOTE(review): this cell writes to the index every time it runs; re-running it
# presumably upserts the same chunks again — confirm whether that creates
# duplicate vectors. Once the index is populated, use the from_existing_index
# cell instead.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [20]:
# Load the existing index instead of embedding the documents again.

from langchain_pinecone import PineconeVectorStore
# Connect to the existing Pinecone index; no documents are passed here, only
# the embedding model to use with the index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [21]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x758cc511cbf0>

In [22]:
# Expose the vector store as a similarity retriever configured with k=3 results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [23]:
# Smoke-test retrieval on its own with a sample question, before any LLM is involved.
retrieved_docs = retriever.invoke("What is Acne?")


In [24]:
retrieved_docs


[Document(id='a9d887fa-f1a4-4817-947f-716b3e492ad1', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 40.0, 'page_label': '41', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data/Medical_book.pdf', 'total_pages': 637.0}, page_content='Cliffs, NJ: Prentice Hall, 1995.\nGoldstein, Sanford M., and Richard B. Odom. “Skin &\nAppendages: Pustular Disorders.” In Current Medical\nDiagnosis and Treatment, 1996.35th ed. Ed. Stephen\nMcPhee, et al. Stamford: Appleton & Lange, 1995.\nKaptchuk, Ted J., Z’ev Rosenberg, and K’an Herb Co., Inc.\nK’an Herbals: Formulas by Ted Kaptchuk, O.M.D.San\nFrancisco: Andrew Miller, 1996.\nPERIODICALS\n“Adult Acne.”Harvard Women’s Health Watch(Mar. 1995): 4-\n5.\nBergfeld, Wilma F. “The Evaluation and Management of Acne:\nEconomic Considerations.” Journal of the American\nAcademy of Dermatology 32 (1995): S52-6.\nBillings, Laura. “Getting Clear.”Health Magazine, Apr. 1997,\n48-52.\nChr

In [27]:
import google.generativeai as genai

# Read the Gemini API key from the environment (os.getenv returns None when the
# variable is unset) and configure the google.generativeai client with it.
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [33]:
# Print the name of every available model that supports the generateContent method.
for model in genai.list_models():
    if 'generateContent' not in model.supported_generation_methods:
        continue
    print(model.name)

models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/learnlm-1.5-pro-experimental
models/gemma-3-27b-i

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.retrievers import BaseRetriever  # NOTE(review): not referenced in the visible cells

# The API key comes from GOOGLE_API_KEY, which an earlier cell reads from the
# environment with os.getenv; nothing is hard-coded here.
llm = ChatGoogleGenerativeAI(model="models/gemini-1.5-flash-latest", temperature=0.4, max_output_tokens=500, google_api_key=GOOGLE_API_KEY)

# System instructions for the answering step; the {context} placeholder is
# filled with the retrieved documents by the chain built from this prompt.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
# Two-message chat prompt: the system instructions above, then the user's
# question supplied under the "input" key.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)



In [35]:
# Build the answering chain from the LLM and prompt, then wrap it with the
# retriever so a single call performs retrieval followed by answering.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [38]:
# Run the full RAG chain on a sample question and print only the generated
# answer stored under the "answer" key of the response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

Acromegaly is a disorder caused by excess growth hormone (GH) from the pituitary gland, leading to increased bone and soft tissue growth and other bodily disturbances.  In children, before bone growth plates close, this excess GH results in gigantism, characterized by excessive height.  Acromegaly occurs after bone growth plates have closed.
