In [1]:
%pwd

'e:\\LangChain project\\MedChatBot\\research'

In [2]:
import os

# The notebook lives in <project>/research, while later cells use paths relative
# to the project root (e.g. 'Data/'). Only step up while still inside research/
# so that re-running this cell does not climb one more directory each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [3]:
%pwd

'e:\\LangChain project\\MedChatBot'

In [4]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

In [5]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` in the ``data`` directory.

    ``data`` is handed to ``DirectoryLoader`` as the directory to scan and each
    matching file is read with ``PyPDFLoader``.

    Returns the result of ``DirectoryLoader.load()``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:
# Load the PDFs from the Data/ folder; the path is relative to the current
# working directory, which an earlier cell moved to the project root.
extracted_data=load_pdf_file(data='Data/')

In [7]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split, as returned by ``load_pdf_file``.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The result of ``split_documents(extracted_data)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [9]:
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns a ``HuggingFaceEmbeddings`` instance configured with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
    )

In [10]:
# Instantiate the embedding model once; later cells reuse this object for the
# smoke test and for the Pinecone vector store.
embeddings = download_hugging_face_embeddings()

  embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Smoke-test the embedding model: embed a short string and print the vector
# length. NOTE(review): this length should equal the `dimension` of the
# Pinecone index the vectors are stored in -- confirm against the index config.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [12]:
from dotenv import load_dotenv
load_dotenv()

True

In [13]:
# Read the Pinecone key from the environment (populated by load_dotenv() in the
# previous cell) and fail fast with an actionable message if it is missing,
# instead of letting a None value surface later as an unrelated error.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Add it to your .env file or environment "
        "before running this notebook."
    )

In [14]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

# Name of the Pinecone index that stores the document embeddings.
index_name = "medchatbot"

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [15]:

# pc.create_index(
#     name=index_name,
#     dimension=384, 
#     metric="cosine", 
#     spec=ServerlessSpec(
#         cloud="aws", 
#         region="us-east-1"
#     ) 
# ) 

In [16]:
import os
# Write the key back into the process environment under the same name it was
# read from. NOTE(review): presumably langchain_pinecone reads PINECONE_API_KEY
# from the environment -- confirm; if the key came from the environment in the
# first place this assignment does not change anything.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [17]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
# from langchain_pinecone import PineconeVectorStore

# docsearch = PineconeVectorStore.from_documents(
#     documents=text_chunks,
#     index_name=index_name,
#     embedding=embeddings, 
# )

In [18]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore

# Connect to the already-populated index. Reuse `index_name` (defined next to
# the Pinecone client) instead of repeating the literal, so the index is named
# in exactly one place.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [19]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1a1818a17f0>

In [20]:
# Similarity-search retriever over the vector store; k=3 asks for three
# results per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [21]:
# Sanity-check retrieval with a sample question before wiring up the LLM.
retrieved_docs = retriever.invoke("What is Acne?")

In [22]:
retrieved_docs

[Document(id='5d91da0a-50f4-43d0-8aef-44e51c345e1f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='f9d0f1c8-799c-430b-8308-91f3a5e3a525', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nat

In [23]:
# Read the Gemini key from the environment and validate it explicitly:
# assigning None into os.environ raises an opaque TypeError, so report the
# missing variable by name instead.
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')
if not GEMINI_API_KEY:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Add it to your .env file or environment "
        "before running this notebook."
    )
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

In [24]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model shared by the translation chains and the RAG chain.
llm = ChatGoogleGenerativeAI(
    google_api_key=GEMINI_API_KEY,
    model="gemini-2.0-flash-lite",
    temperature=0.2,
)

## Translate

In [70]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser


# Vietnamese -> English: prompt, then the chat model, then plain-string output.
translate_vi_to_en_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that translates Vietnamese text to English. Only provide the translated text, do not include any additional information.",
        ),
        (
            "user",
            "Translate the following Vietnamese text to English: {text}",
        ),
    ]
)

translate_vi_to_en_chain = translate_vi_to_en_prompt.pipe(llm, StrOutputParser())

In [71]:
# Quick check of the Vietnamese -> English chain; the chain expects a dict with
# a "text" key.
translate_vi_to_en_chain.invoke({"text": "Phương pháp chữa bệnh HIV là gì?"})

'What is the treatment for HIV?'

In [72]:
# English -> Vietnamese: prompt, then the chat model, then plain-string output.
translate_en_to_vi_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that translates English to Vietnamese. Only provide the translated text, do not include any additional information.",
        ),
        (
            "user",
            "Translate the following English text to Vietnamese: {text}",
        ),
    ]
)

translate_en_to_vi_chain = translate_en_to_vi_prompt.pipe(llm, StrOutputParser())

## Retrieval

In [79]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System instructions for the answering model. The triple-quoted block and the
# "{context}" literal below are adjacent string literals, so Python joins them
# into one template string that ends with the {context} placeholder.
system_prompt = (
    '''
    You are a medical question-answering assistant.  
    Use the retrieved context as your primary source of truth.  
    - If the context does not provide enough information, say "I don't know" or supplement with medically sound general knowledge.  
    - Provide clear, medically accurate explanations. Responses should be detailed as much as the provided context.
    - Vary your wording so repeated questions do not produce identical answers.  
    - When the question involves diagnosis, treatment, or personal medical decisions, remind the user to consult a qualified healthcare professional.  
    '''
    # Placeholder filled with the retrieved context when the prompt is formatted.
    "{context}"
)

# Two-message chat prompt: the system message carries the instructions plus
# {context}; the human message carries the user's question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [80]:
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    You are a medical question-answering assistant.  \n    Use the retrieved context as your primary source of truth.  \n    - If the context does not provide enough information, say "I don\'t know" or supplement with medically sound general knowledge.  \n    - Provide clear, medically accurate explanations. Responses should be detailed as much as the provided context.\n    - Vary your wording so repeated questions do not produce identical answers.  \n    - When the question involves diagnosis, treatment, or personal medical decisions, remind the user to consult a qualified healthcare professional.  \n    {context}'), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, tem

In [75]:
#Legacy Langchain

# question_answer_chain = create_stuff_documents_chain(llm, prompt)
# rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [67]:
# New Langchain (without translation)
from langchain_core.runnables import RunnablePassthrough

# RunnablePassthrough.assign keeps the incoming {"input": question} dict and
# adds a "context" key, so the prompt's {input} slot receives the question
# string itself. (A bare RunnablePassthrough() under the "input" key forwards
# the whole input dict, which is then rendered into the human message.)
# The retrieved documents are joined on their page_content so the prompt gets
# plain text rather than the repr of a list of Document objects.
rag_chain = (
    RunnablePassthrough.assign(
        context=lambda x: "\n\n".join(
            doc.page_content for doc in retriever.invoke(x["input"])
        )
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [31]:
# Ask an English question; the chain takes a dict with an "input" key and
# returns the answer as a string.
response = rag_chain.invoke({"input": "What is HIV?"})
print(response)

HIV, or human immunodeficiency virus, is a transmissible retrovirus that causes AIDS in humans. Two forms of HIV are recognized: HIV-1, which causes most cases of AIDS in Europe, North and South America, and most parts of Africa; and HIV-2, which is chiefly found in West African patients. HIV-2 appears to be less virulent than HIV-1.


In [81]:
# New Langchain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables import RunnableLambda
from operator import itemgetter

# Pipeline: Vietnamese question -> English question -> retrieval + English
# answer -> Vietnamese answer. Input is a dict with a "text" key.
# The translation chains are composed directly with `|` so every stage is a
# visible step of one runnable, rather than being hidden behind nested
# .invoke() calls inside lambdas.
# Retrieved documents are joined on their page_content so the prompt receives
# plain text instead of the repr of a list of Document objects.
rag_chain = (
    {"text": itemgetter("text")}
    | translate_vi_to_en_chain
    | RunnableLambda(lambda question: {"input": question})
    | RunnablePassthrough.assign(
        context=itemgetter("input")
        | retriever
        | RunnableLambda(lambda docs: "\n\n".join(doc.page_content for doc in docs))
    )
    | prompt
    | llm
    | StrOutputParser()
    | RunnableLambda(lambda answer: {"text": answer})
    | translate_en_to_vi_chain
)

In [82]:
# Ask a Vietnamese question; this chain takes a dict with a "text" key and
# returns the answer translated back to Vietnamese.
response = rag_chain.invoke({"text": "Bệnh ung thư là gì?"})
print(response)


Ung thư được đặc trưng bởi sự phân chia không kiểm soát của các tế bào ác tính. Ví dụ, ung thư vú là do sự phát triển của các tế bào ác tính trong vú, bắt nguồn từ niêm mạc của các tuyến sữa hoặc ống dẫn sữa của vú.
