In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains import create_retrieval_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
import pinecone
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [61]:
#Pinecone details
import os
from dotenv import load_dotenv

# Read settings from a local .env file (if present) into the process environment.
load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_index_name = os.getenv("PINECONE_INDEX_NAME")

# Fail right here, naming the variable, instead of letting a None value travel
# to a later cell where the error would no longer point at the real cause.
_missing_settings = [
    name
    for name, value in (
        ("PINECONE_API_KEY", pinecone_api_key),
        ("PINECONE_INDEX_NAME", pinecone_index_name),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )

In [3]:
def load_pdf(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``PyPDFDirectoryLoader``.

    Returns:
        The documents produced by ``PyPDFDirectoryLoader(data).load()``.
    """
    return PyPDFDirectoryLoader(data).load()

In [4]:
# Load everything under the relative "data" directory.
extracted_data = load_pdf(data="data")

In [5]:
# Sanity check: how many documents the loader returned.
len(extracted_data)

485

In [6]:
#Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to 20, the value previously hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [7]:
# Split the loaded documents into chunks, then display how many were produced.
text_chunks = text_split(extracted_data)
len(text_chunks)

3769

In [8]:
#download embedding model
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the Hugging Face embedding wrapper used to vectorise text.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook was written against.
            NOTE(review): a different model may produce vectors of a different
            size; confirm it matches the dimension of the Pinecone index.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [9]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_hugging_face_embeddings()

  emebeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 181.05it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [10]:
# Display the wrapper's repr to inspect the loaded model configuration.
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [23]:
import hashlib

# Publish the key loaded earlier; presumably the Pinecone integration reads it
# from the environment, since it is not passed explicitly below.
os.environ["PINECONE_API_KEY"] = pinecone_api_key

# Derive a stable id for every chunk from its text. Without explicit ids each
# run of this cell would insert the whole corpus again under fresh ids, so the
# index would fill with duplicates that crowd the top-k results. With stable
# ids, a re-run overwrites the same vectors instead.
# Chunks with identical text are told apart by an occurrence counter, which
# keeps the ids unique within one upload.
_occurrences = {}
chunk_ids = []
for _chunk in text_chunks:
    _digest = hashlib.sha256(_chunk.page_content.encode("utf-8")).hexdigest()
    _seen_before = _occurrences.get(_digest, 0)
    _occurrences[_digest] = _seen_before + 1
    chunk_ids.append(f"{_digest}-{_seen_before}")

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=pinecone_index_name,
    ids=chunk_ids,
)

In [42]:
# Prompt for the answering step. It exposes two template variables:
#   {context} - the retrieved text the model must ground its answer in
#   {input}   - the user's question (the key passed to rag_chain.invoke)
# The instructions ask for a fixed refusal sentence when the context does not
# contain the answer.
prompt = ChatPromptTemplate.from_template("""
You are an expert medical assistant.
Use the following pieces of context (retrieved from medical slides) to answer the question.

If the answer is not in the context, say "I cannot find the answer in the provided slides."
Keep your answer concise and clinical.

Context:
{context}

Question:
{input}
""")

In [62]:
# Groq credential, read from the environment (None when the variable is unset).
groq_api_key = os.environ.get("GROQ_API_KEY")

In [65]:
# Pass the key value itself. Wrapping it in os.getenv() would look up an
# environment variable *named after the secret*, which yields None (or raises
# TypeError when the key was never loaded).
llm = ChatGroq(
    model="openai/gpt-oss-120b",
    api_key=groq_api_key,
    temperature=0.5,
)

In [70]:
# Answering step: combines the prompt with the supplied documents and calls the LLM.
llm_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [71]:
# Retriever over the vector store, configured with k=6 results per query.
retriever = docsearch.as_retriever(search_kwargs=dict(k=6))

In [72]:
# Full pipeline: retrieve documents for the question, then run the answering chain.
rag_chain = create_retrieval_chain(retriever=retriever, combine_docs_chain=llm_chain)

In [73]:
# Interactive question/answer loop. Type "exit" (any letter case) to stop.
while True:
    try:
        # Strip surrounding whitespace so "exit " or " exit" still ends the chat.
        user_input = input("\nInput Prompt: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Interrupting the kernel or closing stdin ends the chat without a traceback.
        print("Closing the medical chatbot. Goodbye!")
        break

    if not user_input:
        # A blank prompt has nothing to retrieve against; ask again rather
        # than spending a retrieval and an LLM call on it.
        continue

    if user_input.lower() == "exit":
        print("Closing the medical chatbot. Goodbye!")
        break

    result = rag_chain.invoke({"input": user_input})

    # The chain's result holds the generated text under the "answer" key.
    print("Response:", result["answer"])

Response: I cannot find the answer in the provided slides.
Response: I cannot find the answer in the provided slides.
Response: I cannot find the answer in the provided slides.
Response: I cannot find the answer in the provided slides.
Response: Tuberculosis is a disease of the respiratory system.
Response: Tuberculosis infection of the pleura elicits a delayed‑type hypersensitivity reaction that produces a lymphocyte‑rich exudative effusion. The fluid is typically an exudate (high protein) with many lymphocytes; tubercle bacilli are seldom seen directly. The immune response leads to granulomatous inflammation, which can be confirmed by pleural biopsy. When the organism involves the spine, it causes tuberculous discitis with destruction of the inter‑vertebral disc and adjacent vertebral bodies, producing deformities such as a gibbus.
Response: When you are ready to finish the interview, follow these steps:

1. **Give a clear cue** – tell the patient that the session is coming to an end