In [1]:
import argparse
import os
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.agents import Tool, initialize_agent, AgentType
from langchain.schema import Document
from langchain_google_genai import ChatGoogleGenerativeAI

In [2]:
# Shared chat model used by every tool and chain in this notebook.
# temperature=0 is passed so the model is asked for its least-random output.
# The API key is read from the GOOGLE_API_KEY environment variable rather than
# being written into the notebook, so it cannot leak when the file is shared.
# A missing variable fails immediately with a KeyError naming the variable.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-latest",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

In [3]:
# Shared embedding model; build_or_load_vs passes it to every Chroma store as
# its embedding_function.
EMBEDDINGS = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  EMBEDDINGS = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [4]:
# Directory handed to Chroma as persist_directory (relative to the working directory).
CHROMA_DIR = "./vector_store"
# Collection that ingest() writes document chunks to and rag_retriever() reads from.
RAG_COLLECTION = "research_rag"
# Collection used by memory_write() / memory_search() for free-form notes.
MEM_COLLECTION = "long_term_memory"

In [5]:
def split_docs(
    raw_docs: List[Document],
    chunk_size: int = 1000,
    chunk_overlap: int = 150,
) -> List[Document]:
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        raw_docs: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size``. Defaults to 1000,
            the value this function previously hard-coded.
        chunk_overlap: Passed to the splitter as ``chunk_overlap``. Defaults to
            150, the value this function previously hard-coded.

    Returns:
        The documents produced by ``splitter.split_documents(raw_docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(raw_docs)

In [6]:
def PDF_to_docs(fp: str) -> List[Document]:
    """Load the PDF at ``fp`` with PyPDFLoader and return it split into chunks."""
    loaded = PyPDFLoader(fp).load()
    chunks = split_docs(loaded)
    return chunks

In [7]:
def DOCX_to_docs(fp: str) -> List[Document]:
    """Load the Word file at ``fp`` with Docx2txtLoader and return it split into chunks."""
    loaded = Docx2txtLoader(fp).load()
    chunks = split_docs(loaded)
    return chunks

In [24]:
def _load_supported_file(fp: str) -> List[Document]:
    """Load one file by extension; return an empty list for unsupported types."""
    lowered = fp.lower()
    if lowered.endswith(".pdf"):
        return PDF_to_docs(fp)
    # NOTE(review): ".doc" is routed to the same loader as ".docx"; confirm that
    # Docx2txtLoader can actually read legacy .doc files.
    if lowered.endswith((".docx", ".doc")):
        return DOCX_to_docs(fp)
    return []


def load_docs(path: str) -> List[Document]:
    """Load and chunk every supported document under ``path``.

    Args:
        path: A single file, or a directory that is walked recursively.
            Files are selected by extension (.pdf, .docx, .doc, matched
            case-insensitively); anything else is skipped.

    Returns:
        The chunks from all matching files, in the order the files were visited.

    Raises:
        FileNotFoundError: If ``path`` does not exist. Previously a missing
            path silently produced an empty list, which hid typos in the path.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"No such file or directory: {path}")

    if os.path.isfile(path):
        return _load_supported_file(path)

    docs: List[Document] = []
    for root, _, files in os.walk(path):
        for f in files:
            docs.extend(_load_supported_file(os.path.join(root, f)))
    return docs

In [25]:
def build_or_load_vs(collection: str) -> Chroma:
    """Return a Chroma store for ``collection`` using the shared embeddings and CHROMA_DIR."""
    store_options = dict(
        collection_name=collection,
        embedding_function=EMBEDDINGS,
        persist_directory=CHROMA_DIR,
    )
    return Chroma(**store_options)

In [26]:
def memory_write(text: str):
    """Add ``text`` as one entry to the MEM_COLLECTION store, then persist the store."""
    memory_store = build_or_load_vs(MEM_COLLECTION)
    single_note = [text]
    memory_store.add_texts(single_note)
    memory_store.persist()

In [27]:
def memory_search(query: str, k: int = 5) -> List[str]:
    """Run a similarity search on the MEM_COLLECTION store.

    Args:
        query: Text to search for.
        k: Passed to ``similarity_search`` as the number of results requested.

    Returns:
        The ``page_content`` of each hit, in the order the store returned them.
    """
    memory_store = build_or_load_vs(MEM_COLLECTION)
    matches = memory_store.similarity_search(query, k=k)
    contents = []
    for match in matches:
        contents.append(match.page_content)
    return contents

In [28]:
def rag_retriever():
    """Return a retriever over the RAG_COLLECTION store, configured with k=5."""
    rag_store = build_or_load_vs(RAG_COLLECTION)
    retriever = rag_store.as_retriever(search_kwargs={"k": 5})
    return retriever

In [29]:
def tool_retrieve(query: str) -> str:
    """Answer ``query`` from the indexed documents and list the sources used.

    Builds a "stuff" RetrievalQA chain over ``rag_retriever()`` and runs it
    once.

    Args:
        query: The question to answer.

    Returns:
        A string of the form ``"Answer: <answer>\\n\\nSources:\\n<sources>"``,
        with one distinct source per line. A retrieved document with no
        "source" metadata is reported as "unknown".
    """
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=rag_retriever(),
        return_source_documents=True,
    )
    # invoke() replaces the deprecated direct-call form chain({...}).
    out = chain.invoke({"query": query})
    ans = out["result"]
    # De-duplicate through a set, then sort so the listing is stable between
    # runs; str() keeps the join from failing on a non-string source value.
    unique_sources = sorted(
        {str(d.metadata.get("source", "unknown")) for d in out["source_documents"]}
    )
    sources = "\n".join(unique_sources)
    return f"Answer: {ans}\n\nSources:\n{sources}"

In [30]:
def tool_summarize(path: str) -> str:
    """Summarize a single PDF or Word file with a two-stage (map/combine) prompt.

    The file is chunked, the chunks are summarized in batches, and the batch
    summaries are then combined by one final LLM call.

    Args:
        path: Path to the file. Surrounding whitespace and quote characters are
            stripped first, since a tool input supplied by the agent may be
            wrapped in them. Paths ending in ".pdf" (case-insensitive) use the
            PDF loader; every other path uses the Word loader.

    Returns:
        The combined summary text, or a short notice when no text could be
        extracted from the file.
    """
    chunks_per_batch = 6     # chunks joined into one summarization prompt
    max_prompt_chars = 6000  # cap on the text sent in a single LLM call

    path = path.strip().strip("'\"")
    docs = PDF_to_docs(path) if path.lower().endswith(".pdf") else DOCX_to_docs(path)
    chunks = [d.page_content for d in docs if d.page_content.strip()]
    if not chunks:
        # Without this guard the combine step would send the model an empty message.
        return "No text could be extracted from the file."

    summaries = []
    for i in range(0, len(chunks), chunks_per_batch):
        part = "\n\n".join(chunks[i:i + chunks_per_batch])
        resp = llm.invoke([
            ("system", "Summarize in bullet points <=120 words"),
            ("user", part[:max_prompt_chars]),
        ])
        summaries.append(resp.content)

    final_resp = llm.invoke([
        ("system", "Combine into one <=200 words"),
        ("user", "\n\n".join(summaries)[:max_prompt_chars]),
    ])
    return final_resp.content

In [31]:
def tool_memory_write(text: str) -> str:
    """Agent tool wrapper: store ``text`` via memory_write and return a confirmation."""
    memory_write(text)
    confirmation = "Saved to long-term memory."
    return confirmation

In [32]:
def tool_memory_search(query: str) -> str:
    """Agent tool wrapper: search memory and format the hits as a dash-prefixed list.

    Returns "No memory found." when the search yields nothing; otherwise a
    "Memories:" header followed by one "- <hit>" line per result.
    """
    found = memory_search(query)
    if not found:
        return "No memory found."
    lines = ["Memories:", *found]
    return "\n- ".join(lines)

In [33]:
def make_agent():
    """Build a zero-shot ReAct agent over the four tools defined in this notebook."""
    # (name, callable, description) for each tool exposed to the agent.
    tool_specs = (
        ("retrieve", tool_retrieve, "Answer research questions from indexed docs"),
        ("summarize_file", tool_summarize, "Summarize a PDF/DOC file"),
        ("memory_write", tool_memory_write, "Store a note in long-term memory"),
        ("memory_search", tool_memory_search, "Search notes in long-term memory"),
    )
    tools = [
        Tool(name=tool_name, func=tool_func, description=tool_desc)
        for tool_name, tool_func, tool_desc in tool_specs
    ]
    return initialize_agent(
        tools=tools,
        llm=llm,
        agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
        verbose=False,
    )

In [34]:
def ingest(path: str):
    """Load, chunk and index the documents under ``path`` into RAG_COLLECTION.

    Args:
        path: File or directory, passed straight to ``load_docs``.

    Prints the number of chunks ingested, or a notice when nothing was found.
    Chunks are added without explicit ids, so running this again on the same
    path adds the same chunks a second time.
    """
    docs = load_docs(path)
    if not docs:
        # Skip the store entirely rather than calling add_texts with empty lists.
        print(f"No supported documents found at {path!r}; nothing ingested.")
        return

    vs = build_or_load_vs(RAG_COLLECTION)
    vs.add_texts(
        [d.page_content for d in docs],
        metadatas=[d.metadata for d in docs],
    )
    vs.persist()
    print(f"Ingested {len(docs)} chunks.")

In [35]:
def chat_cli():
    """Run an interactive question/answer loop against the agent.

    The loop ends when the user types "exit" or "quit" (case-insensitive), or
    when input is closed or interrupted (EOF / Ctrl-C). Blank lines are
    ignored. An exception raised while answering is printed and the loop
    continues.
    """
    agent = make_agent()
    print("Type 'exit' to quit.")
    while True:
        try:
            q = input("You: ")
        except (EOFError, KeyboardInterrupt):
            # Treat a closed or interrupted input stream as a request to quit
            # instead of letting it surface as a traceback.
            print()
            break

        q = q.strip()
        if not q:
            # Nothing to ask; do not spend an agent call on an empty prompt.
            continue
        if q.lower() in {"exit", "quit"}:
            break

        try:
            print(agent.run(q))
        except Exception as e:
            # Deliberately broad: one failed turn should not end the session.
            print(f"[Error] {e}")

In [36]:
def summarize_cli(path: str):
    """Print the summary that tool_summarize produces for the file at ``path``."""
    summary_text = tool_summarize(path)
    print(summary_text)

In [38]:
# Index the research paper into the RAG collection.
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs on a machine where the file exists at this location.
ingest("C:\\Users\\LENOVO\\Desktop\\new\\main\\App\\Research_paper.pdf")

Ingested 36 chunks.


  vs.persist()


In [39]:
from langchain.chains import RetrievalQA

def summarize_docs() -> str:
    """Ask the LLM for a detailed summary of the ingested documents.

    Runs a "stuff" RetrievalQA chain over the RAG_COLLECTION store. The
    retriever is configured with k=5, so the summary is based on the chunks
    retrieved for the summary prompt, not on every ingested chunk.

    Returns:
        The chain's answer text.
    """
    vs = build_or_load_vs(RAG_COLLECTION)  # load the ingested vector store
    chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vs.as_retriever(search_kwargs={"k": 5}),
        chain_type="stuff",
    )
    # invoke() replaces the deprecated chain.run(); RetrievalQA takes the
    # question under "query" and returns the answer under "result".
    out = chain.invoke({"query": "Summarize the ingested document in detail."})
    return out["result"]

In [40]:
# Generate the summary of the ingested document and show it as the cell output.
summary = summarize_docs()
print(summary)

  return chain.run("Summarize the ingested document in detail.")


This document discusses the application of deep learning (DL) in healthcare, focusing on challenges and opportunities, particularly in the context of rare diseases.  It highlights the need for interpretable DL models, especially when dealing with small datasets, high class imbalance, and limited external validation data, common issues in rare disease research.

The text mentions that attention mechanisms within DL models offer a built-in interpretability advantage by dynamically weighting features during training, improving both transparency and performance in healthcare applications.  Several research papers are cited which explore these topics, including:

*   **Lee et al. [4]:** Emphasizes the need for interpretable DL models in low-data scenarios.
*   **Abdar et al. [25]:** Highlights techniques like reweighting and uncertainty quantification to address class imbalance in DL workflows.
*   **Esteva et al. [13]:** Provides a guide to deep learning in healthcare.
*   **Lundberg and L