In [None]:
import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings

: 

In [None]:
# Load PDFs from directory
def load_pdf_file(data_dir):
    """Load the PDF files found in ``data_dir`` as LangChain documents.

    Args:
        data_dir: Folder to scan; files are selected with the ``*.pdf`` glob
            and each one is parsed with ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    pdf_loader = DirectoryLoader(data_dir, loader_cls=PyPDFLoader, glob="*.pdf")
    return pdf_loader.load()

# Directory holding the source PDFs. Set the MEDICAL_CHATBOT_DATA_DIR environment
# variable to point the notebook at another location; when it is unset the
# original author's local path is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    "MEDICAL_CHATBOT_DATA_DIR",
    r'C:\Users\DELL\OneDrive\Desktop\Hackathon\Doc_Chatbot\Medical-Chatbot\Data',
)

extracted_data = load_pdf_file(DATA_DIR)

In [None]:
# Split the documents into chunks
def text_split(extracted_data):
    """Cut the loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).

    Returns:
        The output of ``RecursiveCharacterTextSplitter.split_documents`` using
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_overlap=20, chunk_size=500)
    chunks = chunker.split_documents(extracted_data)
    return chunks


In [None]:
# Chunk the loaded documents with text_split and report how many chunks resulted.
text_chunks = text_split(extracted_data)
print("Number of chunks:", len(text_chunks))

In [None]:
# Use sentence-transformers model
def download_hugging_face_embeddings():
    """Create the embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    model_id = "sentence-transformers/all-MiniLM-L6-v2"
    embedding_model = HuggingFaceEmbeddings(model_name=model_id)
    return embedding_model

In [None]:
# Build the embedding model once; `embeddings` is reused by the embedding
# sanity check and by the FAISS vector-store construction in later cells.
embeddings = download_hugging_face_embeddings()

In [None]:
# Sanity check: embed a short string and print the length of the returned vector.
query_result = embeddings.embed_query("Hello World")
print("Embedding dimension:", len(query_result))

In [None]:
from langchain.vectorstores import FAISS

In [None]:

# Create FAISS vector store from the text chunks, using `embeddings` as the
# embedding function.
docsearch = FAISS.from_documents(documents=text_chunks, embedding=embeddings)

In [None]:
# Expose the vector store as a retriever configured for similarity search with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load GPT-2 model and tokenizer by the "gpt2" model name.
# NOTE(review): from_pretrained presumably downloads/caches the files on first
# use — confirm before running in an offline environment.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

In [None]:
# Create generation pipeline
#
# `max_new_tokens` is used instead of `max_length`: `max_length` caps prompt +
# completion together, and the RAG prompt built later (system text plus three
# retrieved chunks of up to 500 characters) can easily exceed 300 tokens on its
# own, which makes generation fail or produce nothing depending on the
# transformers version. `max_new_tokens` bounds only the generated answer.
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=150,
    # Sampling is enabled explicitly so temperature/top_k take effect regardless
    # of the defaults stored in the model's configuration.
    do_sample=True,
    temperature=0.7,
    top_k=50,
    # GPT-2 defines no pad token; reuse EOS so generate() has an explicit one.
    pad_token_id=tokenizer.eos_token_id,
)

In [None]:
from langchain_core.language_models import BaseLanguageModel

from typing import Any, List, Optional

from langchain_core.language_models.llms import LLM


class HuggingFacePipelineLLM(LLM):
    """LangChain LLM wrapper around a Hugging Face ``text-generation`` pipeline.

    The class derives from ``LLM`` (rather than the abstract
    ``BaseLanguageModel``) so that it only has to implement ``_call`` and
    ``_llm_type``; ``invoke`` and the rest of the Runnable interface needed by
    ``create_stuff_documents_chain`` are inherited. ``invoke`` still accepts a
    plain string prompt and returns the generated text as a string.

    Attributes:
        pipeline: Callable pipeline object; it is invoked as
            ``pipeline(prompt, return_full_text=False)`` and must return a list
            whose first item has a ``"generated_text"`` key.
    """

    # Declared as a model field so the pydantic-based base class accepts it.
    pipeline: Any = None

    def __init__(self, pipeline: Any, **kwargs: Any):
        """Store ``pipeline``; kept positional for backward compatibility."""
        super().__init__(pipeline=pipeline, **kwargs)

    @property
    def _llm_type(self) -> str:
        """Identifier LangChain uses to label this LLM implementation."""
        return "huggingface_pipeline"

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Any = None,
        **kwargs: Any,
    ) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Fully formatted prompt text.
            stop: Optional stop sequences; the completion is cut at the
                earliest occurrence of any of them.
            run_manager: Callback manager supplied by LangChain (unused).
            **kwargs: Extra call options supplied by LangChain (unused).

        Returns:
            The generated text without the prompt.
        """
        text = self.pipeline(prompt, return_full_text=False)[0]['generated_text']
        if stop:
            # Truncate at the first stop sequence that appears, if any.
            cut_points = [text.find(token) for token in stop if token and token in text]
            if cut_points:
                text = text[:min(cut_points)]
        return text


llm = HuggingFacePipelineLLM(llm_pipeline)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Prompt with system message; the retrieved documents are substituted for
# {context} and the user's question for {input}.
system_prompt = (
    "You are an assistant for medical question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n\n{context}"
)

# (role, template) pairs in conversation order.
_prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(_prompt_messages)

In [None]:
# Create LangChain-compatible RAG chain:
#   question_answer_chain combines `llm` with `prompt`;
#   rag_chain pairs `retriever` with that question-answering chain.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Invoke the RAG chain with a sample question passed under the "input" key,
# then print the "answer" entry of the returned mapping.
question = "What is Acne"
response = rag_chain.invoke({"input": question})
print("Answer:", response["answer"])
