In [None]:
# Multimodal RAG agent: PDFs → answers with citations
# Pipeline components:
# 1. PDF loading and chunking
# 2. LangChain for the chaining logic
# 3. Chroma as the vector database
# 4. SentenceTransformers for the text embeddings

In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load PDF files from a folder
import os
# Folder holding one company's PDF reports. The raw string keeps the Windows
# backslashes literal.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
folder_path = r"D:\4-IntoCode\16_LangChain\AgilProjekt_multiModel\Raw_Data\Apple"

# Select PDFs only (case-insensitive extension) and sort them so that the chunk
# order, and therefore the index contents, is reproducible across runs.
pdf_files = sorted(
    name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
)

all_docs = []
for file in pdf_files:
    loader = PyPDFLoader(os.path.join(folder_path, file))
    # load() returns one Document per PDF page. load_and_split() would already
    # run a default text splitter, so len(all_docs) would no longer be a page
    # count and the text would be split twice.
    pages = loader.load()
    all_docs.extend(pages)

# Split the pages into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_documents(all_docs)

# Report the number of PDFs actually loaded, not every entry in the folder.
print(f"Loaded {len(docs)} chunks from {len(all_docs)} pages across {len(pdf_files)} PDF files.")
# Output recorded from an earlier run (counts may differ after re-running):
# "Loaded 4419 chunks from 1347 pages across 22 PDF files."

Loaded 4419 chunks from 1347 pages across 22 PDF files.


In [14]:
# Create Embeddings & Save in Chroma Vector Store
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

persist_directory = "./chroma_db"

# Create or load the vector store.
# Chroma.from_documents() always *adds* the given documents to the persisted
# collection, so calling it unconditionally would store every chunk again on
# each re-run of this cell. Open the persisted collection first and only build
# it when it is still empty.
# To rebuild after the PDFs change, delete the persist directory and re-run.
vectorstore = Chroma(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
)

if not vectorstore.get(limit=1)["ids"]:
    # Empty collection: embed and store all chunks.
    vectorstore = Chroma.from_documents(
        documents=docs,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )



In [None]:
# Build the Retrieval QA Chain with proper citations
# Load Model via transformers and Wrap in LangChain
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Load model & tokenizer
# --- 1. Load Model with CPU ---
model_name = "google/flan-t5-base"
# Lightweight seq2seq model, good for basic Q&A. No device is passed below, so
# the pipeline's default device is used (CPU in the original setup).

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    # When the prompt exceeds model_max_length, drop tokens from the start so
    # the question at the end of the prompt survives truncation.
    truncation_side="left",
    model_max_length=512,  # Explicitly set the tokenizer's max input length
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Wrap model and tokenizer into a generation pipeline.
# Only max_new_tokens bounds the output length. The original also passed
# max_length=512, which conflicts with max_new_tokens and, for an
# encoder-decoder model, limits the generated sequence rather than
# "input + output". Input length is bounded by the tokenizer's
# model_max_length together with truncation=True.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,      # Limit response length
    temperature=0.3,         # Balance creativity/factuality
    truncation=True,         # Truncate over-long prompts instead of failing
    do_sample=True,          # Sampling is stochastic: answers vary between runs
    repetition_penalty=1.1,  # Reduce repetition
)

# Wrap the pipeline in a LangChain LLM so it can be used inside chains.
llm = HuggingFacePipeline(pipeline=pipe)

# Custom prompt
# --- 3. Prompt with Citation Instructions ---
# Prompt text, assembled line by line. The first line deliberately ends with a
# space before the newline, matching the original template exactly.
prompt_template = (
    "Answer the question concisely using ONLY the provided context. \n"
    "For each fact, cite its source number like [1][2]. If unsure, say \"I don't know\".\n"
    "\n"
    "Context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "Answer:"
)

# {context} receives the retrieved chunks, {question} the user query.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# --- 4. QA Chain with Shortened Context ---
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",  # default: all retrieved chunks go into one prompt
    retriever=vectorstore.as_retriever(
        search_type="mmr",  # Max marginal relevance for diversity
        search_kwargs={
            "k": 4,         # Number of chunks handed to the LLM
            "fetch_k": 20,  # Candidate pool MMR selects from (library default)
            # "score_threshold" was removed: it is not an MMR parameter and is
            # only honoured by search_type="similarity_score_threshold", so it
            # never filtered anything here.
        },
    ),
    chain_type_kwargs={
        "prompt": PROMPT,
        "verbose": True,  # Helpful for debugging
    },
    return_source_documents=True,  # Needed to print the citation list
)

# --- 5. Query Execution with Error Handling ---
import traceback


def format_answer_with_sources(answer, sources):
    """Append a numbered "Sources" list to an answer.

    Args:
        answer: The generated answer text.
        sources: Iterable of retrieved documents; each must expose a
            ``metadata`` dict. ``source`` and ``page`` keys are used if present.

    Returns:
        ``answer`` unchanged when ``sources`` is empty, otherwise ``answer``
        followed by a blank line, "Sources:" and one "[i] source, page N" line
        per document.
    """
    if not sources:
        return answer

    lines = [answer, "", "Sources:"]
    for i, doc in enumerate(sources, start=1):
        source_info = doc.metadata.get('source', 'Unknown document')
        page = doc.metadata.get('page')
        if isinstance(page, int):
            # PyPDFLoader stores 0-based page indices; show the 1-based page
            # number a reader would look up in the PDF.
            page_info = f", page {page + 1}"
        elif page is not None:
            page_info = f", page {page}"
        else:
            page_info = ""
        lines.append(f"[{i}] {source_info}{page_info}")
    return "\n".join(lines)


try:
    query = "Summarize Apple's 2023 Q2 report, citing sources."
    result = qa_chain.invoke({"query": query})

    # Process and format the answer, then append the citation list.
    answer = format_answer_with_sources(
        result["result"].strip(),
        result["source_documents"],
    )

    print("Answer:\n", answer)

except Exception as e:
    # The broad catch keeps the notebook running, but the full traceback is
    # printed so the real cause (model, retriever, configuration) is not hidden
    # behind the generic hint.
    print(f"Error generating answer: {e}")
    print("Please try again with a more specific query.")
    traceback.print_exc()
