In [None]:
!pip install langchain==0.1.13 langchain-community langchain-google-genai sentence-transformers chromadb pypdf reportlab

Upload PDFs from Your Local Machine

In [None]:
from google.colab import files
import shutil
import os

# Folder that receives the uploaded papers.
PAPERS_DIR = "/content/papers"

# Start from a clean folder so files from an earlier run are not mixed in.
if os.path.exists(PAPERS_DIR):
    shutil.rmtree(PAPERS_DIR)
os.makedirs(PAPERS_DIR, exist_ok=True)

# Upload files
uploaded = files.upload()

# Move only valid non-empty PDFs into the folder, remembering which ones were kept
# so the final message reflects what actually happened.
accepted_files = []
for filename in uploaded.keys():
    is_pdf = filename.lower().endswith('.pdf')
    # The existence check avoids a FileNotFoundError from getsize() if the
    # uploaded file is not present under the reported name.
    if is_pdf and os.path.exists(filename) and os.path.getsize(filename) > 0:
        shutil.move(filename, os.path.join(PAPERS_DIR, filename))
        accepted_files.append(filename)
    else:
        print(f"Skipped invalid or empty file: {filename}")

if accepted_files:
    print(f"Files uploaded successfully: {len(accepted_files)} PDF(s) stored in {PAPERS_DIR}.")
else:
    print("No valid PDF files were uploaded.")

Load and Process PDF Documents

In [None]:
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import glob
from pypdf.errors import PdfReadError

# Helper: remove truly empty PDFs before loading
for pdf_file in glob.glob("/content/papers/*.pdf"):
    if os.path.getsize(pdf_file) == 0:
        print(f"Removing empty file: {pdf_file}")
        os.remove(pdf_file)

# Load PDFs. `docs` is initialised first so that a read failure leaves it defined
# (as an empty list) instead of causing a NameError in the steps below.
loader = PyPDFDirectoryLoader("/content/papers")
docs = []
try:
    docs = loader.load()
except PdfReadError as e:
    print(f"Error reading a PDF: {e}")
    print("No pages were loaded; remove or replace the unreadable PDF and re-run this cell.")

# Add metadata used later for source citations
for i, doc in enumerate(docs):
    source = doc.metadata.get('source', f'doc_{i}.pdf')
    doc.metadata['filename'] = os.path.basename(source)
    # The PyPDF loader reports a 0-based page index; store a 1-based number so
    # citations read "Page 1" for the first page. When the loader supplied no
    # page, fall back to the document's 1-based position as before.
    raw_page = doc.metadata.get('page')
    doc.metadata['page'] = raw_page + 1 if isinstance(raw_page, int) else i + 1

# Split into text chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)
chunks = text_splitter.split_documents(docs)

if not docs:
    print("Warning: no pages were loaded from /content/papers.")

print(f"Loaded {len(docs)} pages and split into {len(chunks)} chunks.")
print("Final list of files to process:")
print(os.listdir("/content/papers"))

Generate Embeddings and Build Vectorstore

In [None]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

# Guard against an empty corpus so the problem is reported here, with an
# actionable message, instead of surfacing from inside the vector store call.
if not chunks:
    raise ValueError(
        "No text chunks available to index. Upload at least one readable PDF "
        "and re-run the loading cell."
    )

# Load embedding model
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Create vectorstore from chunks
vectorstore = Chroma.from_documents(chunks, embeddings)

# Create retriever that returns the 5 best-matching chunks per query
retriever = vectorstore.as_retriever(search_kwargs={'k': 5})

Set Up the Gemini API and LLM

In [None]:
import os

from getpass import getpass

# Never hard-code the API key in the notebook: anyone who can read the file can
# use it. Take it from the environment if already set, otherwise prompt for it
# without echoing the value into the notebook.
# NOTE(review): the key that was previously committed here should be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

from langchain_google_genai import ChatGoogleGenerativeAI

# Initialize LLM
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.5)

Define Prompt and Context Formatter

In [None]:
from langchain.prompts import ChatPromptTemplate

# Build the prompt context and the citation set from retrieved documents
def prepare_context_with_sources(documents):
    """Turn retrieved documents into a context string plus their citations.

    Each document must expose a ``metadata`` mapping and a ``page_content``
    string. Missing ``filename`` / ``page`` metadata fall back to
    ``"unknown_file"`` / ``"N/A"``.

    Returns:
        A ``(context, citations)`` tuple: ``context`` holds one
        ``[filename, Page page]: text`` entry per document, separated by blank
        lines, with the text stripped and its newlines replaced by spaces;
        ``citations`` is the set of distinct ``(filename, page)`` pairs.
    """
    labelled = [
        (
            item.metadata.get("filename", "unknown_file"),
            item.metadata.get("page", "N/A"),
            item.page_content.strip().replace("\n", " "),
        )
        for item in documents
    ]

    context = "\n\n".join(
        f"[{name}, Page {number}]: {text}" for name, number, text in labelled
    )
    citations = {(name, number) for name, number, _ in labelled}
    return context, citations

# Define prompt template
# The template has two placeholders: {context} receives the formatted retrieved
# passages and {query} receives the user's question. The instructions tell the
# model to answer only from the supplied context.
template = """
<context>
{context}
</context>

You are an AI assistant answering questions based on academic papers.
Answer the following question truthfully and clearly using only the above context.
Do not hallucinate or make up information.

Question: {query}
"""

# Chat prompt object built from the template text above.
prompt = ChatPromptTemplate.from_template(template)

Define RAG Function & History Tracker

In [None]:
# Initialize history list (every answered question is appended here)
qa_history = []

# Function to run RAG + source citations
def rag_with_sources(query):
    """Answer ``query`` from the indexed papers and record the exchange.

    Retrieves the relevant chunks, formats them into the prompt context, asks
    the LLM, and appends the result to ``qa_history``.

    Args:
        query: The question text.

    Returns:
        A dict with keys ``"question"``, ``"answer"`` (stripped response text)
        and ``"sources"`` (a list of ``"<file>, Page <page>"`` strings in a
        stable, sorted order).
    """
    docs = retriever.get_relevant_documents(query)
    context, sources = prepare_context_with_sources(docs)

    inputs = {"context": context, "query": query}
    answer = llm.invoke(prompt.format_prompt(**inputs).to_messages())

    def _citation_order(citation):
        # Sort by file name, then numeric pages in numeric order, then any
        # non-numeric page labels (e.g. "N/A"); the middle flag keeps numbers
        # and strings from ever being compared with each other.
        file, page = citation
        is_number = isinstance(page, (int, float))
        return (str(file), 0 if is_number else 1, page if is_number else str(page))

    # `sources` is a set, so sort it to make the citation order deterministic.
    formatted_sources = [
        f"{file}, Page {page}" for file, page in sorted(sources, key=_citation_order)
    ]
    qa_entry = {
        "question": query,
        "answer": answer.content.strip(),
        "sources": formatted_sources
    }
    qa_history.append(qa_entry)

    return qa_entry

Test with Sample Questions and Interactive Q&A

In [None]:
sample_questions = [
    "What are the main components of a RAG model, and how do they interact?",
    "What are the two sub-layers in each encoder layer of the Transformer model?",
    "Explain how positional encoding is implemented in Transformers and why it is necessary.",
    "Describe the concept of multi-head attention in the Transformer architecture. Why is it beneficial?",
    "What is few-shot learning, and how does GPT-3 implement it during inference?"
]


def _print_result(result):
    """Print one Q&A entry (question, answer and comma-separated sources)."""
    print(f"\nQ: {result['question']}\nA: {result['answer']}\nSources: {', '.join(result['sources'])}\n")


for q in sample_questions:
    result = rag_with_sources(q)
    _print_result(result)

# Interactive loop
import sys  # not used in this cell; kept in case other cells rely on it
while True:
    try:
        # Strip surrounding whitespace so inputs like " exit " are recognised
        # and blank lines are ignored.
        user_input = input("Ask a question (or type 'exit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # End the session cleanly when input is closed or the cell is interrupted.
        print("Exiting Q&A.")
        break
    if user_input.lower() == "exit":
        print("Exiting Q&A.")
        break
    if not user_input:
        continue
    result = rag_with_sources(user_input)
    _print_result(result)