In [1]:
!pip install faiss-cpu
!pip install langchain
!pip install sentence-transformers
!pip install PyPDF2
!pip install transformers
!pip install datasets
!pip install tqdm
!pip install langchain-community
!pip install pypdf

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from to

In [None]:
import os
import glob
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
import numpy as np

class UltraSimpleRAG:
    def __init__(self, papers_folder="papers"):
        self.vectorstore = None
        self.documents = []
        self.papers_folder = papers_folder

    def load_pdfs_from_folder(self):
        """Load PDF files from the papers folder"""
        if not os.path.exists(self.papers_folder):
            print(f"❌ Papers folder '{self.papers_folder}' not found!")
            print(f"💡 Please create a '{self.papers_folder}' folder and add your PDF files there.")
            return []

        # Find all PDF files in the papers folder
        pdf_files = glob.glob(os.path.join(self.papers_folder, "*.pdf"))

        if not pdf_files:
            print(f"❌ No PDF files found in '{self.papers_folder}' folder!")
            return []

        print(f"📁 Found {len(pdf_files)} PDF files in '{self.papers_folder}' folder:")
        for pdf_file in pdf_files:
            print(f"   - {os.path.basename(pdf_file)}")

        docs = []
        for pdf_file in tqdm(pdf_files, desc="Loading PDFs"):
            try:
                loader = PyPDFLoader(pdf_file)
                loaded_docs = loader.load()
                docs.extend(loaded_docs)
                print(f"✅ Loaded: {os.path.basename(pdf_file)} ({len(loaded_docs)} pages)")
            except Exception as e:
                print(f"❌ Error loading {os.path.basename(pdf_file)}: {str(e)}")

        print(f"✅ Total loaded: {len(docs)} document pages")
        return docs

    def process_documents(self, documents):
        """Split documents into small chunks"""
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=300,
            chunk_overlap=30,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
        )
        chunks = splitter.split_documents(documents)
        print(f"✅ Created {len(chunks)} chunks")
        return chunks

    def create_vectorstore(self, chunks):
        """Create vector store with embeddings"""
        print("🔧 Loading embedding model...")
        embeddings = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2",
            model_kwargs={'device': 'cpu'}
        )

        print("🔧 Creating vector store...")
        self.vectorstore = FAISS.from_documents(chunks, embeddings)
        self.documents = chunks
        print("✅ Vector store created!")

    def simple_qa(self, question, k=3):
        """Simple Q&A without complex chains"""
        if not self.vectorstore:
            print("❌ Please setup the system first!")
            return

        docs = self.vectorstore.similarity_search(question, k=k)

        context = "\n\n".join([doc.page_content for doc in docs])

        response = self.generate_simple_answer(question, context, docs)

        print(f"\n💬 Question: {question}")
        print("=" * 50)
        print("📝 Answer:")
        print(response)

        print("\n📚 Relevant Sections:")
        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get('source', 'Unknown')
            source_name = os.path.basename(source) if source != 'Unknown' else 'Unknown'
            content = doc.page_content[:150] + "..." if len(doc.page_content) > 150 else doc.page_content
            print(f"{i}. From {source_name}:")
            print(f"   {content}")
            print()

    def generate_simple_answer(self, question, context, docs):
        """Generate a simple answer based on context"""
        question_lower = question.lower()
        context_lower = context.lower()

        if "what" in question_lower or "define" in question_lower:
            return f"Based on the documents, here's what I found:\n\n{context[:500]}..."

        elif "how" in question_lower:
            return f"According to the documents, here's the process/method:\n\n{context[:500]}..."

        elif "why" in question_lower:
            return f"The documents explain the reasoning as follows:\n\n{context[:500]}..."

        elif "summary" in question_lower or "summarize" in question_lower:
            sentences = context.split('.')
            key_sentences = [s.strip() for s in sentences if len(s.strip()) > 20][:3]
            return f"Summary of key points:\n\n" + "\n• ".join(key_sentences)

        elif "main topic" in question_lower or "about" in question_lower:
            return f"The main topics discussed in the documents include:\n\n{context[:400]}..."

        else:
            return f"Here's what I found related to your question:\n\n{context[:500]}..."

    def interactive_qa(self):
        """Start interactive Q&A session"""
        print("\n" + "=" * 60)
        print("🎯 Interactive Q&A Session Started!")
        print("💡 This uses simple keyword matching - ask clear questions!")
        print("Examples: 'What is...?', 'Summarize the main points', 'What are the key topics?'")
        print("Type 'quit' to exit")
        print("=" * 60)

        while True:
            question = input("\n❓ Your question: ").strip()

            if question.lower() in ['quit', 'exit', 'q']:
                print("👋 Session ended!")
                break

            if not question:
                continue

            self.simple_qa(question)

    def search_documents(self, query, k=5):
        """Search for relevant document sections"""
        if not self.vectorstore:
            print("❌ Please setup the system first!")
            return

        docs = self.vectorstore.similarity_search(query, k=k)

        print(f"\n🔍 Search Results for: '{query}'")
        print("=" * 60)

        for i, doc in enumerate(docs, 1):
            source = doc.metadata.get('source', 'Unknown')
            source_name = os.path.basename(source) if source != 'Unknown' else 'Unknown'
            content = doc.page_content
            print(f"{i}. From {source_name}:")
            print(f"   {content}")
            print("-" * 40)

    def list_available_papers(self):
        """List all available PDF papers in the folder"""
        if not os.path.exists(self.papers_folder):
            print(f"❌ Papers folder '{self.papers_folder}' not found!")
            return

        pdf_files = glob.glob(os.path.join(self.papers_folder, "*.pdf"))

        if not pdf_files:
            print(f"❌ No PDF files found in '{self.papers_folder}' folder!")
            return

        print(f"\n📚 Available papers in '{self.papers_folder}' folder:")
        print("=" * 50)
        for i, pdf_file in enumerate(pdf_files, 1):
            file_size = os.path.getsize(pdf_file) / (1024 * 1024)
            print(f"{i}. {os.path.basename(pdf_file)} ({file_size:.2f} MB)")

def quick_setup(papers_folder="papers"):
    """Quick setup for RAG system"""
    rag = UltraSimpleRAG(papers_folder)

    rag.list_available_papers()

    # Load documents from folder
    docs = rag.load_pdfs_from_folder()
    if not docs:
        print("❌ No documents loaded!")
        return None

    # Process documents
    chunks = rag.process_documents(docs)

    # Create vector store
    rag.create_vectorstore(chunks)

    return rag

# Main execution
if __name__ == "__main__":
    print("🚀 Ultra-Simple RAG System - Papers Folder Version!")
    print("✨ Reads PDFs from 'papers' folder in your directory!")
    print("=" * 60)

    papers_folder = "papers"

    rag_system = quick_setup(papers_folder)

    if rag_system:
        print("\n🎉 Setup complete! Choose an option:")
        print("1. Interactive Q&A")
        print("2. Document Search")
        print("3. List available papers")

        choice = input("\nEnter choice (1, 2, or 3): ").strip()

        if choice == "2":
            while True:
                query = input("\n🔍 Search query (or 'quit'): ").strip()
                if query.lower() in ['quit', 'exit', 'q']:
                    break
                if query:
                    rag_system.search_documents(query)
        elif choice == "3":
            rag_system.list_available_papers()
        else:
            rag_system.interactive_qa()
    else:
        print("❌ Setup failed. Please check your papers folder and try again.")
        print(f"💡 Make sure you have a '{papers_folder}' folder with PDF files in your current directory.")

def ask_question(rag_system, question):
    """Helper function to ask a single question"""
    rag_system.simple_qa(question)

def search_docs(rag_system, query):
    """Helper function to search documents"""
    rag_system.search_documents(query)

def create_rag_system(papers_folder="papers"):
    """Helper function to create RAG system with custom folder"""
    return quick_setup(papers_folder)

🚀 Ultra-Simple RAG System - Papers Folder Version!
✨ Reads PDFs from 'papers' folder in your directory!

📚 Available papers in 'papers' folder:
1. 2005.14165v4.pdf (6.45 MB)
2. 2005.11401v4.pdf (0.84 MB)
3. 1706.03762v7.pdf (2.11 MB)
📁 Found 3 PDF files in 'papers' folder:
   - 2005.14165v4.pdf
   - 2005.11401v4.pdf
   - 1706.03762v7.pdf


Loading PDFs:  33%|███▎      | 1/3 [00:01<00:03,  1.84s/it]

✅ Loaded: 2005.14165v4.pdf (75 pages)


Loading PDFs:  67%|██████▋   | 2/3 [00:02<00:01,  1.14s/it]

✅ Loaded: 2005.11401v4.pdf (19 pages)


Loading PDFs: 100%|██████████| 3/3 [00:03<00:00,  1.26s/it]

✅ Loaded: 1706.03762v7.pdf (15 pages)
✅ Total loaded: 109 document pages
✅ Created 1410 chunks
🔧 Loading embedding model...



  embeddings = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

🔧 Creating vector store...
✅ Vector store created!

🎉 Setup complete! Choose an option:
1. Interactive Q&A
2. Document Search
3. List available papers

Enter choice (1, 2, or 3): 1

🎯 Interactive Q&A Session Started!
💡 This uses simple keyword matching - ask clear questions!
Examples: 'What is...?', 'Summarize the main points', 'What are the key topics?'
Type 'quit' to exit

❓ Your question: Summarize the main points

💬 Question: Summarize the main points
📝 Answer:
Summary of key points:

[28] Romain Paulus, Caiming Xiong, and Richard Socher
• A deep reinforced model for abstractive
summarization
• arXiv preprint arXiv:1705

📚 Relevant Sections:
1. From 1706.03762v7.pdf:
   [28] Romain Paulus, Caiming Xiong, and Richard Socher. A deep reinforced model for abstractive
summarization. arXiv preprint arXiv:1705.04304, 2017.
[...

2. From 2005.14165v4.pdf:
   anything from correcting grammar, to generating examples of an abstract concept, to critiquing a short story. For many
of these tasks