In [1]:
!pip install langchain langchain-community pypdf sentence-transformers faiss-cpu transformers

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting pypdf
  Downloading pypdf-6.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
INFO: pip is looking at multiple versions of langchain-community to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-community
  Downloading langchain_community-0.4-py3-none-any.whl.metadata (3.0 kB)
  Downloading langchain_community-0.3.31-py3-none-any.whl.metadata (3.0 kB)
Collecting requests<3,>=2 (from langchain)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain-community)
  Downl

In [None]:
import os
import logging
from google.colab import files
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class LocalRAGSystem:
    def __init__(self):
        self.documents = []
        self.vector_store = None
        self.embeddings = None
        self.llm = None
        self.qa_chain = None

    def upload_pdfs(self):
        uploaded = files.upload()
        pdf_paths = list(uploaded.keys())
        logger.info(f"Uploaded PDFs: {pdf_paths}")
        return pdf_paths

    from langchain_community.document_loaders import UnstructuredWordDocumentLoader, UnstructuredPowerPointLoader

    def load_documents(self, file_paths):
        for file_path in file_paths:
            if file_path.endswith(".pdf"):
                loader = PyPDFLoader(file_path)
            elif file_path.endswith(".docx"):
                loader = UnstructuredWordDocumentLoader(file_path)
            elif file_path.endswith(".pptx"):
                loader = UnstructuredPowerPointLoader(file_path)
            else:
                logger.warning(f"Unsupported file type: {file_path}")
                continue
            documents = loader.load()
            self.documents.extend(documents)
        logger.info(f"Loaded {len(self.documents)} pages in total.")

    def split_documents(self, chunk_size=1000, chunk_overlap=200):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.document_chunks = text_splitter.split_documents(self.documents)
        logger.info(f"Split into {len(self.document_chunks)} chunks.")

    def setup_embeddings(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
        self.embeddings = HuggingFaceEmbeddings(model_name=model_name)
        logger.info(f"Embedding model {model_name} loaded.")

    def create_vector_store(self):
        self.vector_store = FAISS.from_documents(
            self.document_chunks, self.embeddings)
        logger.info("Created the FAISS vector store.")

    def setup_local_llm(self, model_id="google/flan-t5-base", device="auto"):
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map=device)
        pipe = pipeline("text2text-generation", model=model,
                        tokenizer=tokenizer, max_new_tokens=512, temperature=0.7)
        self.llm = HuggingFacePipeline(pipeline=pipe)
        logger.info(f"Local LLM {model_id} ready.")

    def setup_qa_chain(self, k=3):
        self.qa_chain = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=self.vector_store.as_retriever(search_kwargs={"k": k})
        )
        logger.info(f"Retrieval QA chain set with top {k} documents retrieved.")

    def answer_question(self, question):
        answer = self.qa_chain.run(question)
        logger.info(f"Answered question: {question}")
        return answer

    def run_setup(self, chunk_size=1000, chunk_overlap=200, model_id="google/flan-t5-base", k=3):
        pdf_paths = self.upload_pdfs()
        self.load_documents(pdf_paths)
        self.split_documents(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        self.setup_embeddings()
        self.create_vector_store()
        self.setup_local_llm(model_id=model_id)
        self.setup_qa_chain(k=k)
        logger.info("RAG summarizer is ready to answer questions.")

In [31]:
rag = LocalRAGSystem()
rag.run_setup()

q1 = "What is the main topic of these documents?"
print(f"Q: {q1}\nA: {rag.answer_question(q1)}")

q2 = "Summarize the key points from the documents."
print(f"Q: {q2}\nA: {rag.answer_question(q2)}")

Saving grep_cheat_sheet.pdf to grep_cheat_sheet (1).pdf


Device set to use cpu


Q: What is the main topic of these documents?
A: GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Cheat Sheet - Twinkle Twinkle Little Star GREP Command Che