<a href="https://colab.research.google.com/github/SwayamChandak/Yardstick-/blob/main/RAG_MODEL_Yardstick.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install pinecone
!pip install langchain_community
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [7]:
import os
import google.generativeai as genai
from typing import List, Dict, Any
import pinecone
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader
import textwrap
from IPython.display import display, Markdown
from pinecone import Pinecone, ServerlessSpec

In [10]:
class GeminiRAGSystem:
    def __init__(self):
        GOOGLE_API_KEY = 'AIzaSyDaIXKMCPh4L4AmhG4-t3i1AsU8zmXpNFQ'
        genai.configure(api_key=GOOGLE_API_KEY)
        self.index_name = "yardstick"
        PINECONE_API_KEY = 'pcsk_2qJTjg_B6avJhYxa2KpAxPeqGeK1bokGKu9pWNx6RuxeYi2xo27noq7Ug87wFKHRKxumUn'

        self.model = genai.GenerativeModel('gemini-pro')
        self.pc = Pinecone(
            api_key=PINECONE_API_KEY
        )

        # Create index if it doesn't exist
        if self.index_name not in self.pc.list_indexes().names():
            self.pc.create_index(
                name=self.index_name,
                dimension=768,
                metric='cosine',
                spec=ServerlessSpec(
                    cloud='aws',
                    region='us-east-1'
                )
            )

        # Initialize the index attribute
        self.index = self.pc.Index(self.index_name)

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            length_function=len,
        )

    def load_documents(self, directory_path: str) -> List[str]:
        """Load documents from a directory."""
        pdf_loader = DirectoryLoader(
            directory_path,
            glob="**/*.pdf",
            loader_cls=PyPDFLoader
        )
        documents = pdf_loader.load()
        return [doc.page_content for doc in documents]

    def split_documents(self, documents: List[str]) -> List[str]:
        """Split documents into chunks."""
        return self.text_splitter.split_text('\n'.join(documents))

    def generate_embeddings(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings using Gemini's embedding model."""
        embeddings = []
        for chunk in texts:
            embedding = genai.embed_content(
                model="models/embedding-001",
                content=chunk,
                task_type="retrieval_document"
            )
            embeddings.append(embedding['embedding'])
        return embeddings

    def index_documents(self, directory_path: str) -> None:
        """Process and index documents."""
        try:
            print("Loading documents...")
            documents = self.load_documents(directory_path)

            print("Splitting documents...")
            chunks = self.split_documents(documents)

            print("Generating embeddings...")
            embeddings = self.generate_embeddings(chunks)

            print("Indexing in Pinecone...")
            batch_size = 100
            for i in range(0, len(chunks), batch_size):
                batch_chunks = chunks[i:i + batch_size]
                batch_embeddings = embeddings[i:i + batch_size]

                vectors = [
                    (str(j + i), embedding, {"text": chunk})
                    for j, (chunk, embedding) in enumerate(zip(batch_chunks, batch_embeddings))
                ]

                self.index.upsert(vectors=vectors)

            print("Indexing complete!")
        except Exception as e:
            print(f"Error during indexing: {str(e)}")
            raise

    def query(self, question: str, top_k: int = 3) -> str:
        """Query the RAG system using Gemini."""
        try:
            question_embedding = genai.embed_content(
                model="models/embedding-001",
                content=question,
                task_type="retrieval_query"
            )['embedding']

            search_results = self.index.query(
                vector=question_embedding,
                top_k=top_k,
                include_metadata=True
            )

            context = "\n\n".join([
                match.metadata["text"] for match in search_results.matches
            ])

            prompt = f"""Use the following context to answer the question. If the answer cannot be derived from the context, say "I cannot answer this question based on the available information."

Context:
{context}

Question: {question}

Answer:"""

            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            print(f"Error during query: {str(e)}")
            return "An error occurred while processing your query."


# Add necessary imports at the top of your notebook
import google.generativeai as genai
from pinecone import Pinecone, ServerlessSpec
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from IPython.display import display, Markdown

def test_rag_system():
    # Initialize the RAG system
    rag = GeminiRAGSystem()

    # Initialize documents (do this only once)
    print("Starting document indexing...")
    documents_path = "/content/drive/MyDrive/YardStick"  # Update this path as needed
    rag.index_documents(documents_path)
    print("Indexing complete! Ready for questions.\n")

    while True:
        # Get user input
        question = input("\nEnter your question (or type 'exit' to quit): ")

        # Check for exit condition
        if question.lower() == 'exit':
            print("Thank you for using the QA system!")
            break

        # Get and display answer
        try:
            answer = rag.query(question)
            print("\nAnswer:")
            display(Markdown(answer))
        except Exception as e:
            print(f"Error: {str(e)}")


https://www.researchgate.net/publication/342876160_Google

this research paper on google has been used as the database for the RAG Model


In [12]:
# Upload documents to Colab
from google.colab import files

# Run the test
if __name__ == "__main__":
    test_rag_system()

Starting document indexing...
Loading documents...
Splitting documents...
Generating embeddings...
Indexing in Pinecone...
Indexing complete!
Indexing complete! Ready for questions.


Enter your question (or type 'exit' to quit): who are the google founders 

Answer:


Larry Page and Sergey Brin


Enter your question (or type 'exit' to quit): what are the services provided by google

Answer:


Google offers a number of services, including:
- Gmail (email service)
- Google Docs (written documents)
- Google Drive (file storage)
- Google Hangouts (online chat and video conferencing)


Enter your question (or type 'exit' to quit): who is the contributor of the paper

Answer:


Gordon B. Schmidt


Enter your question (or type 'exit' to quit): exit
Thank you for using the QA system!
