# RAG Pipeline for Indian Legal Documents

## 1. Installation

In [None]:
!pip install langchain langchain-community pypdf chromadb jupyter langchain-google-genai langchain-chroma -q

## 2. Configuration

In [None]:
import os
from getpass import getpass

# Ask for the key when the variable is missing *or* blank: an exported-but-empty
# GOOGLE_API_KEY would otherwise be treated as configured and skip the prompt.
# getpass reads the value without echoing it.
if not os.environ.get("GOOGLE_API_KEY", "").strip():
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

## 3. Data Ingestion

In [None]:
import os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

# Directory that ingest_data() scans for PDF files (created if it is missing).
DATA_PATH = "data"
# Directory handed to Chroma as persist_directory for the vector store.
CHROMA_PATH = "chroma"

def _chunk_ids(chunks):
    """Return one deterministic, unique id per chunk.

    The id is a SHA-256 digest of the chunk's ``source`` and ``page`` metadata
    (empty when absent) plus its text, followed by an occurrence counter so
    that byte-identical chunks within one batch still get distinct ids.
    """
    import hashlib
    from collections import Counter

    occurrences = Counter()
    ids = []
    for chunk in chunks:
        metadata = chunk.metadata or {}
        key = "\x1f".join(
            (
                str(metadata.get("source", "")),
                str(metadata.get("page", "")),
                chunk.page_content,
            )
        )
        # errors="replace" keeps hashing from failing on malformed characters
        # that PDF text extraction may produce.
        digest = hashlib.sha256(key.encode("utf-8", errors="replace")).hexdigest()
        ids.append(f"{digest}-{occurrences[digest]}")
        occurrences[digest] += 1
    return ids


def ingest_data():
    """Load the PDFs under DATA_PATH, split them into chunks and index them.

    Behaviour:
      * If DATA_PATH does not exist it is created and the function returns,
        so the user can drop PDF files into it.
      * If no documents are loaded, or splitting yields no chunks, a message
        is printed and nothing is written.
      * Otherwise the chunks are embedded with Google Generative AI
        embeddings and stored in a Chroma collection persisted at
        CHROMA_PATH.

    Chunks are stored under deterministic ids (see ``_chunk_ids``) instead of
    the store's default generated ids. This function is called more than once
    in the notebook; with stable ids a repeated run is meant to overwrite the
    same entries rather than append duplicates (relies on langchain_chroma
    writing documents via upsert).

    Returns:
        None. Progress is reported with ``print``.
    """
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
        print(f"Created directory: {DATA_PATH} Awaiting PDF files...")
        return

    documents = PyPDFDirectoryLoader(DATA_PATH).load()
    if not documents:
        # Report the configured path rather than a hard-coded directory name.
        print(f"No documents found in the '{DATA_PATH}' directory.")
        return

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_documents(documents)
    if not chunks:
        # Nothing to embed (e.g. documents without extractable text); avoid
        # handing an empty batch to the embedding model and the vector store.
        print(f"No text could be extracted from the documents in '{DATA_PATH}'.")
        return

    print(f"Splitting {len(documents)} documents into {len(chunks)} chunks.")

    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    Chroma.from_documents(
        chunks,
        embeddings,
        ids=_chunk_ids(chunks),
        persist_directory=CHROMA_PATH,
    )

    print(f"Successfully ingested data and created a vector store at: {CHROMA_PATH}")

# Run ingestion when this code executes as the main module.
# NOTE(review): inside a Jupyter kernel __name__ is also "__main__", so this
# presumably triggers ingestion when the cell runs, in addition to the explicit
# call in section 4 — confirm that is intended.
if __name__ == "__main__":
    ingest_data()

## 4. Running the Ingestion

In [None]:
# Run the ingestion step defined in section 3 (reads DATA_PATH, writes CHROMA_PATH).
ingest_data()

## 5. RAG Chain with Gemini

In [14]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

def create_rag_chain():
    """Assemble the retrieval-augmented question-answering chain.

    The chain retrieves context from the Chroma store persisted at
    CHROMA_PATH (using the default retriever settings) and passes it,
    together with the user's question, to a Gemini chat model.

    Returns:
        The runnable built by ``create_retrieval_chain``. It is invoked with
        a dict containing an ``"input"`` key, matching the ``{input}``
        placeholder of the human message below.
    """
    # Prompt: fixed instructions followed by the retrieved context.
    instructions = " ".join(
        [
            "You are an assistant for question-answering tasks.",
            "Use the following pieces of retrieved context to answer the question.",
            "If you don't know the answer, just say that you don't know.",
            "Use three sentences maximum and keep the answer concise.",
        ]
    ) + "\n\n{context}"
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", instructions),
            ("human", "{input}"),
        ]
    )

    # Model and vector store; the store is opened with the same embedding
    # model name that the ingestion step uses.
    chat_model = ChatGoogleGenerativeAI(model="gemini-1.5-flash-latest")
    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    )

    # Stuff the retrieved documents into the prompt, then wire in retrieval.
    answer_chain = create_stuff_documents_chain(chat_model, qa_prompt)
    return create_retrieval_chain(store.as_retriever(), answer_chain)

# Build the chain once; it opens the Chroma store persisted under CHROMA_PATH.
rag_chain = create_rag_chain()

# The "input" key fills the {input} placeholder of the prompt's human message.
query = "What is the main point of the document?"
response = rag_chain.invoke({"input": query})

# Print the value stored under the "answer" key of the chain's response.
print(response["answer"])

The document outlines rules for legal pleadings, including how to handle documents,  state the effect of documents, and allege things like malice or notice.  It also addresses the organization and presentation of documents in legal proceedings.
