In [None]:
!pip install langchain langchain-core langchain-google-genai langchain-cohere langchain-community pydantic python-dotenv chromadb pypdf



In [None]:
# Import libraries and configure the Google API key
import os
from dotenv import load_dotenv

try:
    # google.colab is only importable inside a Colab runtime; tolerate its
    # absence so the notebook can also run with the key supplied via '.env'.
    from google.colab import userdata
except ImportError:
    userdata = None


# Load environment variables from '.env' file
load_dotenv()

# Set Google API key: the Colab secret is used when Colab is available;
# otherwise keep whatever GOOGLE_API_KEY is already in the environment
# (for example, a value loaded from '.env' just above).
if userdata is not None:
    os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY1')

# Fail here with an actionable message rather than later inside an API call.
if not os.environ.get("GOOGLE_API_KEY"):
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add a Colab secret named 'GOOGLE_API_KEY1' "
        "or define GOOGLE_API_KEY in a '.env' file."
    )

In [None]:
from langchain_community.document_loaders import PyPDFLoader
# Chroma is imported from langchain_community (already a dependency of this
# notebook); the `langchain.vectorstores` import path is deprecated.
from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Update the path to your PDF file
loader = PyPDFLoader("/content/Legal Case Studies for RAG Demonstration.pdf")
data = loader.load()  # one Document per PDF page; metadata carries 'source' and 'page'

# Verify the data: show how many pages were loaded and a short preview of the
# first one instead of dumping the entire PDF text into the cell output.
if not data:
    raise ValueError("No pages were loaded from the PDF; check the file path.")
print(f"Loaded {len(data)} page(s)")
print(data[0].page_content[:500])


[Document(metadata={'source': '/content/Legal Case Studies for RAG Demonstration.pdf', 'page': 0}, page_content='Case Title: Smith vs. Johnson \nCase Number: 2025-CV-12345 \nJurisdictions Involved:  \n• California, USA \n• New York, USA \n• Ontario, Canada \nSummary: The case involves a dispute over intellectual property rights between two technology \ncompanies, Smith Tech and Johnson Innovations. The primary issues revolve around patent \ninfringement and trade secret misappropriation. \nKey Documents: \n1. Patent Documents: \no Patent No. US1234567B1: "Method for Data Encryption" \no Patent No. US2345678B2: "System for Secure Data Transmission" \n2. Legal Precedents: \no California: Case No. 2019-CV-98765, "Doe vs. ABC Corp. " \no New York: Case No. 2020-CV-54321, "XYZ Inc. vs. LMN Ltd. " \no Ontario: Case No. 2021-ON-67890, "PQR Ltd. vs. STU Inc. " \n3. Statutes: \no California Civil Code Section 3426: Trade Secrets \no New York General Business Law Section 360: Trademark Infringem

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks of at most 1000 characters.
# The chunks get their own name so they are not confused with the retrieved
# documents that a later cell stores in `docs`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
doc_chunks = text_splitter.split_documents(data)
if not doc_chunks:
    raise ValueError("The PDF produced no text chunks; there is nothing to index.")

# Stable, position-based ids make this cell safe to re-run: without explicit
# ids every run adds a fresh copy of each chunk to the same collection, and the
# retriever then returns the same chunk several times. With fixed ids a re-run
# overwrites the existing entries instead.
# NOTE(review): if the PDF later yields fewer chunks, entries with higher ids
# from an earlier run stay in the collection until the kernel is restarted.
chunk_ids = [f"chunk-{i}" for i in range(len(doc_chunks))]

vectorstore = Chroma.from_documents(
    documents=doc_chunks,
    embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"),
    ids=chunk_ids,
)
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={'k': 4},  # number of documents to retrieve
)

In [None]:
# Question: the user query; the same string is reused by the relevance grader
# and the answer-generation chain in later cells.
question = "List some cases only related to financial?"

# Retrieve docs: ask the retriever for the chunks most similar to the question.
# `docs` holds the retriever's result and is what the grading cell iterates over.
docs = retriever.invoke(question)
print(docs)

[Document(metadata={'page': 6, 'source': '/content/Legal Case Studies for RAG Demonstration.pdf'}, page_content='o Victoria: Case No. 2016-VIC-67890, "Clark vs. Davis" \no Quebec: Case No. 2017-QC-78901, "Evans vs. Foster" \n3. Statutes: \no New York Real Property Law Section 443: Disclosure Requirements \no Victoria Sale of Land Act 1962: Property Transactions \no Quebec Civil Code: Real Estate Transactions \nTimeline: \n• May 2024: Patel files a lawsuit against Singh in New York. \n• July 2024: Singh files a counterclaim in Victoria. \n• October 2024: Both parties agree to consolidate the cases and include Quebec'), Document(metadata={'page': 6, 'source': '/content/Legal Case Studies for RAG Demonstration.pdf'}, page_content='o Victoria: Case No. 2016-VIC-67890, "Clark vs. Davis" \no Quebec: Case No. 2017-QC-78901, "Evans vs. Foster" \n3. Statutes: \no New York Real Property Law Section 443: Disclosure Requirements \no Victoria Sale of Land Act 1962: Property Transactions \no Quebec 

In [None]:
# Check document relevancy
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import ChatGoogleGenerativeAI

# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # This class is passed to `llm.with_structured_output(...)` as the output
    # schema for the relevance grader.
    # NOTE(review): the docstring and the field description are presumably
    # forwarded to the model as part of that schema, so treat their wording as
    # prompt text rather than as ordinary documentation — confirm before editing.
    # The field is a plain `str`; nothing here restricts it to 'yes' / 'no',
    # that is only requested through the description and the grader prompt.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# Grader model: low temperature, output shaped by the GradeDocuments schema
llm = ChatGoogleGenerativeAI(temperature=0.1, model="gemini-1.5-flash")
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# System instructions for the relevance grader (deliberately lenient: the aim
# is only to drop clearly wrong retrievals)
system = """You are a grader assessing relevance of a retrieved document to a user question. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""

# Two-message chat prompt: the instructions above, then one document + the question
grade_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
])

# Pipeline: fill the prompt, then call the structured grader
retrieval_grader = grade_prompt | structured_llm_grader

# Filter out the non-relevant docs: grade each retrieved document against the
# question and keep only those the grader marks as relevant.
docs_to_use = []
for doc in docs:
    res = retrieval_grader.invoke({"question": question, "document": doc.page_content})
    # The score is text produced by the model, so compare it ignoring case and
    # surrounding whitespace. A missing result or missing score is treated as
    # "not relevant" instead of raising AttributeError and aborting the loop.
    score = (getattr(res, "binary_score", None) or "").strip().lower()
    if score == 'yes':
        docs_to_use.append(doc)
print(docs_to_use)

[Document(metadata={'page': 4, 'source': '/content/Legal Case Studies for RAG Demonstration.pdf'}, page_content='• Martinez: Claims that Davis was partially at fault and seeks damages for vehicle repairs. \nConclusion: The case is ongoing, with the next court hearing scheduled for April 2025. Both parties \nare preparing for a potential trial, with significant implications for personal injury law. \n \nCase Title: Lee vs. Kim \nCase Number: 2025-CV-34567 \nJurisdictions Involved:  \n• California, USA \n• New South Wales, Australia \n• Alberta, Canada \nSummary: The case involves a dispute over a business partnership between Lee and Kim. The \nprimary issues revolve around breach of fiduciary duty and misappropriation of funds. \nKey Documents: \n1. Partnership Agreement:'), Document(metadata={'page': 4, 'source': '/content/Legal Case Studies for RAG Demonstration.pdf'}, page_content='• Martinez: Claims that Davis was partially at fault and seeks damages for vehicle repairs. \nConclusio

In [None]:
# Generate Result
from langchain_core.output_parsers import StrOutputParser

# System instructions for the answer-generation step
# NOTE(review): the wording says "based upon your knowledge" and does not
# restrict the model to the retrieved documents, while a later cell grades the
# answer for groundedness in those documents — confirm this is intended.
system = """You are an assistant for question-answering tasks. Answer the question based upon your knowledge.
provide the case details like title,number,place and document name as source for cases separately.
provide Summary of the case
"""

# Chat prompt: instructions, then the formatted documents and the user question
prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
])

# Generation model (temperature 0)
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-1.5-flash")

def format_docs(docs):
    """Render documents as a single text block for insertion into a prompt.

    Each document contributes four parts: its ``title`` metadata (or
    'No Title' when the key is absent), its ``source`` metadata (or
    'No Source'), its page content, and a line of 50 dashes. The per-document
    sections are joined with newlines; an empty input gives an empty string.

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content``.

    Returns:
        The concatenated, human-readable text for all documents.
    """
    divider = '-' * 50
    return "\n".join(
        f"Case Title: {entry.metadata.get('title', 'No Title')}\n"
        f"Document Info: {entry.metadata.get('source', 'No Source')}\n"
        f"Content:\n{entry.page_content}\n{divider}"
        for entry in docs
    )
# Chain: fill the prompt, call the model, reduce the reply to a plain string
rag_chain = prompt | llm | StrOutputParser()

# Run the chain on the documents that passed the relevance filter
generation = rag_chain.invoke(
    {
        "question": question,
        "documents": format_docs(docs_to_use),
    }
)
print(generation)



Based on the provided text, only one case is clearly related to financial matters:

**Case Title:** Lee vs. Kim
**Case Number:** 2025-CV-34567
**Jurisdictions Involved:** California, USA; New South Wales, Australia; Alberta, Canada
**Document Name:** /content/Legal Case Studies for RAG Demonstration.pdf
**Summary:** This case involves a dispute over a business partnership between Lee and Kim, focusing on breach of fiduciary duty and misappropriation of funds.  These are both financial issues.


In [None]:
# Check for Hallucinations
# Data model
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in 'generation' answer."""

    # This class is passed to `llm.with_structured_output(...)` as the output
    # schema for the hallucination grader.
    # `...` as the first Field argument marks the field as required.
    # The field is a plain `str`: the two expected values are only named in the
    # description, and a later cell compares the result against the exact
    # string 'No Hallucination'.
    # NOTE(review): the docstring and description are presumably sent to the
    # model as part of the schema — confirm before rewording them.
    binary_score: str = Field(
        ...,
        description="Answer is grounded in the facts, 'No Hallucination' or 'Yes the response is Hallucinated'"
    )

# Grader model: low temperature, output shaped by the GradeHallucinations schema
llm = ChatGoogleGenerativeAI(temperature=0.1, model="gemini-1.5-flash")
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# System instructions for the hallucination grader
# NOTE(review): the text asks for a "Hallucination_score" while the schema
# field is named `binary_score`, and it contains typos ("hallunicated",
# "facts.if"); left unchanged here because it is prompt text sent to the model.
system = """You are a hallucination grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts.if llm generation is partially incorrect say yes hallunicated \n
    Give Hallucination_score 'No Hallucination' or 'Yes the response is Hallucinated'. 'No Hallucination' means that the answer is grounded in / supported by the set of facts."""

# Chat prompt: instructions, then the supporting facts and the answer under test
hallucination_prompt = ChatPromptTemplate.from_messages([
    ("system", system),
    ("human", "Set of facts: \n\n <facts>{documents}</facts> \n\n LLM generation: <generation>{generation}</generation>"),
])

# Pipeline: fill the prompt, then call the structured grader
hallucination_grader = hallucination_prompt | structured_llm_grader

# Grade the generated answer against the documents that were given to the model
response = hallucination_grader.invoke(
    {
        "generation": generation,
        "documents": format_docs(docs_to_use),
    }
)
print(response)

binary_score='No Hallucination'


In [None]:
from typing import List
from langchain_core.prompts import PromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

# Model used for the document-lookup explanation step
llm = ChatGoogleGenerativeAI(temperature=0.1, model="gemini-1.5-flash")

# Single-string template: asks the model to explain which case matches the
# question and how the supplied documents support the generated answer.
# Placeholders: {documents}, {question}, {generation}.
system = """You are an advanced assistant for document search and retrieval. You are provided with the following:
1. A question.
2. A generated answer based on the question.
3. A set of documents that were referenced in generating the answer.

first explain about the case which is related to question only mention about case which matches high clearly in structured way
second mention how many extracted segments from the documents are used .
third mention how many documents you have used to generate response
last how much does it match with user query


Ensure that:
- (Important) Each segment is an exact match to a part of the document and is fully contained within the document text.
- The relevance of each segment to the generated answer is clear and directly supports the answer provided.
- (Important) If you didn't use the specific document don't mention it.

Used documents: <docs>{documents}</docs> \n\n User question: <question>{question}</question> \n\n Generated answer: <answer>{generation}</answer>
"""

# Wrap the template and declare the three variables it expects
prompt = PromptTemplate(
    input_variables=["documents", "question", "generation"],
    template=system,
)

# Chain: fill the template, then call the model (returns a message object)
doc_lookup = prompt | llm

# Format the graded documents once and reuse the text for both model calls.
documents_text = format_docs(docs_to_use)

# Run the hallucination check
response = hallucination_grader.invoke({"documents": documents_text, "generation": generation})
print(response)

# Use the hallucination check result.
# The verdict is text produced by the model, so compare it ignoring case and
# surrounding whitespace; a missing result or score counts as "not verified"
# rather than raising AttributeError.
verdict = (getattr(response, "binary_score", None) or "").strip().lower()
if verdict == 'no hallucination':
    print("The generated answer is reliable.")
else:
    print("The generated answer may contain hallucinations.")

# The document lookup is produced for either verdict, so it is issued once
# here instead of being duplicated in both branches.
lookup_response = doc_lookup.invoke({"documents": documents_text, "question": question, "generation": generation})
print(lookup_response.content)

binary_score='No Hallucination'
The generated answer is reliable.
Here's a breakdown of the answer generation process:

**1. Case Summary:**

The answer focuses on the "Lee vs. Kim" case.  This case centers on a dispute between business partners involving "breach of fiduciary duty and misappropriation of funds."  These are clearly financial issues, making it the only case from the provided documents that directly relates to financial matters.

**2. Number of Extracted Segments:**

Three segments were extracted from the documents to generate the answer:

* "Case Title: Lee vs. Kim"
* "Case Number: 2025-CV-34567"
* "Summary: The case involves a dispute over a business partnership between Lee and Kim. The primary issues revolve around breach of fiduciary duty and misappropriation of funds."

**3. Number of Documents Used:**

Only one document was used to generate the response:  `/content/Legal Case Studies for RAG Demonstration.pdf`.

**4. Match with User Query:**

The answer has a high m