In [None]:
!pip install transformers langchain langchain_community langchain-openai
!pip install -U transformers accelerate
!pip install tiktoken
!pip install pinecone
!pip install langchain-pinecone

In [None]:
from pydantic import BaseModel, Field
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import json
from sentence_transformers import SentenceTransformer, SimilarityFunction
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

In [6]:
from google.colab import userdata
import os
from langchain_openai import ChatOpenAI

hf_token = userdata.get('HF_TOKEN')

if hf_token is None:
    raise ValueError("Hugging Face token not found. Please add it to Colab Secrets.")

from huggingface_hub import login


login(token=hf_token)

os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=userdata.get('LG_SMITH')
os.environ['LANGSMITH_PROJECT']="RAG"
os.environ["OPENAI_API_KEY"]=str(userdata.get('OPEN_AI')).strip()


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model_embed = SentenceTransformer("nomic-ai/modernbert-embed-base", similarity_fn_name=SimilarityFunction.COSINE)

In [8]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Query Translation(Multi-Query)


Considering the project's specifics and target audience, we assume that potential users might lack legal education. Their questions may be poorly structured or ambiguous, making accurate semantic search challenging. To address this, we generate multiple refined sub-queries based on the user's initial query. This approach ensures better semantic coverage, resolves ambiguities, and improves the retrieval of relevant legal information.



In [9]:
from typing import List

from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate three
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the cosine-distance-based similarity search.
    Provide these alternative questions + original question separated by newlines.Do not lable
    alternative or original question with any text.
    Original question: {question}""",
)
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)


MultyQueryGen = QUERY_PROMPT | llm | output_parser


question = "The students shall be informed, in an appropriate manner, of the exact examination regulations"

MultyQueryGen.invoke({"question": question})

['The students need to be notified of the specific examination rules in a suitable way.',
 'How can the students be properly informed about the exact examination regulations?',
 'What is the appropriate method to inform the students about the exact examination regulations?']

# Query Translation(Decomposition)

In contrast to the scenario addressed in the previous method, this approach considers cases where the user's query is a complex, hierarchically structured question. In such instances, it makes sense to decompose the query into its constituent parts to ensure broader contextual coverage and retrieve the most relevant information for each sub-question.

In [10]:
from langchain.prompts import ChatPromptTemplate


template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question}. Provide these sub-questions + original question separated by newlines.\n
Output (up to 3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)




llm = ChatOpenAI(model="gpt-3.5-turbo-0125",temperature=0)


generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))


question = "If someone is accused of a crime, but they believe the evidence used against them was obtained unfairly, what can they do to challenge it?"
questions = generate_queries_decomposition.invoke({"question":question})
print(questions)


['- What are the legal procedures for challenging evidence obtained unfairly in a criminal case?', '- Are there specific laws or regulations that protect individuals from unfair evidence in court?', '- How can a defense attorney argue against the admissibility of unfairly obtained evidence in a trial? ', '', 'If someone is accused of a crime, but they believe the evidence used against them was obtained unfairly, what can they do to challenge it?']


# Retriever


For the retriever, we utilize a vector database search, using cosine distance as the metric to measure similarity between the query and stored document embeddings.

In [35]:
import pinecone
import langchain_pinecone





from langchain_pinecone import PineconeVectorStore

from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Pinecone
from langchain.embeddings.base import Embeddings




pinecone_client = pinecone.Pinecone(api_key=userdata.get('PINE'))


hf_model = model_embed


class HuggingFaceEmbeddings(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.model.encode(text).tolist()

hf_embeddings = HuggingFaceEmbeddings(hf_model)


index_name = "rag-data-paragraphs"





index = pinecone_client.Index(index_name)

class Retriever:
    def __init__(self, index, embedder):
        self.index = index
        self.embedder = embedder

    def get_relevant_documents(self, query):
        query_vector = self.embedder.embed_query(query)

        response = self.index.query(
            vector=query_vector,
            top_k=5,
            include_metadata=True
        )

        results = []
        for match in response["matches"]:
            cosine_similarity = match['score']
            cosine_distance = 1 - cosine_similarity
            results.append({
                "text": match['metadata']['Text'],
                "metadata": match['metadata'],
                "cosine_similarity": cosine_similarity,
                "cosine_distance": cosine_distance
            })
        return results

retriever = Retriever(index, hf_embeddings)

In [24]:
def multiple_retrieve(question,sub_question_generator_chain):
    """Retreive on each sub-question/alternative-question"""

    sub_questions = sub_question_generator_chain.invoke({"question":question})

    retreive_results = []

    for sub_question in sub_questions:

        retrieved_docs = retriever.get_relevant_documents(sub_question)
        print(f'Sub-Question:{sub_question}, Retrieved Documents: {len(retrieved_docs)}')
        for doc in retrieved_docs:
          if doc not in retreive_results:
              retreive_results.append(doc)
    print("Retrieved Unique Documents: ",len(retreive_results))
    return retreive_results

#Ranking

In [55]:
def rank_docs(documents):
    """Rank the documents based on their cosine-similarity to the query."""
    docs = [doc for doc in documents if doc['cosine_similarity'] >= 0.75]
    print("Ranked Documents: ",len(docs))
    return sorted(documents, key=lambda x: x['cosine_similarity'], reverse=True)

# Strucuted Grade of Retrieval





For retrieving and grading a large number of documents

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain import hub


class GradeDocuments(BaseModel):

    binary_score: str = Field(
        description="Are documents relevant to the question, 'yes' or 'no'"
    )

grade_prompt = hub.pull("efriis/self-rag-retrieval-grader")


llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

retrieval_grader = grade_prompt | structured_llm_grader

def GradeDocs(query, retreive_results):
    """Grade the retrieved documents and return formatted text"""

    retreive_results_useful = []
    for doc in retreive_results:
        grade = retrieval_grader.invoke({"question": query, "document": doc['text']})
        if grade.binary_score == "yes":
            retreive_results_useful.append(doc)


    return retreive_results_useful


The re-ranking process utilizes a large language model (LLM) as an AI agent to refine the initial retrieval results. The AI agent evaluates the contextual relevance of each document in relation to the query, leveraging its advanced understanding of semantics and linguistic patterns.

In [56]:
from langchain.output_parsers import RegexParser

class RelevanceScore(BaseModel):
    relevance_score: float = Field(
        ge=0.0, le=1.0,
        description="Relevance score of the document to the question. 0 means completely irrelevant, 1 means completely relevant."
    )


rerank_prompt = PromptTemplate.from_template("""
You are a professional document relevance grader. For each document provided, assign a relevance score between 0 and 1.
The score should reflect how well the document answers the provided question:
- 0 means completely irrelevant.
- 1 means highly relevant.

Question: {question}

Document: {document}

Relevance Score (0-1):
""")

llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_reranker = llm.with_structured_output(RelevanceScore, method="function_calling")

output_parser = RegexParser(
    regex=r"Relevance Score \(0-1\): (.*)",
    output_keys=["relevance_score"],
    default_output_key="relevance_score"
)

retrieval_ranker = rerank_prompt | structured_llm_reranker


def rerank_docs(documents, query):
    """Rank the documents based on their relevance to the query."""

    ranked_docs = []
    for doc in documents:
        grade = retrieval_ranker.invoke({"question": query, "document": doc['text']})
        doc['relevance_score'] = grade.relevance_score
        ranked_docs.append(doc)
    return sorted(ranked_docs, key=lambda x: x['relevance_score'], reverse=True)

# Generation

The generation process leverages techniques such as prompt engineering and few-shot learning (providing a few examples in the prompt) to guide the model in producing accurate and contextually relevant outputs.

In [61]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser



prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a professional legal assistant tasked with providing clear, accurate, and detailed answers to legal questions. Your goal is to directly address the query by including specific details (such as numbers, dates, or examples) and providing a concise explanation to help non-experts understand the context.

### Task:
1. **Direct Answer**: Start by providing a precise and concise answer to the question, including all relevant details, such as numerical values, dates, or other specifics from the legal context.
2. **Explanation**: Follow the answer with a clear explanation in simple terms, avoiding legal jargon. Use examples or analogies if necessary to make the information relatable and easier to understand.

### Important Notes:
- Always prioritize specific details if they are present in the provided context.
- Avoid overly general or vague responses.
- Ensure the explanation directly relates to the user's query and provides actionable or practical insights.

Example:

1. Query: Can I break my lease early if I lose my job?
   Legal Context: Tenancy agreements often include clauses about early termination. In many jurisdictions, tenants can break a lease early if they provide proper notice and a valid reason, such as job loss. However, penalties or fees may apply unless specified otherwise in the contract.
   Output:
   - Answer: You may be able to break your lease early if you lose your job, but fees or penalties might apply depending on your contract.
   - Explanation: Check your lease agreement for an early termination clause. Many agreements allow breaking the lease if you provide written notice, but you might have to pay a penalty or forfeit your deposit. For example, if you have six months left on your lease, your landlord might require payment for one or two months as a penalty.

2.  Query: What should I do if my employer doesn’t pay me on time?
   Legal Context: Labor laws typically require employers to pay employees on the agreed-upon schedule. Late payments may be a violation of these laws, and employees can file a complaint with the local labor board or seek legal assistance to recover unpaid wages.
   Output:
   - Answer: If your employer doesn’t pay you on time, you can file a complaint with the labor board or pursue legal action to recover unpaid wages.
   - Explanation: Employers are legally required to pay employees on time. If your payment is late, start by contacting your employer to resolve the issue informally. If that doesn’t work, you can file a formal complaint with the labor board. For instance, if your paycheck is delayed by more than a week, you could report this as a violation of labor laws to protect your rights.

Now complete the task for the following input:

Query: {query}
Legal Context: {context}

Output:
- Answer: <Provide a concise and direct answer to the query.>
- Explanation: <If needed, explain the content in simple terms, addressing the query and making it relatable for a non-expert audience.> """)

llm = ChatOpenAI(model_name="gpt-4", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

query = "What are the working hour limits for employees residing in their employer's household, and how do they differ based on age and year of enforcement?"    #Query
print(f'Query: {query} \n')
retreived_docs = multiple_retrieve(query,generate_queries_decomposition) #Query Translation
ranked_docs = rank_docs(retreived_docs) #Ranking
reranked_docs = rerank_docs(ranked_docs,query) #Re-ranking
context = ''
# ensures that the length of content fits in model's content window
for doc in reranked_docs:
  if len(word_tokenize(context)) + len(word_tokenize(doc['text'])) < 3000:
    context = context + doc['metadata']['ActName'] + " " + doc['metadata']['Section'] + " " + doc['metadata']['Paragraph'] + doc['text'] + "\n\n"
  else:
    break
print("Length of context:", len(word_tokenize(context)))
generation = rag_chain.invoke({"context": context, "query": query})
print(generation)

Query: What are the working hour limits for employees residing in their employer's household, and how do they differ based on age and year of enforcement? 

Sub-Question:- What are the current working hour limits for employees residing in their employer's household?, Retrieved Documents: 5
Sub-Question:- How do working hour limits vary based on the age of employees residing in their employer's household?, Retrieved Documents: 5
Sub-Question:- How have working hour limits for employees residing in their employer's household changed over the years in terms of enforcement?, Retrieved Documents: 5
Retrieved Unique Documents:  15
Ranked Documents:  3
Length of context: 2072
- Answer: For employees residing in their employer's household, the working hours within two calendar weeks should not exceed 106 hours for those under 18 years of age and 116 hours for those 18 and older, as of 5 January 1970. These limits were reduced to 104 hours and 114 hours respectively from 3 January 1972, and fur

# Hallucination Detection

In [63]:
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

hallucination_prompt = PromptTemplate(
    input_variables=["documents", "response"],
    template="""You are an assistant that evaluates whether a given response is supported by the provided documents.

Task:
- Carefully analyze the response and compare it with the provided documents.
- Determine if all the claims made in the response are explicitly supported by the content of the documents.
- If there is any part of the response that is not directly supported by the documents, the answer should be "no".
- If every claim in the response is backed by the documents, the answer should be "yes".

Context:
Documents: {documents}

Response to Evaluate: {response}

Output:
Answer (yes/no): <Answer>"""
)

hallucination_grader = hallucination_prompt | structured_llm_grader
print(generation)
print(hallucination_grader.invoke({"documents": useful_docs, "response": generation}))




- Answer: For employees residing in their employer's household, the working hours within two calendar weeks should not exceed 106 hours for those under 18 years of age and 116 hours for those 18 and older, as of 5 January 1970. These limits were reduced to 104 hours and 114 hours respectively from 3 January 1972, and further reduced to 100 hours and 110 hours from 6 January 1975. 

- Explanation: This means that if you are an employee living in your employer's household, the maximum amount of time you can be asked to work within a two-week period depends on your age and the year. For example, if you were under 18 and working in 1970, you could be asked to work up to 106 hours in two weeks. But if you were the same age and working in 1975, you could only be asked to work up to 100 hours in two weeks. The same reductions apply if you are 18 or older, but the maximum hours are slightly higher. These limits include the time you need to be available for work, not just the time you are activ