In [None]:
!pip install transformers langchain langchain_community langchain-openai
!pip install -U transformers accelerate
!pip install tiktoken
!pip install pinecone
!pip install langchain-pinecone

In [42]:
from pydantic import BaseModel, Field
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
import torch
import json
from sentence_transformers import SentenceTransformer, SimilarityFunction

In [43]:
from google.colab import userdata
import os
from langchain_openai import ChatOpenAI

hf_token = userdata.get('HF_TOKEN')

if hf_token is None:
    raise ValueError("Hugging Face token not found. Please add it to Colab Secrets.")

from huggingface_hub import login


login(token=hf_token)

os.environ["LANGSMITH_TRACING"]="true"
os.environ["LANGSMITH_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGSMITH_API_KEY"]=userdata.get('LG_SMITH')
os.environ['LANGSMITH_PROJECT']="RAG"
os.environ["OPENAI_API_KEY"]=str(userdata.get('OPEN_AI')).strip()


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model_embed = SentenceTransformer("nomic-ai/modernbert-embed-base", similarity_fn_name=SimilarityFunction.COSINE)

In [45]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Query Translation(Multi-Query)


In [84]:
from typing import List

from langchain_core.output_parsers import BaseOutputParser
from pydantic import BaseModel, Field

class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser for a list of lines."""

    def parse(self, text: str) -> List[str]:
        lines = text.strip().split("\n")
        return list(filter(None, lines))


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate three
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the cosine-distance-based similarity search.
    Provide these alternative questions + original question separated by newlines.Do not lable
    alternative or original question with any text.
    Original question: {question}""",
)
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)


MultyQueryGen = QUERY_PROMPT | llm | output_parser


question = "The students shall be informed, in an appropriate manner, of the exact examination regulations"

MultyQueryGen.invoke({"question": question})

['The students need to be notified of the specific examination rules in a suitable way.',
 'Can the students be adequately informed about the precise examination regulations?',
 'How can the students be informed effectively about the exact examination regulations?']

# Query Translation(Decomposition)

In [47]:
from langchain.prompts import ChatPromptTemplate


template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question}. Provide these sub-questions + original question separated by newlines.\n
Output (up to 3 queries):"""
prompt_decomposition = ChatPromptTemplate.from_template(template)




llm = ChatOpenAI(model="gpt-3.5-turbo-0125",temperature=0)


generate_queries_decomposition = ( prompt_decomposition | llm | StrOutputParser() | (lambda x: x.split("\n")))


question = "If someone is accused of a crime, but they believe the evidence used against them was obtained unfairly, what can they do to challenge it?"
questions = generate_queries_decomposition.invoke({"question":question})
print(questions)


['1. What are the legal procedures for challenging evidence obtained unfairly in a criminal case?', '2. Are there specific laws or precedents that protect individuals from unfair evidence in criminal proceedings?', '3. How can a defense attorney argue for the exclusion of unfairly obtained evidence in court? ', '', 'If someone is accused of a crime, but they believe the evidence used against them was obtained unfairly, what can they do to challenge it?']


# Retriever


In [48]:
import pinecone
import langchain_pinecone




index_name = "rag-data"
from langchain_pinecone import PineconeVectorStore

from sentence_transformers import SentenceTransformer
from langchain.vectorstores import Pinecone
from langchain.embeddings.base import Embeddings




pinecone_client = pinecone.Pinecone(api_key=userdata.get('PINE'))


hf_model = model_embed


class HuggingFaceEmbeddings(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Generate embeddings for a list of documents."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Generate embedding for a single query."""
        return self.model.encode(text).tolist()

hf_embeddings = HuggingFaceEmbeddings(hf_model)


index_name = "rag-data"



index = pinecone_client.Index(index_name)

retriever= Pinecone(
    embedding=hf_embeddings,
    index=index,
    text_key="Text",
).as_retriever()


In [63]:
def multiple_retrieve(question,sub_question_generator_chain):
    """Retreive on each sub-question/alternative-question"""

    # Generate sub-questions/alternative questions
    sub_questions = sub_question_generator_chain.invoke({"question":question})

    # Initialize a list to hold RAG chain results
    retreive_results = []

    for sub_question in sub_questions:

        # Retrieve documents for each sub-question
        retrieved_docs = retriever.get_relevant_documents(sub_question)


        retreive_results.append((sub_question, retrieved_docs))

    return retreive_results

# Strucuted Grade of Retrieval

In [77]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain import hub


class GradeDocuments(BaseModel):

    binary_score: str = Field(
        description="Are documents relevant to the question, 'yes' or 'no'"
    )

grade_prompt = hub.pull("efriis/self-rag-retrieval-grader")


llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
structured_llm_grader = llm.with_structured_output(GradeDocuments)

retrieval_grader = grade_prompt | structured_llm_grader

def GradeDocs(retreive_results):
    """Grade the retrieved documents and return formatted text"""

    retreive_results_useful = []

    formatted_result = ""

    for sub_question, retrieved_docs in retreive_results:
        print(f"Processing Sub Query: {sub_question}")
        for doc in retrieved_docs:
            grade = retrieval_grader.invoke({"question": sub_question, "document": doc.page_content})
            if grade.binary_score == "yes":
                retreive_results_useful.append((sub_question, retrieved_docs))
                formatted_result += f"Sub Query: {sub_question}\n"
                formatted_result += "Retrieved Documents:\n"
                for doc_for_formatting in retrieved_docs:
                  formatted_result += f"- {doc_for_formatting.page_content}\n"
                formatted_result += "\n"

    return formatted_result




In [85]:
# Test the retrieval grader
question = "damage caused by a nuclear event?"
docs = retriever.invoke(question)
doc_txt = docs[0].page_content
print(doc_txt)
print(retrieval_grader.invoke({"question": question, "document": doc_txt}))

References:
Text:This Federal Act shall not apply to damage caused by a nuclear event covered by an international convention ratified by EFTA states and EC Member States.
Provision for Coverage
binary_score='yes'


# Meta-Data Augmentation(does not show expected effectiveness)


In [52]:
# from langchain.prompts import ChatPromptTemplate


# def format_references_context(documents):

#     formatted_list = []

#     for doc in documents:
#         act_name = doc.metadata.get("ActName", "Unknown Act")
#         page_content = doc.page_content

#         formatted_text = f"Act Name: {act_name}\nContent: {page_content}"
#         formatted_list.append(formatted_text)

#     return formatted_list

# template = """
# You are a helpful assistant that generates a single, comprehensive search query based on the references mentioned in the text.
# Your goal is to create a unified and contextually relevant search query that captures how all the references are mentioned and described in the text.

# Context:
# {document}

# Instructions:
# - Identify and extract all references mentioned in the text.
# - Analyze the context around each reference to understand its description, purpose, or significance.
# - Combine the extracted information about all references into one detailed and semantically meaningful search query.
# - Do not return the original text

# Output:
# Unified Search Query: <Generated query that includes all references and their contextual descriptions>
# """

# prompt_refs = ChatPromptTemplate.from_template(template)




# llm = ChatOpenAI(temperature=0)


# # Use prompt_refs instead of prompt_decomposition
# retreive_references = ( prompt_refs | llm | StrOutputParser()|retriever.get_relevant_documents|format_references_context)


# document = """References:Para 1 Para 4 Para 2 Para 3
# Text:. The Federal Minister for European and International Affairs shall inform the National and Federal Councils each half-year about the projects of the European Union, as announced by the respective competent Federal Minister, on which the Council is expected to embark on deliberations within the following six months, if these projects
# 	1.	result in a modification of the contractual bases of the European Union or
# 	2.	are subject to a special right of participation of the National and Federal Councils under Art. 23i and Art. 23j Federal Constitutional Law or

# 	3.	are subject to a special information duty under Art. 23e para 2 Federal Constitutional Law or

# 	4.	are decisions to extend the competences under Art. 82 para 2 lit. d TFEU, Art. 83 para 1 sub-para 3 TFEU and Art. 86 para 4 TFEU or
# 	5.	are aimed at establishing enhanced co¬operation under Art. 20 TEU or
# 	6.	concern negotiating mandates for the Commission with regard to international treaties or
# 	7.	negotiating guidelines for the Commission within the framework of the common commercial policy or
# 	8.	are of special importance for the Republic of Austria.
# Written Information"""
# # questions = generate_query_for_reference.invoke({"document": document})
# # print(questions)


# def MetaDataAugmentation(retreive_results):
#     """Second order retreive based on metadata"""
#     retreive_results_usefull = []
#     for document in retreive_results:
#         retreive_meta_data = retreive_references.invoke({"document": document})
#         retreive_meta_data = []

#         result_text = "possible references context:\n"
#         for i in range(len(retreive_meta_data)):
#           if i == 0:
#             continue
#           else:
#             if retrieval_grader.invoke({"question": sub_question, "documents": retrieved_docs})
#             result_text += f"*{retreive_meta_data[i]}*\n"


#     return retreive_results
# print(retreive_references.invoke(document))

# Generation

In [81]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser


prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are an assistant tasked with summarizing legal information and explaining it in simple and clear terms for people without specialized legal education.

Task:
1. Provide a brief summary of the legal content.
2. Use clear and accessible language, avoiding legal jargon wherever possible.
3. Include practical examples or analogies if necessary to help the audience understand.
4. Focus on the key points and their implications for an ordinary person.

Example:

1. Query: What does § 12 cover?
   Legal Context: § 12 discusses taxation policies. It specifies that these policies apply to auctions and works of art. The section outlines the requirements for reporting and calculating taxes during these transactions.
   Output:
   - Summary: § 12 focuses on taxation policies and explains how they apply to auctions and works of art.
   - Explanation: This section of the law describes how taxes should be calculated and reported for auctions and the sale of works of art. For example, if someone sells a painting at an auction, the seller needs to ensure that the taxes are calculated and reported correctly. This ensures transparency and helps prevent disputes about taxation.



Now complete the task for the following input:

Query: {query}
Legal Context: {context}

Output:
- Summary: <Provide a brief and clear summary of the legal content.>
- Explanation: <Explain the content in simple terms, addressing the query and making it relatable for a non-expert audience.> """)

llm = ChatOpenAI(model_name="gpt-3.5-turbo-0125", temperature=0)

rag_chain = prompt | llm | StrOutputParser()

query = "damage caused by a nuclear event?"    #Query
retreived_docs = multiple_retrieve(query,MultyQueryGen) #Query Translation
useful_docs = GradeDocs(retreived_docs)     #Document Grading


generation = rag_chain.invoke({"context": useful_docs, "query": query})
print(generation)

Processing Sub Query: What are the consequences of a nuclear incident?
Processing Sub Query: Effects of a nuclear event?
Processing Sub Query: What kind of harm can be attributed to a nuclear disaster?
- Summary: The legal provision states that damage caused by a nuclear event covered by an international convention ratified by EFTA states and EC Member States is not covered by this Federal Act.
- Explanation: This means that if a nuclear event occurs and causes damage, the rules and regulations outlined in this Federal Act do not apply if the event is covered by an international convention ratified by certain countries. For example, if a nuclear accident happens and is governed by an international agreement, the legal framework in this Federal Act may not be relevant in determining liability or compensation for the damage caused.


# Hallucination Detection

In [83]:
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    binary_score: str = Field(
        description="Answer is grounded in the facts, 'yes' or 'no'"
    )

hallucination_prompt = PromptTemplate(
    input_variables=["documents", "response"],
    template="""You are an assistant that evaluates whether a given response is supported by the provided documents.

Task:
- Carefully analyze the response and compare it with the provided documents.
- Determine if all the claims made in the response are explicitly supported by the content of the documents.
- If there is any part of the response that is not directly supported by the documents, the answer should be "no".
- If every claim in the response is backed by the documents, the answer should be "yes".

Context:
Documents: {documents}

Response to Evaluate: {response}

Output:
Answer (yes/no): <Answer>"""
)

hallucination_grader = hallucination_prompt | structured_llm_grader
print(generation)
print(hallucination_grader.invoke({"documents": useful_docs, "response": generation}))




- Summary: The legal provision states that damage caused by a nuclear event covered by an international convention ratified by EFTA states and EC Member States is not covered by this Federal Act.
- Explanation: This means that if a nuclear event occurs and causes damage, the rules and regulations outlined in this Federal Act do not apply if the event is covered by an international convention ratified by certain countries. For example, if a nuclear accident happens and is governed by an international agreement, the legal framework in this Federal Act may not be relevant in determining liability or compensation for the damage caused.
binary_score='yes'
