<a href="https://colab.research.google.com/github/SunnySze0000/TEMG4950N/blob/main/TEMG4950_W4_RAG_pipeline_20954788.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the LangChain / LangGraph stack plus the PDF and OCR helpers used below
# (-q: quiet output, -U: upgrade already-installed packages).
# NOTE(review): no versions are pinned, so a fresh run may resolve to newer releases
# than the ones this notebook was written against — consider pinning.
%pip install -qU langchain langchain_community langchain_chroma langchain_together pypdf rapidocr-onnxruntime langgraph

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.9/14.9 MB[0m [31m84.7 MB/s[0m eta [36m0:00:00

In [None]:
import os
from getpass import getpass

# Credentials must never be committed to the notebook. Read the key from the
# environment and only prompt for it (without echoing) when it is not set.
# NOTE(review): a literal key was previously stored in this cell and is part of
# the file history — it should be revoked and replaced.
if not os.environ.get("TOGETHER_AI_API_KEY"):
    os.environ["TOGETHER_AI_API_KEY"] = getpass("Together AI API key: ")

In [None]:
from langchain_together import ChatTogether,TogetherEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import BaseOutputParser, JsonOutputParser, StrOutputParser
from langchain.retrievers.multi_query import MultiQueryRetriever

# **Preparing data (Load / Split / Embed)**

In [None]:
# LLM
# Settings for the single chat model shared by every chain in this notebook
# (query expansion, answer generation, the graders and the question re-writer).
_llm_settings = {
    "api_key": os.environ["TOGETHER_AI_API_KEY"],
    "model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "temperature": 0.2,
    "max_retries": 2,
    # No explicit cap on response length or request duration is passed.
    "max_tokens": None,
    "timeout": None,
}
llm = ChatTogether(**_llm_settings)

In [None]:
# LOADING
# Use `load()` rather than `load_and_split()`: the latter runs a default text
# splitter over the pages first, so the explicit splitter below would re-split
# already-split chunks and `start_index` would be relative to those intermediate
# chunks instead of the loaded page.
loader = PyPDFLoader("2309.01105v2.pdf", extract_images=True)
pages = loader.load()

# SPLITTING
# 1000-character chunks with a 200-character overlap; `add_start_index=True`
# records each chunk's offset within its source document in the metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
all_splits = text_splitter.split_documents(pages)

# EMBEDDING
embeddings = TogetherEmbeddings(
    model="togethercomputer/m2-bert-80M-8k-retrieval",
    api_key=os.environ["TOGETHER_AI_API_KEY"]
)
# NOTE(review): no persist directory or collection name is given — confirm that
# re-running this cell in the same kernel does not add the chunks a second time.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embeddings)

# **Set up Retriever / Generator / Graders**

In [None]:
# RETRIEVER
from typing import List
class LineListOutputParser(BaseOutputParser[List[str]]):
    """Output parser that turns the model's text into a list of non-blank lines."""

    def parse(self, text: str) -> List[str]:
        """Split ``text`` into lines, trimming each one and dropping blank lines.

        Lines made up only of whitespace are discarded so they are not passed on
        as empty entries; ``splitlines`` also copes with ``\\r\\n`` line endings.
        """
        trimmed = (line.strip() for line in text.splitlines())
        return [line for line in trimmed if line]

# Prompt that asks the model for five re-phrasings of the user question, one per line.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# question -> model -> list of alternative questions (one per output line)
retrieving_chain = QUERY_PROMPT | llm | LineListOutputParser()

# Each generated question is run against the vector store (MMR search, k=10).
# `parser_key` is intentionally not passed: it is a deprecated field that the
# retriever no longer uses now that `llm_chain` already ends in an output parser.
retriever = MultiQueryRetriever(
    retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 10}),
    llm_chain = retrieving_chain,
)

In [None]:
# GENERATOR
# Prompt for the answering step. Inputs: `question` (user question) and
# `document` (the retrieved context). The wording fixes a stray double quote and
# several typos that were previously sent to the model verbatim.
GENERATOR_PROMPT = PromptTemplate(
    input_variables=["question", "document"],
    template="""You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context to answer the question.
    If the context is not related to the question, just say that you don't know.
    Use three sentences maximum and keep the answer concise.
    Here is the user question: {question}
    \n\n
    Here is the context:
    {document}
    """,
)

# prompt -> model -> plain answer string
generating_chain = GENERATOR_PROMPT | llm | StrOutputParser()

In [None]:
# RETRIEVAL_GRADER
# Prompt asking the model for a lenient yes/no relevance verdict on one retrieved
# document, returned as JSON with a single `score` key. The wording fixes the
# "premable" typo and the duplicated "score" in the instruction.
GRADER_PROMPT = PromptTemplate(
    input_variables=["question", "document"],
    template="""You are a grader assessing relevance
    of a retrieved document to a user question. If the document contains keywords related to the user question,
    grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
    Give a binary score 'yes' or 'no' to indicate whether the document is relevant to the question.
    Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
    Here is the retrieved document: {document}
    Here is the user question: {question}
    """,
)

# prompt -> model -> parsed JSON (callers read the `score` key)
grading_chain = GRADER_PROMPT | llm | JsonOutputParser()

In [None]:
# HALLUCINATION_GRADER
# Instruction text: is the answer (`generation`) supported by the facts
# (`documents`)? The model is told to reply with JSON holding one `score` key.
_hallucination_template = """You are a grader assessing whether
    an answer is grounded in / supported by a set of facts. Give a binary 'yes' or 'no' score to indicate
    whether the answer is grounded in / supported by a set of facts. Provide the binary score as a JSON with a
    single key 'score' and no preamble or explanation.
    Here are the facts:
    \n ------- \n
    {documents}
    \n ------- \n
    Here is the answer: {generation}"""

prompt = PromptTemplate(
    input_variables=["generation", "documents"],
    template=_hallucination_template,
)

# prompt -> model -> parsed JSON verdict
hallucination_grader = (prompt | llm) | JsonOutputParser()

In [None]:
# ANSWER_GRADER
# Instruction text: does the answer (`generation`) help resolve the `question`?
# The model is told to reply with JSON holding one `score` key.
_answer_template = """You are a grader assessing whether an
    answer is useful to resolve a question. Give a binary score 'yes' or 'no' to indicate whether the answer is
    useful to resolve a question. Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
    Here is the answer:
    \n ------- \n
    {generation}
    \n ------- \n
    Here is the question: {question}"""

prompt = PromptTemplate(
    input_variables=["generation", "question"],
    template=_answer_template,
)

# prompt -> model -> parsed JSON verdict
answer_grader = (prompt | llm) | JsonOutputParser()

In [None]:
# QUESTION_RE-WRITER
# Prompt that asks the model to rephrase the question for vector-store retrieval.
# The template only uses `{question}`, so that is the only declared input
# variable; the instruction text also fixes two missing words.
re_write_prompt = PromptTemplate(
    template="""You are a question re-writer that converts an input question to a better version that is optimized \n
     for vectorstore retrieval. Look at the initial question and formulate an improved question. \n
     Here is the initial question: \n\n {question}. Improved question with no preamble: \n """,
    input_variables=["question"],
)

# prompt -> model -> re-written question as a plain string
question_rewriter = re_write_prompt | llm | StrOutputParser()

# **Create Nodes and Workflow**

In [None]:
from pprint import pprint
from typing import List
from langchain_core.documents import Document
from typing_extensions import TypedDict
from langgraph.graph import END, StateGraph, START

class GraphState(TypedDict):
    """State passed between the nodes of the workflow graph."""

    # Current question; `transform_query` replaces it with a re-written version.
    question: str
    # Answer text produced by `generate`.
    generation: str
    # Not read or written by any node in this notebook.
    web_search: str
    # Retrieved documents; `grade_documents` reads `.page_content` on each item,
    # so these are Document objects rather than plain strings.
    documents: List[Document]
    # Number of question re-writes so far; incremented by `transform_query`.
    retry_count: int

In [None]:
# RETRIEVING_NODE
def retrieve(state):
    """Look up documents for the question currently held in the graph state.

    Args:
        state: graph state; only ``state["question"]`` is read.

    Returns:
        dict with ``documents`` (whatever ``retriever.invoke`` returned) and the
        unchanged ``question``.
    """
    print("---RETRIEVE---")
    query = state["question"]
    found = retriever.invoke(query)
    return {"documents": found, "question": query}

In [None]:
# GENERATING_NODE
def generate(state):
    """Produce an answer to the question from the documents in the graph state.

    Args:
        state: graph state; reads ``state["question"]`` and ``state["documents"]``
            (objects exposing ``page_content``).

    Returns:
        dict with the unchanged ``documents`` and ``question`` plus the model's
        answer under ``generation``.
    """
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # Send only the document text to the prompt. Interpolating the list itself
    # would insert its Python repr (metadata, escaped newlines) into the context.
    context = "\n\n".join(doc.page_content for doc in documents)
    response = generating_chain.invoke({"question": question, "document": context})

    print(response + "\n")

    return {"documents": documents, "question": question, "generation": response}

In [None]:
# GRADING_NODE
def grade_documents(state):
    """Keep only the retrieved documents that the grader marks as relevant.

    Args:
        state: graph state; reads ``state["question"]`` and ``state["documents"]``
            (objects exposing ``page_content``).

    Returns:
        dict with ``documents`` reduced to the relevant ones (possibly empty) and
        the unchanged ``question``.
    """
    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    # Score each doc
    filtered_docs = []
    for d in documents:
        score = grading_chain.invoke(
            {"question": question, "document": d.page_content}
        )
        # Anything other than a (case-insensitive) "yes" counts as not relevant,
        # including a reply that lacks the "score" key or holds a non-string.
        grade = str(score.get("score", "")).strip().lower()

        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            filtered_docs.append(d)
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")

    return {"documents": filtered_docs, "question": question}

In [None]:
# QUESTION_RE-WRITING_NODE
def transform_query(state):
    """Re-write the question and bump the retry counter.

    Args:
        state: graph state; reads ``state["question"]``, ``state["documents"]``
            and, when present, ``state["retry_count"]``.

    Returns:
        dict with the unchanged ``documents``, the re-written ``question`` and the
        incremented ``retry_count``.
    """
    print("---TRANSFORM QUERY---")
    question = state["question"]
    documents = state["documents"]
    # Treat a missing counter as zero so a run started without `retry_count`
    # in its inputs does not fail here with a KeyError.
    retry_count = state.get("retry_count", 0) + 1

    # Re-write question
    better_question = question_rewriter.invoke({"question": question})
    print(better_question)
    return {"documents": documents, "question": better_question, "retry_count": retry_count}

In [None]:
#CONDITIONAL_EDGE_FUNCTIONS
def decide_to_generate(state):
    """Choose the next node after document grading.

    Args:
        state: graph state; only ``state["documents"]`` (the documents that
            survived grading) is read.

    Returns:
        ``"transform_query"`` when no documents are left, otherwise ``"generate"``.
    """
    print("---ASSESS GRADED DOCUMENTS---")
    filtered_documents = state["documents"]

    if not filtered_documents:
        # Every retrieved document was filtered out by the relevance check,
        # so re-write the question and try again.
        print(
            "---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION---"
        )
        return "transform_query"

    # We have relevant documents, so generate answer
    print("---DECISION: GENERATE---")
    return "generate"


def _is_yes(score):
    """Return True when a grader reply carries a (case-insensitive) "yes" score."""
    return str(score.get("score", "")).strip().lower() == "yes"


def grade_generation_v_documents_and_question(state):
    """Check the generated answer against the documents and then the question.

    Args:
        state: graph state; reads ``state["question"]``, ``state["documents"]``
            (objects exposing ``page_content``) and ``state["generation"]``.

    Returns:
        ``"not supported"`` when the hallucination grader does not answer "yes";
        otherwise ``"useful"`` or ``"not useful"`` depending on the answer grader.
    """
    print("---CHECK HALLUCINATIONS---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    # Give the grader the document text rather than the repr of the list.
    facts = "\n\n".join(doc.page_content for doc in documents)
    score = hallucination_grader.invoke(
        {"documents": facts, "generation": generation}
    )

    # Check hallucination. Grades are compared case-insensitively, matching
    # `grade_documents`, so a reply of "Yes" is not mistaken for a failure.
    if not _is_yes(score):
        print("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
        return "not supported"

    print("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
    # Check question-answering
    print("---GRADE GENERATION vs QUESTION---")
    score = answer_grader.invoke({"question": question, "generation": generation})
    if _is_yes(score):
        print("---DECISION: GENERATION ADDRESSES QUESTION---")
        return "useful"

    print("---DECISION: GENERATION DOES NOT ADDRESS QUESTION---")
    return "not useful"

def check_retry_count(state):
    """Decide whether the workflow may loop back after a question re-write.

    Args:
        state: graph state; only ``state["retry_count"]`` is read.

    Returns:
        ``"end"`` once the counter is above 5, otherwise ``"continue"``.
    """
    attempts = state["retry_count"]
    print(f"---RETRY COUNT {attempts}---")
    # Cap the number of re-writes so the graph cannot cycle forever.
    return "end" if attempts > 5 else "continue"

In [None]:
#WORKFLOW_NODES
workflow = StateGraph(GraphState)

# Register each node function under the name the edges refer to, in the same
# order as before: retrieve, grade documents, generate, transform query.
for _node_name, _node_fn in (
    ("retrieve", retrieve),
    ("grade_documents", grade_documents),
    ("generate", generate),
    ("transform_query", transform_query),
):
    workflow.add_node(_node_name, _node_fn)

In [None]:
#WORKFLOW_EDGES
# Fixed path: start -> retrieve -> grade_documents
workflow.add_edge(START, "retrieve")
workflow.add_edge("retrieve", "grade_documents")

# After grading: generate when documents remain, otherwise re-write the question.
_after_grading = {
    "transform_query": "transform_query",
    "generate": "generate",
}
workflow.add_conditional_edges("grade_documents", decide_to_generate, _after_grading)

# After generating: finish on a useful answer, re-write the question when the
# answer is not useful, or generate again when it is not supported.
# NOTE(review): the "not supported" route returns straight to "generate" and
# never passes through `check_retry_count`, so this loop is not bounded by it.
_after_generation = {
    "not supported": "generate",
    "useful": END,
    "not useful": "transform_query",
}
workflow.add_conditional_edges(
    "generate", grade_generation_v_documents_and_question, _after_generation
)

# After re-writing: retrieve again unless the retry counter says to stop.
_after_rewrite = {
    "continue": "retrieve",
    "end": END,
}
workflow.add_conditional_edges("transform_query", check_retry_count, _after_rewrite)

# **Input here**

In [None]:
# User question; the run cell passes it to the workflow as the initial `question` state.
question = "What is embedding in RAG"

# **Run the Self-RAG!!**

In [None]:
# Compile
app = workflow.compile()

# Run
from langgraph.errors import GraphRecursionError

inputs = {"question": question,
          "retry_count": 0,
}

# `answer` follows the most recent node update explicitly, instead of relying on
# the loop variable still being bound after the loop has finished.
answer = None
end_reason = "---END OF PROCESS: EXCEED TIME LIMIT---"
try:
    for output in app.stream(inputs):
        for key, value in output.items():
            pprint(f"Finished running: {key}:")
            # Only `generate` updates carry a "generation"; any later update
            # (e.g. a question re-write) clears the candidate answer again.
            answer = (value or {}).get("generation")
except GraphRecursionError:
    # The generate -> "not supported" -> generate loop is not limited by the
    # retry counter, so the graph can hit LangGraph's recursion limit. The last
    # generation seen was judged unsupported, so do not present it as an answer.
    answer = None
    end_reason = "---END OF PROCESS: RECURSION LIMIT REACHED---"

if answer is not None:
    pprint(answer)
else:
    print("Sorry but I don't have related information \n")
    print(end_reason)

---RETRIEVE---
'Finished running: retrieve:'
---CHECK DOCUMENT RELEVANCE TO QUESTION---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT RELEVANT---
---GRADE: DOCUMENT NOT