In [1]:
import os
import re
import uuid

import google.generativeai as genai
from googlesearch import search
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader, TextLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.vectorstores import Chroma

In [2]:
# Safety configuration handed to every `generate_content` call in this
# notebook: one {"category", "threshold"} entry per harm category, each with
# the threshold set to "BLOCK_NONE".
# NOTE(review): both HARM_CATEGORY_DANGEROUS and HARM_CATEGORY_DANGEROUS_CONTENT
# are listed - confirm that both are really needed.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_NONE"}
    for harm_category in (
        "HARM_CATEGORY_DANGEROUS",
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

In [3]:
def get_prompt_retriever(context, question):
    """Build the relevance-grading prompt for one retrieved document.

    Args:
        context: Text of the retrieved document to be graded.
        question: The user question the document is graded against.

    Returns:
        The prompt string, asking for a score between 0 and 1 with no
        preamble or explanation.
    """
    # The wording (including its spelling) is runtime text sent to the model,
    # so it is kept exactly as it was; only the two placeholders are filled in.
    template = '''You are grader assessing relavance of a retrieved document to a user question. \n
    Here is the retrieved document:\n\n {context} \n
    Here is the user question:\n\n {question} \n
    If the document document contains keywords related to the user question, grade it as relevant. \n
    It does not need to be a stringent test.The goal is to filter out erroneous retrievels. \n
    Give a  score between 0 and 1 score to indicate the document is relevant to the question. \n
    Provide the score without any premable or explaination. \n'''
    return template.format(context=context, question=question)

# Retrieve Doc

In [4]:
def get_doc(question, url="https://articles.outlier.org/math-integration-definition"):
    """Fetch a web page and return the chunk of it most similar to `question`.

    The page is loaded with `WebBaseLoader`, split into 1000-token chunks
    (50-token overlap), embedded with `GPT4AllEmbeddings` and indexed in a
    throw-away Chroma collection that is searched for the single best match.

    Args:
        question: The user question used as the similarity query.
        url: Page to load. Defaults to the article this notebook was written
            around, so existing `get_doc(question)` calls behave as before.

    Returns:
        The `page_content` of the best-matching chunk.

    Raises:
        IndexError: If the retriever returns no chunk at all.
    """
    pages = WebBaseLoader(url).load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)

    # A fixed collection name would be reused by every call made in the same
    # kernel (the in-memory Chroma client get-or-creates collections by name),
    # so chunks indexed by earlier calls would leak into this search. A unique
    # name per call keeps each search isolated.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        collection_name=f"rag-chroma-{uuid.uuid4().hex}",
        embedding=GPT4AllEmbeddings(),
    )
    try:
        # The result count is set on the retriever itself: extra keyword
        # arguments to get_relevant_documents() are not reliably forwarded
        # across LangChain versions.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
        hits = retriever.get_relevant_documents(question)
    finally:
        # The collection is only needed for this one query.
        vectorstore.delete_collection()

    if not hits:
        raise IndexError(f"no chunk retrieved from {url!r} for question {question!r}")
    return hits[0].page_content

# Evaluator

In [5]:
def get_score(docs, question):
    """Ask Gemini to grade how relevant `docs` is to `question`.

    The grading prompt comes from `get_prompt_retriever` and is sent to the
    'gemini-pro' model with the notebook's `safety_settings`.

    Args:
        docs: Text of the retrieved document to grade.
        question: The user question.

    Returns:
        The relevance score parsed from the model reply, as a float.

    Raises:
        RuntimeError: If the GOOGLE_API_KEY environment variable is not set.
        ValueError: If the model reply contains no number.
    """
    # The API key is read from the environment instead of being committed in
    # the notebook. The key that used to be hard-coded here has been exposed
    # and should be revoked.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable before calling get_score().")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(get_prompt_retriever(docs, question), safety_settings=safety_settings)

    reply = response.text.strip()
    try:
        return float(reply)
    except ValueError:
        # The model does not always answer with a bare number ("Score: 0.8"),
        # so fall back to the first number found in the reply.
        number = re.search(r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?", reply)
        if number is None:
            raise ValueError(f"could not find a relevance score in model reply: {reply!r}") from None
        return float(number.group())

# Query Rewriter

In [6]:
def get_prompt_rewriter(question):
    """Build the prompt that asks the model to rewrite a user question.

    Args:
        question: The original user question.

    Returns:
        The prompt string, asking for a clearer and more concise question
        with the same meaning.
    """
    # Runtime text sent to the model: kept exactly as it was, with only the
    # placeholder filled in.
    template = '''You are a question rewriter. \n
    Here is the user question:\n\n {question} \n
    Rewrite the question to make it more clear and concise. \n
    At the same time, try to keep the meaning of the question the same. \n
    '''
    return template.format(question=question)

In [7]:
def rewrite_question(question):
    """Ask Gemini for a clearer, more concise version of `question`.

    The prompt comes from `get_prompt_rewriter` and is sent to the
    'gemini-pro' model with the notebook's `safety_settings`.

    Args:
        question: The original user question.

    Returns:
        The text of the model reply.

    Raises:
        RuntimeError: If the GOOGLE_API_KEY environment variable is not set.
    """
    # The API key is read from the environment instead of being committed in
    # the notebook. The key that used to be hard-coded here has been exposed
    # and should be revoked.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise RuntimeError("Set the GOOGLE_API_KEY environment variable before calling rewrite_question().")
    genai.configure(api_key=api_key)

    model = genai.GenerativeModel('gemini-pro')
    response = model.generate_content(get_prompt_rewriter(question), safety_settings=safety_settings)
    return response.text

# Knowledge Refinement

In [8]:
def refine_doc(doc, question, k=4):
    """Split `doc` into small chunks and keep the two most relevant ones.

    The text is written to 'docs_to_refine.md', loaded back with `TextLoader`,
    split into 250-token chunks (50-token overlap) and indexed in a throw-away
    Chroma collection. The `k` chunks most similar to `question` are then
    graded with `get_score`, and the two highest-scoring ones are returned.

    Args:
        doc: Text of the document to refine.
        question: The user question used for retrieval and grading.
        k: Number of candidate chunks to retrieve before grading.

    Returns:
        A tuple `(best_doc_index, best_doc)`:
        `best_doc_index` holds the positions (within the retrieved candidates)
        of the up-to-two best chunks, ordered from lower to higher score;
        `best_doc` holds the corresponding documents in the same order, so the
        best chunk is the last element.
    """
    # `with` guarantees the file is closed even if the write fails.
    with open('docs_to_refine.md', 'w', encoding="utf-8") as file:
        file.write(doc)
    docs_to_refine = TextLoader('docs_to_refine.md', encoding='UTF-8').load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=250, chunk_overlap=50)
    all_splits = text_splitter.split_documents(docs_to_refine)

    # A fixed collection name would be reused by every call made in the same
    # kernel (the in-memory Chroma client get-or-creates collections by name),
    # so chunks indexed by earlier calls would be graded alongside the chunks
    # of `doc`. A unique name per call keeps the search isolated.
    vectorstore = Chroma.from_documents(
        documents=all_splits,
        collection_name=f"rag-chroma-{uuid.uuid4().hex}",
        embedding=GPT4AllEmbeddings(),
    )
    try:
        # Several candidates are needed because the two best are picked below.
        # The count is set on the retriever itself: extra keyword arguments to
        # get_relevant_documents() are not reliably forwarded across LangChain
        # versions.
        retriever = vectorstore.as_retriever(search_kwargs={"k": k})
        docs_refined = retriever.get_relevant_documents(question)
    finally:
        # The collection is only needed for this one query.
        vectorstore.delete_collection()

    score = [get_score(candidate.page_content, question) for candidate in docs_refined]
    # Ascending sort, then the tail: the last entry is the best chunk.
    best_doc_index = sorted(range(len(score)), key=lambda i: score[i])[-2:]
    best_doc = [docs_refined[i] for i in best_doc_index]
    return best_doc_index, best_doc

# Knowledge Searching

In [9]:
def web_search(query, num_results=5):
    """Run a Google search and collect the results into a list.

    Args:
        query: The search query.
        num_results: Number of results requested from `search`.

    Returns:
        A list with every item yielded by `search`, in the order received.
    """
    return list(search(query, num_results=num_results))

In [10]:
def External_Knowledge(question):
    """Search the web for `question` and return the best chunk of the top hit.

    The first URL returned by `web_search` is loaded with `WebBaseLoader`,
    split into 1000-token chunks (50-token overlap), embedded with
    `GPT4AllEmbeddings` and indexed in a throw-away Chroma collection that is
    searched for the single best match.

    Args:
        question: The user question, used both as the web query and as the
            similarity query.

    Returns:
        The `page_content` of the best-matching chunk.

    Raises:
        IndexError: If the web search returns no result, or the retriever
            returns no chunk.
    """
    urls = web_search(question)
    if not urls:
        # Same exception type as the bare `[0]` would raise, but with a
        # message that says what actually went wrong.
        raise IndexError(f"web search returned no results for question {question!r}")
    url = urls[0]

    pages = WebBaseLoader(url).load()

    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, chunk_overlap=50)
    chunks = text_splitter.split_documents(pages)

    # A fixed collection name would be reused by every call made in the same
    # kernel (the in-memory Chroma client get-or-creates collections by name),
    # so chunks of previously indexed pages could be returned as "external"
    # knowledge. A unique name per call keeps the search isolated.
    vectorstore = Chroma.from_documents(
        documents=chunks,
        collection_name=f"rag-chroma-{uuid.uuid4().hex}",
        embedding=GPT4AllEmbeddings(),
    )
    try:
        # The result count is set on the retriever itself: extra keyword
        # arguments to get_relevant_documents() are not reliably forwarded
        # across LangChain versions.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
        hits = retriever.get_relevant_documents(question)
    finally:
        # The collection is only needed for this one query.
        vectorstore.delete_collection()

    if not hits:
        raise IndexError(f"no chunk retrieved from {url!r} for question {question!r}")
    return hits[0].page_content

# CRAG

x <= 0.3 --> Incorrect

0.3 < x < 0.7 --> Ambiguous

x >= 0.7 --> Correct

In [11]:
def CRAG(question, lower_threshold=0.3, upper_threshold=0.7):
    """Corrective RAG: pick the context for `question` based on retrieval quality.

    The document returned by `get_doc` is graded with `get_score`, and the
    score selects one of three actions:

    * score >= upper_threshold (correct): use the refined chunks of the
      retrieved document.
    * lower_threshold < score < upper_threshold (ambiguous): use external web
      knowledge followed by the best refined chunk.
    * otherwise (incorrect): use external web knowledge only.

    Args:
        question: The user question.
        lower_threshold: Scores at or below this value are treated as an
            incorrect retrieval.
        upper_threshold: Scores at or above this value are treated as a
            correct retrieval.

    Returns:
        The context string; parts are separated by a blank line.
    """
    docs = get_doc(question)
    score = get_score(docs, question)

    if score >= upper_threshold:
        _, refined_docs = refine_doc(docs, question)
        # refine_doc may return fewer than two chunks (short documents), so
        # join whatever came back instead of indexing [0] and [1].
        refined_text = '\n\n'.join(chunk.page_content for chunk in refined_docs)
        # With no refined chunk at all, fall back to the unrefined document.
        return refined_text or docs

    if lower_threshold < score < upper_threshold:
        _, refined_docs = refine_doc(docs, question)
        external_knowledge = External_Knowledge(question)
        if not refined_docs:
            return external_knowledge
        # refine_doc orders its chunks from lower to higher score, so the best
        # chunk is the last element. The indices it also returns refer to the
        # retriever's candidate list and must not be used to index this list.
        return external_knowledge + '\n\n' + refined_docs[-1].page_content

    return External_Knowledge(question)

In [12]:
# Demo run of the whole pipeline on a sample question. The call is left as the
# bare last expression so that the notebook displays the returned context.
question="What is the integration of x^2"
CRAG(question)

"F’(x)=f(x)F’(x) = f(x)F’(x)=f(x)An infinite number of functions fit this bill, since the derivative of any constant is simply zero. For a simple integration example, let f(x)=2xf(x) = 2xf(x)=2x. Observe that ∫(2x)dx=x2+C=F(x)\\int (2x) dx = x^2 + C = F(x)∫(2x)dx=x2+C=F(x). Now, notice the following:\rIf F(x)=x2+1F(x) = x^2+1F(x)=x2+1, then F’(x)=2xF’(x) = 2xF’(x)=2x\rIf F(x)=x2+2F(x) = x^2+2F(x)=x2+2, then F’(x)=2xF’(x)= 2xF’(x)=2xIf F(x)=x2+63F(x) = x^2+63F(x)=x2+63, then F’(x)=2xF’(x)= 2xF’(x)=2xIf F(x)=x2+100F(x) = x^2+100F(x)=x2+100, then F’(x)=2xF’(x) = 2xF’(x)=2x\r\n\r\nNo matter what constant we place at the end of the antiderivative function F(x)F(x)F(x), the derivative F’(x)F’(x)F’(x) is always the same; and F’(x)F’(x)F’(x) is always equal to the original function f(x)f(x)f(x). Since there are an infinite number of functions F(x)F(x)F(x) whose derivative is f(x)f(x)f(x), the inclusion of C gives us a way to express the general form of the antiderivative F(x)F(x)F(x) by repres