# Tools used

- Tuning hyperparameters
  - Parsers = pdfplumber (headers and footers removed)
  - Embedding model = BAAI/bge-large-en from HuggingFace
  - Vectorstore = FAISS (fast), chromadb (slow)
  - LLMs = llama3-70b-8192 from ollama (secure but slow), llama3-70b-8192 from groq (fast)
  - Text splitters = RecursiveCharacterTextSplitter, ParentDocumentRetriever, section-wise chunking
  - Two comparison approaches
    - prompting the LLM after getting context from both PDFs
    - prompting the LLM after getting both answers (performed better)

  
- A page in the largest PDF contains 254 words on average.


# Installing Dependencies and libraries

In [8]:
import time
import warnings
warnings.filterwarnings("ignore")

In [9]:
%%time
# 3 min
!pip install -q langchain
# !pip install -q langchain-core
!pip install -q langchain-community
!pip install -q fastembed
!pip install -q pypdf
!pip install -q langchain_groq
!pip install -q faiss-gpu
!pip install -q sentence_transformers
!pip install -q pdfplumber

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.4.1 requires cubinlinker, which is not installed.
cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.4.1 requires ptxcompiler, which is not installed.
cuml 24.4.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
keras-cv 0.9.0 requires keras-core, which is not installed.
keras-nlp 0.12.1 requires keras-core, which is not installed.
tensorflow-decision-forests 1.8.1 requires wurlitzer, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.8 which is incompatible.
apache-beam 2.46.0 requires numpy<1.25.0,>=1.14.3, but you have numpy 1.26.4 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 14.0.2 which is incompatible

In [10]:
import time
import numpy as np 
import pandas as pd
import random
import pdfplumber
import re
from sklearn.metrics.pairwise import cosine_similarity
# from langchain.embeddings import FastEmbedEmbeddings
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.chains.base import Chain
from langchain_community.document_loaders import PyPDFLoader
from langchain.docstore.document import Document
from langchain_community.vectorstores import FAISS
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain.retrievers import ParentDocumentRetriever
from IPython.display import Markdown, display
# from fastembed import LateInteractionTextEmbedding
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.storage import InMemoryStore
from tqdm.autonotebook import tqdm, trange

### Environment Variables

In [11]:
# AEBS regulation PDFs
path1, path2 = (
    "/kaggle/input/pdffiles/GB AEBS.pdf",
    "/kaggle/input/pdffiles/UN AEBS.pdf",
)

# Lighting-installation regulation PDFs
path3, path4 = (
    "/kaggle/input/pdffiles/GB Lighting installation.pdf",
    "/kaggle/input/pdffiles/R048r12e.pdf",
)

# PDF holding the trial questions
qpath = "/kaggle/input/queries-pdf/Questions_trial.pdf"

In [12]:
# Keep the first 46 rows of the query sheet and give its three columns fixed names.
query_df = pd.read_csv("/kaggle/input/query-csv/Queries_46.csv").head(46)
query_df = query_df.set_axis(["q_no", "query", "level"], axis=1)
# Plain Python list of the query strings, used by the evaluation loops below.
new_queries = query_df["query"].tolist()

In [31]:
# LLMs
import os  # needed for os.getenv below; no other cell in this notebook imports it

# Groq-hosted Llama 3 70B chat model; the API key is read from the GROQ_API_KEY
# environment variable so that it never has to be written into the notebook.
llama3 = ChatGroq(groq_api_key = os.getenv('GROQ_API_KEY'), model = 'llama3-70b-8192', temperature=0.01)

# Question-answering prompt templates. Each one is a plain string with two
# placeholders, {context} (retrieved text) and {question} (user query).

# template1: assistant persona; asks for detailed, structured answers with
# section references.
template1 = """
You are the Vehicle Regulation Assistant, a helpful AI assistant.
Your task is to answer given questions from the provided relevant part of the PDF.
The answer should be highly-detailed and well-sturctured.
If possible, refer to specific sections number within the context (e.g., "According to section 4.1.2,...").
Do not begin your response with phrases like "Based on the provided context, the answer to the question is:".
Be polite and helpful.

CONTEXT: {context}

QUESTION: {question}
"""
# template2: context-only answering with an explicit "no relevant information"
# fallback. This is the template wrapped into `prompt` further down.
template2 = """
Your task is to answer the question, using only the information provided in the given context.
The answer should be accuate and detailed.
Where applicable, refer to specific section numbers within the context (e.g., "According to section 4.1.2,...").
If the answer is not found in the provided context, simply state that there is no relevant information available 
without sharing details about the context.

CONTEXT: {context}

QUESTION: {question}
"""
# template3: like template2, but additionally asks the model to report a
# confidence level between 0 and 100%.
template3 = """
Please provide a detailed and well-structured response to the question below, using only the information provided in the context.
If the context does not contain information related to the question, explicitly state that there is no relevant information in the provided context. 
Be polite and helpful.
Also, provide a confidence level from 0 to 100% in your response based on how certain you are about the information you have provided.

CONTEXT: {context}

QUESTION: {question}
"""

dspy_template = """
        System: You are a system specialized in legal document analysis. Your tasks are to analyze the provided pages of legal documents related to vehicle regulations. For each document, perform the following:
            1. **Extract Relevant Content:** Identify and extract all relevant content related to the query from the document.
            2. **Extract Numerical Values:** Extract all numerical values, as they are crucial for regulatory compliance.
            3. **Summarize the Content:** Provide a detailed summary of the extracted information.
            4. **Section References:** Refer to specific section numbers where applicable (e.g., "According to section 4.1.2,...").
            5. **Missing Information:** Clearly state "No relevant information available" if the query cannot be answered based on the document.
            6. **Confidence Level:** Return the confidence level of your response in percentage at the top of your response in **bold letters**.
        If you receive two documents, follow these additional tasks:
            1. **Group Similar Responses:** Group responses from both documents that cover similar aspects.
            2. **Include Section Numbers:** Include section numbers from both documents where applicable.
            3. **Comparison Table:** Create a table comparing the grouped responses. The table should include the following columns: Aspect, Document 1, Document 2, and Equivalence. Use these guidelines for evaluation:
            - **Equivalent:** Information in both documents is essentially the same or conveys the same meaning.
            - **Partially Equivalent:** Information is similar but contains differences in details or conditions.
            - **Not Equivalent:** Information is significantly different or one document contains information not present in the other.
            - **Notes:** Provide detailed notes explaining how the responses align or differ, including specific details or generalizations.
            4. **Provide a JSON Summary:** Include a JSON summary of the comparison, indicating equivalence status and providing brief explanations.
        Generate the comparison table and JSON summary as described.
            **Example Table:**
            | **Aspect**                        | **Document 1**                                                                                     | **Document 2**                                                                                                 | **Equivalence**      | **Notes**                                                                                                                   |
            |-----------------------------------|--------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------|
            | **Section Number**                | Section 6                                                                                       | Section 5                                                                                                      | Dis-equivalent       | The sections of the documents differ (6 vs. 5).                                                                                         |
            | **Test Conditions**               | - Flat, dry concrete or asphalt surface with good adhesion                                      | - Test environment with horizontal visibility range                                                            | Partially Equivalent | Document 1 provides more specific details about test surface conditions, while Document 2 is more general. |
            // other aspects
            **json Summary:
            [
                "equivalence_status": "Partially Equivalent",
                "explanation": "The two documents have some similarities in their test procedures for lane departure warning systems, such as specifying test surface conditions and vehicle mass requirements. However, there are also notable differences in the specific tests and test parameters. Document 2 (GB/T 26773-2011) has additional tests for repeatability and false alarms, while Document 1 (UN/ECE Regulation No. 130) includes tests for failure detection and deactivation that are not present in Document 2. The warning test details and criteria also differ between the two documents."
            ]
    Context:
    {context}
    User: {question}
    Bot:"""
# Default QA prompt handed to the RetrievalQA chains: template2 with its two
# placeholders declared as input variables.
prompt = PromptTemplate(template=template2, input_variables=["question", "context"])

# Prompt that merges two independently generated answers into a single reply.
# Placeholders: {question}, {answer1}, {answer2}.
combine_template = """
Your task is to answer the question, by synthesizing relevant information from the provided answers.
The answer should be accuate and detailed.
Where applicable, refer to specific section numbers within the context (e.g., "According to section 4.1.2,...").
Do not reveal that the information comes from multiple answers, directly answer the question.

QUESTION: {question}

ANSWER 1: {answer1}

ANSWER 2: {answer2}
"""
# For comparison RAG 1: prompt that compares two already-generated answers.
# Placeholders: {question}, {answer1}, {answer2}.
comparison_template = """
We have provided a question and their two answers. Generate a comparison section without a heading which includes whether both answer are same or partially same or different. If they are paritially same, then what is same and what is different. This comparison is based on the answers generated from both the contexts. Accuracy and precision are crucial for this task.

QUESTION: {question}

ANSWER 1: {answer1}

ANSWER 2: {answer2}
"""
# For comparison RAG 2
def get_comparision_prompt(query, context1, context2):
    """Build the single-call comparison prompt for one question and two contexts.

    The prompt asks the model for three sections (ANSWER 1, ANSWER 2,
    COMPARISON) in one response.

    Args:
        query: The question text.
        context1: Retrieved text from the first document.
        context2: Retrieved text from the second document.

    Returns:
        The fully formatted prompt string.
    """
    # Local template; it does not touch the module-level `comparison_template`.
    three_section_template = """
    Response in three sections
    
    ANSWER 1: This is firts section, here answer the question form the context 1.
    
    ANSWER 2: This is second section, here answer the question form the context 2.
    
    COMPARISON: This is third section, here answer whether both answer are same or partially same or different. If they are paritially same, then what is same and what is different.
    This section is completely based on answer generated in first and second section.
    
    Please answer the question solely based on the provided context. If you can't answer any of the both questions from their context then just tell that there is no answer in that context. This is very important for my life, be very precised and accurate in answering the question and also in comparison..

    QUESTION: {question}

    CONTEXT1: {context1}

    CONTEXT2: {context2}
    """
    return three_section_template.format(
        question=query,
        context1=context1,
        context2=context2,
    )

# Lighting: hand-written questions about the lighting-installation regulation.
old_queries = [
    "Whats the difference between Grouped and Combined lamps?",
    "Can dipped-beam headlamp and main-beam headlamp for front lighting system?",
    "what is color of End Outline marker lamp?",
    "Can yellow lamp used as front fog lamp?",
    "Can red color light placed in the front of the vehicle?",
    "Can white light can be placed at the back of the vehicle?",
    "What are 1,1,a,1b,2a,2b,5,6 in direction indicator lamps?",
    "is cornering lamp mandatory?",
    "does reflective tape come under light and light signalling?",
    "standard weight of a person for testing?",
    "can dipped beam uses as a main beam?",
    "what are the light functions to be kept rear of the vehicle?",
    "What lamp should be fitted for passenger vehicles?",
]
# Full evaluation set: queries loaded from the CSV first, then the hand-written ones.
queries = [*new_queries, *old_queries]

# Embedding models

In [8]:
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
def get_embedding_model(model_name='stella_en_1.5B_v5'):
    """Load a tokenizer/model pair through the transformers Auto classes.

    Args:
        model_name: Identifier passed unchanged to both `from_pretrained` calls.

    Returns:
        A `(tokenizer, model)` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModel.from_pretrained(model_name),
    )

def embed_texts(texts, tokenizer, model):
    """Embed `texts` by mean-pooling the model's last hidden state.

    Because the batch is padded, pooling is weighted by the attention mask so
    padding positions do not dilute the embeddings of shorter texts. If the
    tokenizer output carries no `attention_mask`, a plain mean over the
    sequence dimension is used instead.

    Args:
        texts: List of strings to embed.
        tokenizer: Callable tokenizer returning PyTorch tensors.
        model: Model whose output exposes `last_hidden_state`.

    Returns:
        A tensor with one pooled embedding row per input text.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state

    if "attention_mask" not in inputs:
        return hidden.mean(dim=1)

    # Broadcast the mask over the hidden dimension and average real tokens only.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against division by zero for an all-padding row.
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

def get_vectorstore1(path, embedding_model_name='stella_en_1.5B_v5'):
    """Build a FAISS store over fixed-size chunks using a transformers model.

    `FAISS.from_documents` needs an Embeddings object (it embeds the documents
    itself and keeps the object to embed queries later), not a tensor of
    precomputed vectors. The tokenizer/model pair is therefore wrapped in a
    small Embeddings adapter.

    Args:
        path: Path of the PDF to index.
        embedding_model_name: Model identifier given to `get_embedding_model`.

    Returns:
        A FAISS vector store containing one entry per non-blank chunk.
    """
    from langchain_core.embeddings import Embeddings

    # Extract text from the document
    text = extract_text(path)

    # Split the text into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = [
        Document(page_content=chunk)
        for chunk in text_splitter.split_text(text)
        if chunk.strip()
    ]

    # Load the embedding model
    tokenizer, model = get_embedding_model(embedding_model_name)
    # Bind the three-argument helper now: a later cell redefines `embed_texts`
    # with a different signature.
    embed = embed_texts

    class _TransformerEmbeddings(Embeddings):
        """Adapter exposing the tokenizer/model pair through the Embeddings API."""

        def embed_documents(self, texts):
            return embed(list(texts), tokenizer, model).tolist()

        def embed_query(self, text):
            return embed([text], tokenizer, model)[0].tolist()

    # Create the FAISS vector store
    return FAISS.from_documents(docs, _TransformerEmbeddings())

In [None]:
# Example usage
# Example usage: build the vector store for the first AEBS PDF (path1).
path_to_document = path1
vectorstore = get_vectorstore1(path_to_document)

In [None]:
%%time
# BGE-large English embedding model from HuggingFace, placed on the CUDA device;
# embeddings are left un-normalised.
hf_bge_large = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en",
    model_kwargs=dict(device='cuda'),
    encode_kwargs=dict(normalize_embeddings=False),
)

In [15]:
# embedding_model = FastEmbedEmbeddings(model_name = "BAAI/bge-small-en-v1.5")
# fee_bge_large = FastEmbedEmbeddings(model_name = "BAAI/bge-large-en-v1.5")

In [16]:
# The same BGE model serves both chunk embedding and header/footer detection.
embedding_model = embedding_model_header = hf_bge_large

# Section wise chunking
- after removing header footer

In [17]:
def embed_texts(texts):
    """Embed a list of strings with the notebook-level `embedding_model`.

    The method is called on the instance rather than through a hard-coded
    class, so switching `embedding_model` to another embeddings
    implementation needs no change here.

    Args:
        texts: List of strings to embed.

    Returns:
        Whatever `embedding_model.embed_documents` returns for `texts`.
    """
    return embedding_model.embed_documents(texts)

def _count_repeated_lines(pages_lines, from_end, threshold, max_lines=4):
    """Count line positions (from the top or bottom) that repeat across pages.

    Position 0, 1, ... is checked in turn. A position counts as repeated when
    the mean pairwise cosine similarity of that line across the sampled pages
    exceeds `threshold`. Counting stops at the first position that fails, when
    fewer than two pages have a line there, or after `max_lines` positions.
    """
    count = 0
    while count < max_lines:
        position = -(count + 1) if from_end else count
        candidates = [lines[position] for lines in pages_lines if len(lines) > count]
        if len(candidates) < 2:
            # No pair to compare, so nothing can be called "repeated".
            break
        similarities = cosine_similarity(embed_texts(candidates))
        # Upper triangle without the diagonal = every distinct pair exactly once.
        avg_similarity = np.mean(similarities[np.triu_indices(len(similarities), k=1)])
        if not avg_similarity > threshold:
            break
        count += 1
    return count


def get_header_footer(pdf_path, threshold=0.71):
    """Estimate how many header and footer lines repeat on the pages of a PDF.

    Ten random pages (skipping the first five) are sampled when the PDF has
    at least 15 pages; otherwise every page is used. Pages without extractable
    text are ignored. Each sampled page is extracted once and reused for both
    the header and the footer scan.

    Args:
        pdf_path: Path of the PDF to inspect.
        threshold: Mean cosine similarity above which a line position is
            considered repeated boilerplate.

    Returns:
        `(header_lines, footer_lines)`: number of leading and trailing lines
        per page judged to be header/footer, each between 0 and 4.
    """
    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)
        if total_pages >= 15:
            random_page_nos = random.sample(range(5, total_pages), 10)
        else:
            random_page_nos = list(range(total_pages))

        pages_lines = []
        for page_no in random_page_nos:
            page_text = pdf.pages[page_no].extract_text()
            if page_text:
                pages_lines.append(page_text.split('\n'))

    header_lines = _count_repeated_lines(pages_lines, False, threshold)
    footer_lines = _count_repeated_lines(pages_lines, True, threshold)
    return header_lines, footer_lines
    
def extract_text(pdf_path):
    """Return the text of a PDF with repeated header/footer lines removed.

    `get_header_footer` returns the *number* of header and footer lines, so
    exactly that many lines are cut from the top and bottom of every page.
    (The previous slice `-(footer_lines + 1)` always removed one extra body
    line at the bottom of each page.)

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The remaining text of all pages that have extractable text, each page
        followed by a newline.
    """
    header_lines, footer_lines = get_header_footer(pdf_path)
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if not page_text:
                continue
            lines = page_text.split('\n')
            # An explicit end index avoids the `[:-0]` empty-slice trap; max()
            # keeps the slice empty (never reversed) on very short pages.
            end = max(header_lines, len(lines) - footer_lines)
            page_texts.append('\n'.join(lines[header_lines:end]) + '\n')
    return ''.join(page_texts)

In [18]:
# Section-heading pattern used by section_wise_chunking: a newline, a section
# number from 1 to 19, a dot and a space, then a capitalised word.
pattern = re.compile(r'\n([1-9]|1[0-9])\. [A-Z][a-zA-Z]+')   # United Nations documents
# Alternative patterns kept for reference (disabled):
# pattern = re.compile('\d\. [A-Z]')                          # Smb United nations
# pattern = re.compile(r'\n([1-9]|1[0-9]) [A-Z][a-zA-Z]+')    # Chinese documents
# pattern = re.compile(r'(\n(1[0-9]|[1-9])\s+[A-Z][a-zA-Z]+.*?)(?=\n(?:[1-9]|1[0-5])\s+[A-Z]|$)', re.DOTALL) # Chinese documents

In [19]:
def section_wise_chunking(pdf_path):
    """Split a PDF into chunks that follow its numbered top-level sections.

    The cleaned text is cut at every match of the module-level `pattern`.
    Sections are then balanced by word count: a section is appended to the
    previous chunk while the combined size stays below 900 words, and a
    section above 850 words is split further, with its first line repeated
    (plus " (Partial)") at the top of every continuation piece.

    Args:
        pdf_path: Path of the PDF to chunk.

    Returns:
        List of chunk strings.
    """
    text = extract_text(pdf_path)

    # Use the positions of the matches to split the text into sections
    sections = []
    last_index = 0
    for match in pattern.finditer(text):
        start = match.start()
        section_text = text[last_index:start].strip()
        if section_text:
            sections.append(section_text)
        last_index = start
    tail = text[last_index:].strip()
    if tail:  # skip a whitespace-only remainder instead of adding an empty section
        sections.append(tail)

    # Handling too small and too large sections
    splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=300)
    text_chunks = []
    last_chunk_token_count = 0
    for section in sections:
        tokens_count = len(section.split())
        if text_chunks and tokens_count + last_chunk_token_count < 900:
            # Small enough to ride along with the previous chunk.
            text_chunks[-1] += "\n" + section
        elif tokens_count > 850:
            pieces = splitter.split_text(section)
            heading = pieces[0].split('\n')[0]
            text_chunks.append(pieces[0])
            # Continuation pieces carry the section heading for context.
            text_chunks.extend(heading + ' (Partial)\n' + piece for piece in pieces[1:])
        else:
            text_chunks.append(section)
        last_chunk_token_count = len(text_chunks[-1].split())
    return text_chunks

In [20]:
# print("No. of chunks:", len(chunks))
# for i, chunk in enumerate(chunks):
# #     print(chunk)
#     print("\n" + "_"*0,"chunk",i, "have", len(chunk.split()), "words\n")

# Get vectorstore
- FAISS.from_documents takes list of docs as argument


In [21]:
# for RecursiveCharacterTextSplitter
def get_vectorstore1(path):
    """Index a PDF in FAISS using fixed-size character chunks.

    The cleaned text from `extract_text` is split into 1000-character chunks
    with 200 characters of overlap; blank chunks are dropped and the rest are
    embedded with the notebook-level `embedding_model`.

    Args:
        path: Path of the PDF to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = []
    for chunk in splitter.split_text(extract_text(path)):
        if chunk.strip():
            docs.append(Document(chunk))
    return FAISS.from_documents(docs, embedding_model)

In [22]:
# for section_wise_chunking
def get_vectorstore2(path):
    """Index a PDF in FAISS using section-wise chunks.

    Chunks come from `section_wise_chunking`; blank ones are dropped and the
    rest are embedded with the notebook-level `embedding_model`.

    Args:
        path: Path of the PDF to index.

    Returns:
        The FAISS vector store built from the chunks.
    """
    docs = []
    for chunk in section_wise_chunking(path):
        if chunk.strip():
            docs.append(Document(chunk))
    return FAISS.from_documents(docs, embedding_model)

# Double retrieval RAG

In [23]:
%%time
# Retriever over the get_vectorstore1 index of path4; returns the top 6 chunks per query.
retriever1 = get_vectorstore1(path4).as_retriever(search_kwargs={"k": 6})

CPU times: user 47.2 s, sys: 297 ms, total: 47.5 s
Wall time: 47.6 s


In [24]:
%%time
# Retriever over the section-wise index of path4; returns the top 3 chunks per query.
retriever2 = get_vectorstore2(path4).as_retriever(search_kwargs={"k": 3})

CPU times: user 32.7 s, sys: 161 ms, total: 32.9 s
Wall time: 32.7 s


In [29]:
def Double_RAG(query, verbose = 0):
    """Answer `query` from two retrievers and merge the two answers.

    One RetrievalQA chain is run over `retriever1` and one over `retriever2`,
    both with the module-level `prompt`. A third LLM call then combines the
    two answers through `combine_template`.

    Args:
        query: The question text.
        verbose: When greater than 0, print both intermediate answers.

    Returns:
        The combined answer text.
    """
    # No module-level `llm` exists in this notebook; bind the Groq model
    # locally, the same way Comparison_RAG1 does.
    llm = llama3

    chain1 = RetrievalQA.from_llm(llm=llm, retriever=retriever1, prompt= prompt)
    chain2 = RetrievalQA.from_llm(llm=llm, retriever=retriever2, prompt= prompt)

    answer1 = chain1.invoke(query)['result']
    answer2 = chain2.invoke(query)['result']

    if verbose > 0:
        print("ANSWER1:", answer1)
        print("ANSWER2:", answer2)

    combine_prompt = combine_template.format(question = query,answer1 = answer1, answer2 = answer2)
    response = llm.invoke(combine_prompt).content

    return response

In [32]:
# Run Double_RAG on the first query, printing the intermediate answers
# (verbose=1) and the elapsed wall-clock time.
query = queries[0]
print(f"QUERY:{query} ")
s = time.time()
response = Double_RAG(query, 1) 
display(Markdown(response))
print("_"*60, " generated in", time.time()-s)

QUERY:What are the requirements for L category of vehicle? 
ANSWER1: There is no relevant information available in the provided context regarding the requirements for L category of vehicle. The context only mentions categories M, N, O, and their subcategories, but does not mention category L.
ANSWER2: There is no relevant information available in the provided context regarding the requirements for L category of vehicles. The context only mentions categories M, N, O, and their subcategories, but does not mention category L.


Unfortunately, there is no relevant information available regarding the requirements for L category of vehicle. The provided context only mentions categories M, N, O, and their subcategories, but does not mention category L. Therefore, it is not possible to provide the requirements for L category of vehicle.

____________________________________________________________  generated in 1.3242483139038086


In [33]:
# Same timed Double_RAG run for the query at index 10.
query = queries[10]
print(f"QUERY:{query} ")
s = time.time()
response = Double_RAG(query,1) 
display(Markdown(response))
print("_"*60, " generated in", time.time()-s)

QUERY:What are the lights placed in front of the vehicle? 
ANSWER1: According to the provided context, the lights placed in front of the vehicle are:

* Front position lamp (2.7.14): used to indicate the presence and width of the vehicle when viewed from the front.
* Daytime running lamp (2.7.25): used to make the vehicle more easily visible when driving during daytime.
* Cornering lamp (2.7.26): used to provide supplementary illumination of that part of the road which is located near the forward corner of the vehicle.
* Parking lamp (2.7.22): used to draw attention to the presence of a stationary vehicle in a built-up area, replacing the front and rear position lamps.
* End-outline marker lamp (2.7.23): fitted near to the extreme outer edge and as close as possible to the top of the vehicle, intended to indicate clearly the vehicle's overall width.
* Main-beam headlamps (6.1.7.3 and 6.1.7.4): used for illumination of the road ahead.

Note that these lights may not be exhaustive, as th

The lights placed in front of the vehicle are:

* Front position lamp: used to indicate the presence and width of the vehicle when viewed from the front.
* Daytime running lamp: used to make the vehicle more easily visible when driving during daytime.
* Cornering lamp: used to provide supplementary illumination of that part of the road which is located near the forward corner of the vehicle.
* Parking lamp: used to draw attention to the presence of a stationary vehicle in a built-up area, replacing the front and rear position lamps.
* End-outline marker lamp: fitted near to the extreme outer edge and as close as possible to the top of the vehicle, intended to indicate clearly the vehicle's overall width.
* Main-beam headlamps: used for illumination of the road ahead.
* Headlamps: used for illumination of the road ahead.
* Adaptive front lighting system (AFS): used to provide adaptive illumination of the road ahead.

These lights are placed in front of the vehicle and serve various purposes, including indicating the vehicle's presence, providing illumination, and enhancing safety.

____________________________________________________________  generated in 37.172677755355835


In [35]:
# Same timed Double_RAG run for an ad-hoc query that asks for a document-wide list.
query = "List all the lamps and lights mentioned in the pdf. Also mention which of them are placed in front or rear of the vehicle."
print(f"QUERY:{query} ")
s = time.time()
response = Double_RAG(query, 1) 
display(Markdown(response))
print("_"*60, " generated in", time.time()-s)

QUERY:List all the lamps and lights mentioned in the pdf. Also mention which of them are placed in front or rear of the vehicle. 
ANSWER1: Based on the provided context, the following lamps and lights are mentioned:

1. Front position lamps (placed in the front of the vehicle)
2. Rear position lamps (placed in the rear of the vehicle)
3. End-outline marker lamps (placed near the extreme outer edge and as close as possible to the top of the vehicle, can be placed in the front or rear)
4. Side marker lamps (placed on the side of the vehicle)
5. Rear registration plate lamp (placed in the rear of the vehicle)
6. Parking lamps (can replace front and rear position lamps when the vehicle is stationary in a built-up area)
7. Daytime running lamps (facing in a forward direction, placed in the front of the vehicle)
8. Cornering lamps (used to provide supplementary illumination of the road near the forward corner of the vehicle, placed in the front of the vehicle)
9. Adaptive front lighting syst

Here is the comprehensive list of lamps and lights mentioned in the context, along with their placement on the vehicle:

1. Front position lamps (front)
2. Rear position lamps (rear)
3. End-outline marker lamps (front and rear)
4. Side marker lamps (side)
5. Rear registration plate lamp (rear)
6. Parking lamps (front and rear)
7. Daytime running lamps (front)
8. Cornering lamps (front)
9. Adaptive front lighting system (AFS) (front)
10. Front retro-reflectors (non-triangular, front)
11. Side retro-reflectors (non-triangular, side)
12. Rear retro-reflectors (non-triangular, rear)
13. Direction-indicator lamps (categories 1, 1a, 1b, 2a, 2b, 5, and 6, front, rear, or side)
14. Manoeuvring lamps (front or rear)
15. Rear fog lamp (rear)
16. Stop lamp (rear)
17. Emergency stop signal (rear)
18. Exterior courtesy lamp (front or rear)
19. Lighting unit (part of AFS, front)
20. Installation unit (part of AFS, front)
21. Interdependent lamp system (various locations)
22. Interdependent lamp (part of interdependent lamp system, various locations)
23. Conspicuity markings (various locations)

Note that some lamps may have multiple categories or types, but they are listed here only once.

____________________________________________________________  generated in 10.916836977005005


In [None]:
# Run Double_RAG over every query, display each response with its elapsed
# time, and collect query/response pairs for export.
saved_responses = []
for q_no, query in enumerate(queries):
    print(f"QUERY {q_no}:{query} ")
    s = time.time()
    response = Double_RAG(query) 
    print("RESPONSE:")
    display(Markdown(response))
    print("_"*60, " generated in", time.time()-s)
    saved_responses.append({"query": query, "response": response})

In [None]:
# Write the collected query/response pairs to CSV, without the index column.
df = pd.DataFrame(saved_responses)
df.to_csv('responses_DRAG_.csv', index=False)

In [26]:
def get_vectorstore3(path):
    """Index fixed-size chunks of a PDF with `colbert1` and return a retriever.

    NOTE(review): `colbert1` is not defined anywhere in this notebook;
    presumably a ColBERT/RAGatouille model object - confirm before running.

    Args:
        path: Path of the PDF to index.

    Returns:
        The retriever from `colbert1.as_langchain_retriever(k=6)`.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_text(extract_text(path))
    colbert1.index(
        collection=chunks,
        index_name="file1",
        max_document_length=10000,
        split_documents=True,
    )
    return colbert1.as_langchain_retriever(k=6)

def get_vectorstore4(path):
    """Index section-wise chunks of a PDF with `colbert2` and return a retriever.

    NOTE(review): `colbert2` is not defined anywhere in this notebook;
    presumably a ColBERT/RAGatouille model object - confirm before running.

    Args:
        path: Path of the PDF to index.

    Returns:
        The retriever from `colbert2.as_langchain_retriever(k=3)`.
    """
    chunks = section_wise_chunking(path)
    colbert2.index(
        collection=chunks,
        index_name="file2",
        max_document_length=10000,
        split_documents=True,
    )
    return colbert2.as_langchain_retriever(k=3)

In [None]:
%%time
# ColBERT-backed retriever over fixed-size chunks of path4.
retriever3 = get_vectorstore3(path4)

In [None]:
%%time
# ColBERT-backed retriever over section-wise chunks of path4.
retriever4 = get_vectorstore4(path4)

In [None]:
def Double_RAG(query, verbose = 0):
    """Answer `query` from two retrievers and merge the two answers.

    One RetrievalQA chain is run over `retriever1` and one over `retriever2`,
    both with the module-level `prompt`. A third LLM call then combines the
    two answers through `combine_template`.

    NOTE(review): this redefinition follows the cells that build `retriever3`
    and `retriever4` but still reads `retriever1`/`retriever2`; presumably the
    ColBERT retrievers were meant here - confirm.

    Args:
        query: The question text.
        verbose: When greater than 0, print both intermediate answers.

    Returns:
        The combined answer text.
    """
    # No module-level `llm` exists in this notebook; bind the Groq model
    # locally, the same way Comparison_RAG1 does.
    llm = llama3

    chain1 = RetrievalQA.from_llm(llm=llm, retriever=retriever1, prompt= prompt)
    chain2 = RetrievalQA.from_llm(llm=llm, retriever=retriever2, prompt= prompt)

    answer1 = chain1.invoke(query)['result']
    answer2 = chain2.invoke(query)['result']

    if verbose > 0:
        print("ANSWER1:", answer1)
        print("ANSWER2:", answer2)

    combine_prompt = combine_template.format(question = query,answer1 = answer1, answer2 = answer2)
    response = llm.invoke(combine_prompt).content

    return response

In [22]:
def RAG1(query):
    """Answer `query` with a RetrievalQA chain over the `RAG` retriever (k=3).

    NOTE(review): `RAG` is not defined anywhere in this notebook; presumably a
    ColBERT/RAGatouille model object - confirm before running.

    Args:
        query: The question text.

    Returns:
        The chain's 'result' string.
    """
    qa_chain = RetrievalQA.from_llm(
        llm=llama3,
        retriever=RAG.as_langchain_retriever(k=3),
        prompt=prompt,
    )
    output = qa_chain.invoke(query)
    return output['result']

In [23]:
# Smoke test: answer the query at index 10 with RAG1.
RAG1(queries[10])

'According to section 6.22.4.2, all lighting units of an AFS (Adaptive Front-lighting System) shall be mounted at the front of the vehicle. Additionally, section 6.12.1 mentions that parking lamps are optional on motor vehicles not exceeding 6 m in length and not exceeding 2 m in width. Therefore, the lights placed in front of the vehicle are the AFS lighting units and possibly parking lamps, if installed.'

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# Kept under its own name so that it does not overwrite the module-level
# `prompt` (question/context variables) that the RetrievalQA cells rely on;
# this template uses `{input}` instead of `{question}`.
retrieval_prompt = ChatPromptTemplate.from_template(
    """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""
)

# creating Document stuff chain
# `llama3` is used because no module-level `llm` exists in this notebook.
document_chain = create_stuff_documents_chain(llama3, retrieval_prompt)

# creating retrieval chain
# NOTE(review): `retriever` is only assigned in a later cell - run that cell first.
retrieval_chain = create_retrieval_chain(retriever, document_chain)


# invoking the chain
retrieval_chain.invoke({"input": queries[10]})

In [None]:
!git clone https://github.com/stanford-futuredata/ColBERT.git

## RAG 1
- Uses the FAISS retriever
- Advantage: works with any k
- Uses the RetrievalQA chain

In [None]:
%%time
# Default retriever used by RAG1 below.
# NOTE(review): `vectorstore_path4` is not defined anywhere in this notebook;
# presumably a FAISS store built from path4 - confirm.
# retriever = get_vectorstore_2(path4).as_retriever(search_kwargs={"k": 4})
retriever = vectorstore_path4.as_retriever()

In [None]:
def RAG1(query):
    """Answer `query` with a RetrievalQA chain over the module-level `retriever`.

    Args:
        query: The question text.

    Returns:
        The chain's 'result' string.
    """
    chain = RetrievalQA.from_llm(llm=llama3, retriever=retriever, prompt=prompt)
    output = chain.invoke(query)
    return output['result']

In [None]:
# Answer every query with RAG1, printing the word counts of the retrieved
# chunks, and collect query/response pairs for export.
# NOTE(review): `queries4` is not defined anywhere in this notebook - confirm
# which query list is intended.
data = []
for query in queries4:
    print("Query:",query)
    # Retrieve once and reuse the result; slicing (instead of indexing 0..3)
    # also copes with fewer than four hits.
    retrieved_docs = retriever.invoke(query)
    print("Tokens:",[len(doc.page_content.split()) for doc in retrieved_docs[:4]])
    response = RAG1(query)
    display(Markdown(response))
    print("_"*100)
    data.append({"query": query, "response": response})

In [None]:
# Export the collected query/response pairs to an Excel file, without the index column.
df = pd.DataFrame(data)
df.to_excel('responses10_prompt3.xlsx', index=False)

In [None]:
# Inspect the raw chunks the retriever returns for a single question.
q = "what is color of End Outline marker lamp?"
for context in retriever.invoke(q):
    print(context.page_content)
    print("_"*80)

# Comparison_RAG2
- using PDR
- Fixed k = 4

In [None]:
def PDR(path):
    """Build a ParentDocumentRetriever over the whole text of a PDF.

    All pages are joined into one document, which the retriever splits into
    1500-character parent chunks and 300-character child chunks. Child chunks
    are embedded into Chroma; parents are kept in an in-memory docstore.

    Every call uses its own Chroma collection name. In-memory Chroma
    collections are shared by name within a process, so a fixed name would
    mix the child chunks of different PDFs into one collection.

    Args:
        path: Path of the PDF to index.

    Returns:
        The populated ParentDocumentRetriever.
    """
    import uuid

    # `Chroma` is not imported at the top of the notebook.
    from langchain_community.vectorstores import Chroma

    documents = PyPDFLoader(path).load()
    combined_text = "\n".join(document.page_content for document in documents)
    document = [Document(page_content=combined_text, metadata={"source": path})]

    vectorstore = Chroma(collection_name=f"full_documents_{uuid.uuid4().hex}",
                         embedding_function=FastEmbedEmbeddings())
    retriever = ParentDocumentRetriever(vectorstore=vectorstore,
                                        docstore=InMemoryStore(),
                                        child_splitter=RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100),
                                        parent_splitter=RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100))

    retriever.add_documents(document, ids=None)
    return retriever

In [None]:
%%time
# Parent-document retriever over path4 (rebinds `retriever2`).
retriever2 = PDR(path4)

In [None]:
def Comparison_RAG2(query):
    """Answer `query` with a RetrievalQA chain over the module-level `retriever2`.

    Args:
        query: The question text.

    Returns:
        The chain's 'result' string.
    """
    chain = RetrievalQA.from_llm(llm=llama3, retriever=retriever2, prompt=prompt)
    output = chain.invoke(query)
    return output['result']

In [None]:
# Answer every query with Comparison_RAG2, print each response, and collect
# query/response pairs for export.
# NOTE(review): `queries4` is not defined anywhere in this notebook - confirm
# which query list is intended.
data = []
for query in queries4:
    print("#",query)
    response = Comparison_RAG2(query) 
    print(response)
    print("---------------------------------------------------------------------")
    data.append({"query": query, "response": response})

In [None]:
# Save the collected responses and export them to Excel, without the index column.
df = pd.DataFrame(data)
df.to_excel('query_responses10.xlsx', index=False)

# Comparison_RAG 1
- RAG for comparison
- by FAISS Retrieval
- by calling LLM 3 times
- chain - RetrievalQA

In [None]:
%%time
# One top-6 retriever per AEBS document (path1 and path2).
# NOTE(review): `get_vectorstore` is not defined anywhere in this notebook;
# presumably get_vectorstore1 or get_vectorstore2 - confirm.
retriever1 = get_vectorstore(path1).as_retriever(search_kwargs={"k": 6})
retriever2 = get_vectorstore(path2).as_retriever(search_kwargs={"k": 6})

In [None]:
def Comparison_RAG1(query):
    '''
    Answer a query against both PDFs and have the LLM compare the answers.

    A RetrievalQA chain is run once over retriever1 and once over retriever2;
    the two answers are then placed into comparison_template and sent to the
    LLM for a third call.

    Args:
    - query (str): The question to ask of both documents.

    Returns:
    - str: Markdown text with both answers followed by the comparison.
    '''
    def _answer_from(retriever):
        # One RetrievalQA round-trip against a single document's retriever.
        chain = RetrievalQA.from_llm(llm=llama3, retriever=retriever, prompt=prompt)
        return chain.invoke(query)['result']

    answer1 = _answer_from(retriever1)
    answer2 = _answer_from(retriever2)

    filled_template = comparison_template.format(
        question=query,
        answer1=answer1,
        answer2=answer2,
    )
    comparison = llama3.invoke(filled_template).content

    return f"**ANSWER 1**: {answer1}\n\n**ANSWER 2**: {answer2}\n\n**COMPARISION**: {comparison}"

In [None]:
# AEBS: run the comparison pipeline on each question and render the
# markdown result inline.
queries2 = ["Explain the test procedures in detail, in details.", "What are warning indications, in details."]
for query in queries2:
    # Fixed the misspelled function name, which raised NameError.
    display(Markdown(Comparison_RAG1(query)))

# Comparison_RAG 2
- by PDR
- 3 LLM calls

In [None]:
# Rebind retriever1/retriever2 to parent-document retrievers, one per PDF.
# These names were assigned earlier in the notebook, so cell order matters.
retriever1 = PDR(path1)
retriever2 = PDR(path2)

In [None]:
def Comparison_RAG2(query):
    '''
    Answer a query against both PDFs from retrieved context, then compare.

    Context is fetched for each document with get_context, each context is
    placed into qa_template and answered by llama3, and the two answers are
    finally placed into comparison_template for a third llama3 call.

    Args:
    - query (str): The question to ask of both documents.

    Returns:
    - str: Markdown text with both answers followed by the comparison.
    '''
    # NOTE(review): get_context must accept a retriever here — confirm which
    # definition of get_context is active when this cell runs.
    context1 = get_context(query, retriever1)
    context2 = get_context(query, retriever2)

    qa_prompt1 = qa_template.format(question=query, context=context1)
    qa_prompt2 = qa_template.format(question=query, context=context2)

    answer1 = llama3.invoke(qa_prompt1).content
    answer2 = llama3.invoke(qa_prompt2).content

    comparison_prompt = comparison_template.format(question=query, answer1=answer1, answer2=answer2)
    # Use the same model as the two answer calls; `llm` is not bound in this
    # function (it is only a local variable inside Comparison_RAG1).
    comparison = llama3.invoke(comparison_prompt).content

    response = f"**ANSWER 1**: {answer1}\n\n**ANSWER 2**: {answer2}\n\n**COMPARISION**: {comparison}"
    return response

# Comparison_RAG 3
- Vectorstore to context from scratch
- Disadvantage: 1 prompt, 1 LLM call, too much load on generation

In [None]:
def get_context(query, vectorstore, k=6):
    '''
    Retrieve the chunks most similar to a query and join them into one string.

    Args:
    - query (str): The search query.
    - vectorstore: Object exposing similarity_search_with_relevance_scores.
    - k (int): Number of chunks to retrieve (default is 6).

    Returns:
    - str: The page_content of the retrieved chunks, concatenated in the
      order returned, with no separator.
    '''
    scored_docs = vectorstore.similarity_search_with_relevance_scores(query, k=k)
    # Each result is a (document, relevance_score) pair, so the pair has to be
    # unpacked before page_content can be read.
    return "".join(doc.page_content for doc, _score in scored_docs)

In [None]:
# Build one vector store per PDF so context can be pulled by direct
# similarity search.
vectorstore1 = get_vectorstore(path1)
vectorstore2 = get_vectorstore(path2)

In [None]:
def Comparison_RAG3(query):
    '''
    Compare both PDFs on a query with a single LLM call.

    Context is retrieved from each document's vector store, both contexts are
    placed into one prompt by get_comparision_prompt, and llama3 is invoked
    once on that prompt.

    Args:
    - query (str): The question to compare the two documents on.

    Returns:
    - The content of the LLM response.
    '''
    # get_context runs a similarity search on a vector store, so it needs the
    # vector stores themselves rather than the retriever objects.
    context1 = get_context(query, vectorstore1)
    context2 = get_context(query, vectorstore2)

    prompt = get_comparision_prompt(query, context1, context2)

    return llama3.invoke(prompt).content

In [None]:
# AEBS: questions used to exercise Comparison_RAG3 on the two documents.
queries1 = ["Explain the test procedures in detail.","When collision early warning signal shall be sent?","What are warning indications, in details.","What AEBS should do in vehicle ignition?","The total speed reduction of the subject vehicle at the time of the collision with the stationary target shall be not less than how many kilometers per hour?"]

In [None]:
# Print each question, then render the single-call comparison as markdown.
for query in queries1:
    print("#",query)
    display(Markdown(Comparison_RAG3(query)))

### Some notes

In [None]:
#multivector embeddings have best performance for retrieval https://www.rungalileo.io/blog/mastering-rag-how-to-select-an-embedding-model

In [None]:
# Optional way for RAG2
# def get_context(query, retriever):
#     retrieved_docs = retriever.get_relevant_documents(query)
#     context = " ".join([doc.page_content for doc in retrieved_docs])
#     return context

# def RAG2(query):
#     context = get_context(query,retriever1)
#     qa_prompt = template2.format(question = query,context = context)
#     return llama3.invoke(qa_prompt).content

# Documentation

- llama3 could be taken from groq or ollama
- Possible reasons for latency in pdr-llama2-chromadb code
  - llama2 downloaded in local computer
  - slow parent document retriever
  - small cpu
- Giskard for evaluation
  - By default uses gpt4 but llama3 also could be connected
  - It extracts contexts from a PDF/URL and generates questions from them; the source context is then matched against the RAG response to evaluate the RAG pipeline.
- Use an image processing model (e.g., CLIP, a Vision Transformer, or a CNN) to convert images into embeddings or textual descriptions.

# Optional codes

### Section-wise chunking (ss)

In [None]:
# Initialize the Anthropic client once at module level; count_tokens below
# uses it for every token count.
client = Anthropic()

def count_tokens(text):
    '''
    Return how many tokens the given text contains, as reported by the
    module-level Anthropic client.

    Args:
    - text (str): Text to measure.

    Returns:
    - int: Token count for the text.
    '''
    token_total = client.count_tokens(text)
    return token_total

def extract_all_text(pdf_path):
    '''
    Read the text of every page of a PDF.

    The first page is extracted as-is. On every later page, lines containing
    'Official Journal of the European Union' are dropped before the remaining
    lines are re-joined with newlines.

    Args:
    - pdf_path (str): Path to the PDF file.

    Returns:
    - dict: Maps 1-based page numbers to the extracted text of that page.
    '''
    running_header = 'Official Journal of the European Union'
    pages_text = {}

    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            if page_no == 1:
                # First page: keep everything, using the explicit extraction options.
                pages_text[page_no] = page.extract_text(layout=False, strip=True, return_chars=True)
                continue

            kept_rows = [
                row
                for row in page.extract_text().split('\n')
                if running_header not in row
            ]
            pages_text[page_no] = '\n'.join(kept_rows)

    return pages_text

# Function to concatenate lines where the element before the period (.) is the same
def concat_lines_by_same_element(text):
    '''
    Group the lines of a text into blocks keyed by their leading number.

    A line whose stripped form starts with digits followed by a period
    (e.g. "3." or "3.1") opens a new block whenever those digits differ from
    the digits of the block currently being built. Every other line is
    appended to the current block, separated by a single space. Text before
    the first numbered line forms its own block.

    Args:
    - text (str): Newline-separated text to process.

    Returns:
    - list: The blocks, each stripped of surrounding whitespace.
    '''
    leading_number = re.compile(r'^(\d+)\.')
    blocks = []
    buffer = ""
    active_number = None

    for raw_line in text.split('\n'):
        hit = leading_number.match(raw_line.strip())

        # Unnumbered lines and lines sharing the active number extend the
        # block currently being built.
        if hit is None or hit.group(1) == active_number:
            buffer += " " + raw_line
            continue

        # A different leading number: flush what has been collected so far
        # (any preamble before the first number is flushed only if non-empty).
        if active_number is not None or buffer:
            blocks.append(buffer.strip())
        active_number = hit.group(1)
        buffer = raw_line

    if buffer:
        blocks.append(buffer.strip())

    return blocks

def concatenate_blocks(blocks, max_tokens=2000, token_counter=None):
    '''
    Greedily merge consecutive text blocks into chunks that stay within a token budget.

    Blocks are added to the current chunk while the running token total stays
    at or below max_tokens; the block that would exceed the budget starts a
    new chunk. A single block larger than max_tokens becomes a chunk on its
    own (it is not split).

    Args:
    - blocks (list): List of text blocks to concatenate.
    - max_tokens (int): Maximum number of tokens allowed per concatenated block (default is 2000).
    - token_counter (callable | None): Function mapping a text block to its
      token count. Defaults to count_tokens.

    Returns:
    - list: Chunks, each made of the merged blocks joined with single spaces.
    '''
    if token_counter is None:
        token_counter = count_tokens

    concatenated_blocks = []
    temp_block = []
    total_value = 0

    for ele in blocks:
        value = token_counter(ele)
        if total_value + value <= max_tokens:
            temp_block.append(ele)
            total_value += value
            continue

        # Flush only when something has been collected; otherwise an oversized
        # first block would emit a spurious empty chunk.
        if temp_block:
            print(f"Total tokens for current block: {total_value}")
            concatenated_blocks.append(' '.join(temp_block))
        temp_block = [ele]
        total_value = value

    # Add the last block if it's not empty
    if temp_block:
        concatenated_blocks.append(' '.join(temp_block))

    return concatenated_blocks

# %%
# Extract the per-page text of the first PDF (dict of page number -> text).
pdf_path = path1
all_text = extract_all_text(pdf_path)

# %%
# Join the pages, in dict order, into one newline-separated string.
full_text = '\n'.join(all_text.values())
# Encode some misc unicode characters
# (a UTF-8 encode/decode round trip; valid text comes back unchanged)
full_text = full_text.encode('utf-8').decode()
# Group the lines into blocks by their leading section number.
blocks = concat_lines_by_same_element(full_text)  # Assuming all_text is a dictionary with page numbers
print(blocks)
# Merge the blocks into chunks within the default token budget.
blocks_1 = concatenate_blocks(blocks)

### Section-wise chunking (smb)

In [None]:
import pdfplumber
import re

def read_pdf_pagewise(file_path):
    '''
    Collect the lines of a PDF under the section heading they follow.

    The first two pages are skipped. On each remaining page the first line
    and the last two lines are discarded (presumably running header/footer —
    confirm against the PDF layout); pages with three lines or fewer
    contribute nothing. A line starting with "<digit> <capital letter>" or
    "Annex <capital letter>" opens a section keyed by that matched prefix;
    the heading line and all following lines are stored under that key until
    the next heading. Lines before the first heading are dropped.

    Args:
    - file_path (str): Path to the PDF file.

    Returns:
    - dict: Maps each section key to the list of lines collected for it.
    '''
    heading_re = re.compile(r'(\d [A-Z]|Annex [A-Z])')
    sections = {}
    active_heading = None

    with pdfplumber.open(file_path) as pdf:
        all_pages = pdf.pages
        num_pages = len(all_pages)
        print(f'Total pages: {num_pages}')

        for page in all_pages[2:]:
            page_text = page.extract_text()
            if not page_text:
                continue

            raw_lines = page_text.split('\n')
            body_lines = raw_lines[1:-2] if len(raw_lines) > 3 else []

            for line in body_lines:
                found = heading_re.match(line)
                if found:
                    active_heading = found.group(1)
                    sections.setdefault(active_heading, [])

                if active_heading:
                    sections[active_heading].append(line)

    return sections
#     for section, content in sections.items():
#         print(f"--- {section} ---")
#         print('\n'.join(content))
#         print("\n\n")