In [77]:
from langchain_community.document_loaders import PyMuPDFLoader
import pandas as pd

In [78]:

set_1="Data/PDF Documents/Set1"
set_2="Data/PDF Documents/Set2"

In [79]:
import os


In [80]:
print("Loading documents from Set1")
documents=[]
for filename in os.listdir(set_1):
    if filename.endswith(".pdf"):
        file_path=os.path.join(set_1,filename)
        loader=PyMuPDFLoader(file_path)
        documents.extend(loader.load())
print("No of documents loaded from set1:",len(documents))

Loading documents from Set1
No of documents loaded from set1: 420


In [81]:

print("Loading pdfs from set 2")
for filename in os.listdir(set_2):
    if filename.endswith(".pdf"):
        file_path=os.path.join(set_2,filename)
        loader_set2=PyMuPDFLoader(file_path)
        documents.extend(loader_set2.load())
print(f"No of documents loaded : {len(documents)}")

Loading pdfs from set 2
MuPDF error: syntax error: syntax error in array

MuPDF error: syntax error: syntax error in content stream

MuPDF error: syntax error: syntax error in array

MuPDF error: syntax error: syntax error in content stream

MuPDF error: syntax error: syntax error in array

MuPDF error: syntax error: syntax error in content stream

MuPDF error: syntax error: syntax error in array

MuPDF error: syntax error: syntax error in content stream

No of documents loaded : 2511


In [82]:
chunk_size=1000
chunk_overlap=200

In [83]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time

In [84]:
text_splitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)

In [85]:
def measure_time(func,*args, **kwargs):
    start_time=time.time()
    results=func(*args,**kwargs)
    end_time=time.time()
    return results,end_time-start_time

In [86]:
chunks,chunking_time=measure_time(text_splitter.split_documents,documents)

In [87]:
print(f"Total {len(chunks)} chunks created in time : {chunking_time:.2f} seconds")

Total 13289 chunks created in time : 1.33 seconds


In [88]:
#creating embeddings and store in chroma db
from pyexpat import model
from langchain_community.embeddings import HuggingFaceEmbeddings
print("Creating embeddings and building vectore store")
embedding_model_name="sentence-transformers/all-MiniLM-L6-v2"
embeddings=HuggingFaceEmbeddings(model_name=embedding_model_name)

Creating embeddings and building vectore store


In [89]:
from langchain_community.vectorstores import Chroma
db,db_load_time=measure_time(Chroma.from_documents,chunks,embeddings)

In [37]:
print(f"Vector database created in {db_load_time:.2f} seconds")

Vector database created in 460.28 seconds


In [38]:
#Loading LLM via HuggingFacePipeline
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline


print("Loading LLM model")
model_name="google/flan-t5-base"
llm_pipeline=pipeline(
    "text2text-generation",
        model=model_name,
        max_length=512,
        device=0, # Use device=0 for GPU, -1 for CPU
        temperature=0.7,
        do_sample=True,
        top_p=0.95,
)
llm=HuggingFacePipeline(pipeline=llm_pipeline)

Loading LLM model


Device set to use cpu


In [39]:
#Create the Retrieval QA chain
from langchain.chains import RetrievalQA
retriever=db.as_retriever(search_kwargs={"k": 4})
qa_chain=RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

In [40]:
#Starting RAG



label_df_set1 = pd.read_csv('Data/Labelled Data/Labelled_Data_Set1.csv')
label_df_set2 = pd.read_excel('Data/Labelled Data/Labelled_Data_Set2.xlsx',sheet_name='Misc.')
all_labelled=pd.concat([label_df_set1, label_df_set2], ignore_index=True)


In [41]:
queries_df_set1= pd.read_excel('Data/Queries/Queries.xlsx',sheet_name='Queries_Set1')
queries_df_set2= pd.read_excel('Data/Queries/Queries.xlsx',sheet_name='Queries_Set2')
all_queries_df = pd.concat([queries_df_set1, queries_df_set2], ignore_index=True)


In [42]:
all_labelled.head(10)

Unnamed: 0,S. No.,Ctegories,Manual Name,Queries,Answers,Product ID,Category,Page no.
0,1.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,What is the use of level command kit?,The purpose of the level command kit is to pro...,,,
1,2.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,What to do if I already have an air suspension...,"If you already have an air suspension system, ...",,,
2,3.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,When should I avoid installing airline tubing?,If you are installing an air suspension system...,,,
3,4.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,How can I connect the airline tubing?,To connect the air line tubing to the fittings...,,,
4,5.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,Give the list of parts available in the level ...,Following is the list of parts available in th...,,,
5,6.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,Give the list of parts available in the level ...,Below is the list of parts with the number of ...,,,
6,7.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,Give me the instructions to install level comm...,Below are the steps to install level command –...,,,
7,8.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,What are the tools required to install?,"Tools required: \n 1. 3/16"" drill bit \n 2. 3/...",,,
8,9.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,What to do after selecting a mounting location...,After selecting a mounting location for the co...,,,
9,10.0,Air Compressor,Black&Decker_AirCompresssor_Nil1_Editable,How can I prepare the compressor to mount?,To prepare the compressor install the rubber i...,,,


In [76]:
query="What is the gaurantee for the 4700 HP air compressor?"
#queries=all_queries_df["Queries"].to_list()
#uery=queries[0]
#print(queries)
result=qa_chain.invoke(query)
print(f"Generated answer is : {result['result']}")
docs=result['source_documents']
# for doc in docs:
#     print(doc.metadata.get('source',''))
#     print("--------")
# #print(result['source_documents'])
labelled_row=all_labelled.loc[all_labelled["Queries"].str.strip()==query.strip()]
ground_truth_manual=labelled_row['Manual Name'].iloc[0]
ground_truth_manual=ground_truth_manual.replace('&','_')
if ground_truth_manual.strip() in docs[0].metadata.get('source','').strip():
    print("yesss")
else:
    print(ground_truth_manual.strip())
    print(docs[0].metadata.get('source','').strip())
    print("noo")

# print(labelled_row['Answers'].iloc[0])
# if "Black_Decker_AirCompresssor_Nil1_Editable" in "Data/PDF Documents/Set1/Black_Decker_AirCompresssor_Nil1_Editable.pdf":
#     print("yesss")
# else:
#     print("no")

Generated answer is : DWFP55120 Air Compressor E. Safety Valve F. Drain Valve G. Check Valve H. Air Outlet (1/4" NPTF) APPROX CUT-IN PRESSURE 105 psi APPROX. CUT-OUT PRESSURE 135 psi SCFM @ 90 PSI 1.8 A. On(I)/Off(O) Switch B. Tank Pressure Gauge C. Outlet Pressure Gauge D. Regulator DWFP55120 Air Compressor E. Safety Valve F. Drain Valve G. Check Valve H. Air Outlet (1/4" NPTF) APPROX CUT-IN PRESSURE 105 psi APPROX. CUT-OUT PRESSURE 135 psi SCFM @ 90 PSI 1.8 A. On(I)/Off(O) Switch B. Tank Pressure Gauge C. Outlet Pressure Gauge D. Regulator DWFP55120 Air Compressor E. Safety Valve F. Drain Valve G. Check Valve H. Air Outlet (1/4" NPTF) APPROX CUT-IN PRESSURE 105 psi APPROX. CUT-OUT PRESSURE 135 psi SCFM @ 90 PSI 1.8 A. On(I)/Off(O) Switch B. Tank Pressure Gauge C. Outlet Pressure Gauge D. Regulator DWFP55120 Air Compressor E. Safety Valve F. Drain Valve G. Check Valve H. Air Outlet (1/4" NPTF) APPROX CUT-IN PRESSURE 105 psi APPROX. CUT-OUT PRESSURE 135 psi SCFM @ 90 PSI 1.8 A. On(I)/O

In [73]:

def evaluate_rag(qa_chain,df_queries,all_labelled):
    results=[]
    print("Starting evaluation")
    i=0
    for index,row in df_queries.iterrows():
        query=row['Queries']
        i=i+1
        labelled_row=all_labelled.loc[all_labelled["Queries"].str.strip()==query.strip()]
        if not labelled_row.empty:
            ground_truth_answer=labelled_row['Answers'].iloc[0]
            ground_truth_manual=labelled_row['Manual Name'].iloc[0]
            ground_truth_manual=ground_truth_manual.replace('&','_')
        else:
            ground_truth_answer='N/A'
            ground_truth_manual='N/A'
        #ground_truth=row['Answers']
        #manual_name=row['Manual Name']

        start_time=time.time()
        rag_results=qa_chain.invoke(query)
        end_time=time.time()
        total_time = end_time-start_time

        generated_answer=rag_results['result']
        retrieved_docs=rag_results['source_documents']

        retrieved_correct_manual=False
        if ground_truth_manual.strip() in docs[0].metadata.get('source','').strip():
            retrieved_correct_manual=True
        results.append({
            "Query": query,
            "Ground Truth Answer": ground_truth_answer,
            "Generated Answer": generated_answer,
            "Total Time (s)": total_time,
            "Retrieved Correct Manual": retrieved_correct_manual,
            "Retrieved Documents": retrieved_docs
        })
        if i>50:
            break
    print("Evaluation complete.")
    return pd.DataFrame(results)



In [74]:
evaluation_results_df = evaluate_rag(qa_chain, queries_df_set1,all_labelled)

# Calculate and print key metrics
recall_at_k = evaluation_results_df['Retrieved Correct Manual'].mean()
avg_total_time = evaluation_results_df['Total Time (s)'].mean()

print("\n--- Overall Evaluation Metrics ---")
print(f"Overall Recall@k: {recall_at_k:.2f}")
print(f"Average Total Processing Time per Query: {avg_total_time:.2f} seconds")

print("\n--- Sample Queries for Manual Review ---")
for i in range(5):
    row = evaluation_results_df.iloc[i]
    print("-" * 50)
    print(f"Query: {row['Query']}")
    print(f"Ground Truth Answer: {row['Ground Truth Answer']}")
    print(f"Generated Answer: {row['Generated Answer']}")
    print(f"Retrieved Correct Manual: {row['Retrieved Correct Manual']}")
    print("Retrieved Sources:")
    for doc in row['Retrieved Documents']:
        print(f"  - Source: {doc.metadata.get('source')}, Page: {doc.metadata.get('page')}")

Starting evaluation
Evaluation complete.

--- Overall Evaluation Metrics ---
Overall Recall@k: 0.43
Average Total Processing Time per Query: 33.29 seconds

--- Sample Queries for Manual Review ---
--------------------------------------------------
Query:  What is the use of level command kit? 
Ground Truth Answer: The purpose of the level command kit is to provide inflation control of your
 air helper springs. This kit will be an asset to your vehicle, meeting nearly all of your air supply needs.
Generated Answer: Inflation control of your air helper springs
Retrieved Correct Manual: True
Retrieved Sources:
  - Source: Data/PDF Documents/Set1/Black_Decker_AirCompresssor_Nil1_Editable.pdf, Page: 0
  - Source: Data/PDF Documents/Set1/Black_Decker_AirCompresssor_Nil1_Editable.pdf, Page: 0
  - Source: Data/PDF Documents/Set1/Black_Decker_AirCompresssor_Nil1_Editable.pdf, Page: 0
  - Source: Data/PDF Documents/Set1/Black_Decker_AirCompresssor_Nil1_Editable.pdf, Page: 0
---------------------

In [63]:
%pwd

'/mnt/c/Users/sasidhar.chennup/Documents/Tiger-Training/NLP-GenAI-Classroom/NLP-Gen-AI-classroom/Assignment-3'

In [75]:
import os
evaluation_results_df.to_csv('evaluation_results.csv', index=False)