In [9]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
# Extract text from the Pdf file
def load_pdf_files(pdf_path):
    loader = DirectoryLoader(pdf_path, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    # documents = loader.lazy_load()
    return documents

In [11]:
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader

def load_pdf_files(pdf_path: str, 
                   glob_pattern: str = "**/*.pdf",
                   loader_kwargs: dict = None,
                   show_progress: bool = False,
                   silent_errors: bool = False) -> list:
    
    if loader_kwargs is None:
        loader_kwargs = {}

    # Use DirectoryLoader to scan the directory and use PyPDFLoader for each PDF
    loader = DirectoryLoader(
        pdf_path,
        glob=glob_pattern,
        loader_cls=lambda path: PyPDFLoader(path, **loader_kwargs),
        show_progress=show_progress,
        silent_errors=silent_errors
    )
    documents = loader.load()
    return documents

In [12]:
extracted_data = load_pdf_files("./industrial-safety-pdfs")

In [13]:
len(extracted_data)

337

In [14]:
extracted_data[56].page_content

'(53) The harmonised standards relevant to this Regulation should take into account the requirements of Directive (EU) \n2019/882 of the European Parliament and of the Council (  16 ) and the United Nations Convention on the Rights \nof Persons with Disabilities (  17 ). \n(54) The list of products in Annex IV of Directive 2006/42/EC has until now been based on the risk emanating from \nthe intended use or any reasonably foreseeable misuse of those products or their critical protective function. \nNevertheless, the machinery field embraces new ways of designing and constructing machinery or related \nproducts that might present higher risk factors, regardless of such intended use or any reasonably foreseeable \nmisuse. For example, systems with self-evolving behaviour ensuring safety functions should be included in \nAnnex I due to their characteristics such as data dependency, opacity, autonomy and connectivity, which \nmight considerably increase the probability and severity of harm 

In [15]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    ''' Given a list of Document objects, return a new list of document objects 
    containing only 'source' in the metadata and the original page_content'''
    
    minimal_docs: List[Document] = []
    
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

In [16]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [17]:
# Split the text into chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [18]:
texts_chunk = text_split(minimal_docs)
len(texts_chunk)

1809

In [19]:
# generate the vector embeddings using HuggingFaceEmbeddings
from langchain_huggingface import HuggingFaceEmbeddings

def download_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2',
        model_kwargs={'device': 'cpu'}
    )
    return embeddings

embeddings = download_embeddings()

In [20]:
embeddings

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [None]:
vector = embeddings.embed_query("What's our Q1 revenue?")
vector

In [22]:
print(len(vector))

384


In [None]:
from langchain_chroma import Chroma

vector_store = Chroma(
    collection_name="rag-collection",
    embedding_function=embeddings,  
    persist_directory="./chroma_db"
)

# Add Documents
vector_store.add_documents(documents=texts_chunk)

In [24]:
# Create the retriever
# Base retriever (before reranking)

retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

In [25]:
# retrieved_docs = retriever.invoke("What are RIA ?")
retrieved_docs = retriever.invoke("When did ANSI/RIA R15.06 published ?")

In [26]:
retrieved_docs

[Document(id='665b1be8-b4f6-46b1-8aee-7d7d863d670f', metadata={'source': 'industrial-safety-pdfs\\12-Franklin--RIA-IEEE_2019-05-19_v2.pdf'}, page_content='ANSI/RIA R15.06-2012: Table of Contents'),
 Document(id='2117b2e9-1516-4ce1-ae55-065b33ac7949', metadata={'source': 'industrial-safety-pdfs\\12-Franklin--RIA-IEEE_2019-05-19_v2.pdf'}, page_content='ANSI/RIA R15.06-2012: Table of Contents'),
 Document(id='0f2710fb-ab98-4d43-9a11-c3af33acdd8a', metadata={'source': 'industrial-safety-pdfs\\12-Franklin--RIA-IEEE_2019-05-19_v2.pdf'}, page_content='About RIA’s Robot Standards:  History\nYear Standard Document\n1986 ANSI/RIA R15.06-1986\n1992 ANSI/RIA R15.06-1992\n1999 ANSI/RIA R15.06-1999\n~2000 ISO 10218 begun, based on R15.06-1999\n2006 ISO 10218-1:2006\n2007 ANSI/RIA ISO 10218-1:2017 & RIA TR to enable its use\n2011 ISO 10218-1,2:2011\n2012 ANSI/RIA R15.06-2012 (U.S. national adoption of 10218-1,2:2011)\n2014 RIA TRs 306, 406, 506\n2016 ISO/TS15066:2016; RIA TR R15.606-2016 (U.S. nation

## Cohere reranking

In [27]:
from dotenv import load_dotenv
import os
load_dotenv()

cohere_api_key = os.getenv("COHERE_API_KEY")
if cohere_api_key is None:
    raise ValueError("COHERE_API_KEY is not set in environment")

In [28]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere.rerank import CohereRerank

# --- New: Set up the reranker / compression layer ---

# Initialize the Cohere reranker (you must specify model)
compressor = CohereRerank(model="rerank-english-v3.0", cohere_api_key=cohere_api_key)  # adjust model name as per your API

# Wrap your base retriever with contextual compression / reranking
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor
)

# Now compression_retriever.invoke(query) will:
#   1. fetch top-k docs via base_retriever
#   2. rerank them via CohereRerank
#   3. return compressed / reranked set of docs

## GOOGLE MODEL

In [29]:
from langchain_google_genai import ChatGoogleGenerativeAI
from dotenv import load_dotenv
import os

load_dotenv()  # this loads the .env file

api_key = os.getenv("GOOGLE_API_KEY")
if api_key is None:
    raise ValueError("GOOGLE_API_KEY not set in environment")

# Create the model instance
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash", 
    temperature=0.0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

## RAG Chain with LCEL

In [30]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
from langchain_core.prompts import ChatPromptTemplate


# Alternative: Custom prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", """You are an assistant for question-answering tasks. 
    Use the following pieces of retrieved context to answer the question. 
    If you don't know the answer, just say that you don't know. 
    Use three sentences maximum and keep the answer concise.
    
    Context: {context}"""),
    ("human", "{question}")
])

def format_docs(docs):
    """Format retrieved documents for the prompt"""
    return "\n\n".join(doc.page_content for doc in docs)

In [31]:
parallel_chain = RunnableParallel({
    'context': compression_retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

# Build RAG chain using LCEL
rag_chain = parallel_chain | prompt| llm | StrOutputParser()

In [32]:
parallel_chain.invoke('When did ANSI/RIA R15.06 published')

{'context': 'About RIA’s Robot Standards:  History\nYear Standard Document\n1986 ANSI/RIA R15.06-1986\n1992 ANSI/RIA R15.06-1992\n1999 ANSI/RIA R15.06-1999\n~2000 ISO 10218 begun, based on R15.06-1999\n2006 ISO 10218-1:2006\n2007 ANSI/RIA ISO 10218-1:2017 & RIA TR to enable its use\n2011 ISO 10218-1,2:2011\n2012 ANSI/RIA R15.06-2012 (U.S. national adoption of 10218-1,2:2011)\n2014 RIA TRs 306, 406, 506\n2016 ISO/TS15066:2016; RIA TR R15.606-2016 (U.S. national adoption). TR 306 update.\n2017 ISO/TR 20218-2:2017\n\nAbout RIA’s Robot Standards:  History\nYear Standard Document\n1986 ANSI/RIA R15.06-1986\n1992 ANSI/RIA R15.06-1992\n1999 ANSI/RIA R15.06-1999\n~2000 ISO 10218 begun, based on R15.06-1999\n2006 ISO 10218-1:2006\n2007 ANSI/RIA ISO 10218-1:2017 & RIA TR to enable its use\n2011 ISO 10218-1,2:2011\n2012 ANSI/RIA R15.06-2012 (U.S. national adoption of 10218-1,2:2011)\n2014 RIA TRs 306, 406, 506\n2016 ISO/TS15066:2016; RIA TR R15.606-2016 (U.S. national adoption). TR 306 update.\n2

In [33]:
# Test the chain
response = rag_chain.invoke("When did ANSI/RIA R15.06 published ?")
print(response)

The ANSI/RIA R15.06 standard has been published multiple times. It was first published in 1986 as ANSI/RIA R15.06-1986. Subsequent versions were released in 1992, 1999, and 2012.
