In [3]:
from langchain.document_loaders import TextLoader
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader

In [4]:
# Load and process documents
loader = PyPDFLoader("eu_ai_act.pdf")
documents = loader.load()

In [5]:
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
split_docs = splitter.split_documents(documents)

In [8]:
# Embed and index documents
embeddings = OllamaEmbeddings(model="llama3")

In [12]:
vectorstore = Chroma.from_documents(split_docs, embeddings)

In [11]:
# !pip install chromadb

In [40]:
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold",
                                     search_kwargs={"score_threshold": 0.75})

In [15]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="llama3",
    temperature=0,  # Adjust temperature for desired response variability
)


In [20]:
from langchain import PromptTemplate

In [21]:
# Prompt
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

In [41]:
qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=vectorstore.as_retriever(),
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )

In [42]:
query = "what's the main asepcts of the EU AI Act regarding data governance"
response = qa_chain.run(query)
print(response)

The EU AI Act does not explicitly address data governance as a main aspect, but rather focuses on ensuring high-risk AI systems comply with requirements established in Chapter 2. The regulation emphasizes the importance of confidentiality and sets out rules for information exchange (Title X).


In [34]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain import hub


In [35]:
prompt = hub.pull("rlm/rag-prompt")



In [43]:

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [44]:
rag_chain.invoke("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?")


  self.vectorstore.similarity_search_with_relevance_scores(
No relevant docs were retrieved using the relevance score threshold 0.75


"According to the EU AI Act, providers of high-risk AI systems must comply with more stringent transparency obligations than those for general-purpose AI systems. Specifically, they must provide clear information about the system's decision-making process and the data used to train it, as well as notify users if the system makes a mistake that could have significant consequences. This is in addition to the general obligation to provide transparent information about the AI system's functionality and limitations."

In [39]:
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_chain_with_source.invoke("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?")

{'context': [Document(metadata={'page': 75, 'source': 'eu_ai_act.pdf'}, page_content='Directive 2013/36/EU and for high -risk AI systems which are safety components of \ndevices, or are themselves devices, covered by Regulation (EU) 2017/745 and \nRegulation (EU) 2017/746, the notification of serious incidents or malfunctioning \nshall be limited to those that that constitute a breach of obligations under Union law \nintended to protect fundamental rights.'),
  Document(metadata={'page': 16, 'source': 'eu_ai_act.pdf'}, page_content='Title XII contains an obligation for the Commission to assess regularly the need for an update \nof Annex III and to prepare regular reports on the evaluation and review of the regulation. It \nalso lays d own final provisions, including a differentiated transitional period for the initial \ndate of the applicability of the regulation to facilitate the smooth implementation for all \nparties concerned.'),
  Document(metadata={'page': 27, 'source': 'eu_ai_ac

In [47]:
# Adding new documents to the index
from langchain_core.documents import Document

new_docs = [
    Document(page_content="Solar panels improve energy independence.",
             metadata={"source": "Report_C", "author": "EnergyWorld", "date": "2023-06-20"})
]

loader2 = PyPDFLoader("eu_ai_act.pdf")
documents2 = loader.load()

vectorstore.add_documents(documents2)
retriever = vectorstore.as_retriever()
query = "Explain the role of solar panels in sustainability."
print(retriever.get_relevant_documents(query))


[Document(metadata={'page': 16, 'source': 'eu_ai_act.pdf'}, page_content='Title XII contains an obligation for the Commission to assess regularly the need for an update \nof Annex III and to prepare regular reports on the evaluation and review of the regulation. It \nalso lays d own final provisions, including a differentiated transitional period for the initial \ndate of the applicability of the regulation to facilitate the smooth implementation for all \nparties concerned.'), Document(metadata={'page': 45, 'source': 'eu_ai_act.pdf'}, page_content='1. Irrespective of whether an AI system is placed on the market or put into service \nindependently from the products referred to in points (a) and (b), that AI system shall \nbe considered high-risk where both of the following conditions are fulfilled: \n(a) the AI system is i ntended to be used as a safety component of a product, or is \nitself a product, covered by the Union harmonisation legislation listed in Annex \nII;  \n(b) the prod

In [52]:
# Script for EU AI Act Analysis with Document Embedding and Retrieval
# ---------------------------------------------------

# Install required dependencies
# Ensure the following packages are installed via terminal or uncomment to install directly in the script:
# !pip install langchain-ollama langchain-community chromadb pypdf

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.ollama import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import PromptTemplate
from langchain_ollama import ChatOllama
from langchain_core.documents import Document
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain import hub

# Step 1: Load and Process the EU AI Act PDF
# Description: Load the PDF and prepare the text for processing.
pdf_path = "eu_ai_act.pdf"  # Replace with the actual file path
loader = PyPDFLoader(pdf_path)
documents = loader.load()

# Step 2: Split the Text into Manageable Chunks
# Description: Use RecursiveCharacterTextSplitter to chunk text into smaller sections for analysis.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
split_docs = splitter.split_documents(documents)
print(f"Number of chunks created: {len(split_docs)}")

# Step 3: Generate Embeddings with Ollama
# Description: Use the `OllamaEmbeddings` model to generate vector embeddings for the chunks.
embeddings = OllamaEmbeddings(model="llama3")

# Step 4: Store Embeddings in Chroma Vector Database
# Description: Index the document embeddings using Chroma for efficient retrieval.
vectorstore = Chroma.from_documents(split_docs, embeddings)
print("Embeddings stored in Chroma vector database.")

# Step 5: Setup Retriever
# Description: Configure the retriever for similarity-based search with a score threshold.
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold",
                                     search_kwargs={"score_threshold": 0.75})

retriever = vectorstore.as_retriever()


# Step 6: Initialize LLM with ChatOllama
# Description: Configure the language model with desired response variability.
llm = ChatOllama(
    model="llama3",
    temperature=0  # Lower temperature for deterministic responses
)

# Step 9: Advanced Retrieval with RAG Chain
# Description: Configure a Retrieval-Augmented Generation (RAG) chain for improved retrieval and LLM processing.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | hub.pull("rlm/rag-prompt")
    | llm
    | StrOutputParser()
)

# Example Query with RAG Chain
rag_response = rag_chain.invoke("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?")

print("\n--- RAG Chain Response ---")
print(rag_response)

#Adjusting the RAG to retrive sources
from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | hub.pull("rlm/rag-prompt")
    | llm
    | StrOutputParser()
)

rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

rag_response = rag_chain_with_source.invoke("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?")


print("\n--- RAG Chain Response with Sources ---")
print(rag_response)


Number of chunks created: 497
Embeddings stored in Chroma vector database.





--- RAG Chain Response ---
According to the EU AI Act, providers of high-risk AI systems must comply with transparency obligations related to serious incidents or malfunctions that constitute a breach of obligations under Union law intended to protect fundamental rights. This differs from general-purpose AI systems, which do not have these specific transparency requirements.





--- RAG Chain Response with Sources ---
{'context': [Document(metadata={'page': 75, 'source': 'eu_ai_act.pdf'}, page_content='Directive 2013/36/EU and for high -risk AI systems which are safety components of \ndevices, or are themselves devices, covered by Regulation (EU) 2017/745 and \nRegulation (EU) 2017/746, the notification of serious incidents or malfunctioning \nshall be limited to those that that constitute a breach of obligations under Union law \nintended to protect fundamental rights.'), Document(metadata={'page': 75, 'source': 'eu_ai_act.pdf'}, page_content='Directive 2013/36/EU and for high -risk AI systems which are safety components of \ndevices, or are themselves devices, covered by Regulation (EU) 2017/745 and \nRegulation (EU) 2017/746, the notification of serious incidents or malfunctioning \nshall be limited to those that that constitute a breach of obligations under Union law \nintended to protect fundamental rights.'), Document(metadata={'page': 75, 'source': 'eu

In [None]:
# Step 10: Adding New Documents to the Index
# Description: Dynamically add new documents to the existing vectorstore.
new_docs = [
    Document(page_content="Solar panels improve energy independence.",
             metadata={"source": "Report_C", "author": "EnergyWorld", "date": "2023-06-20"})
]

vectorstore.add_documents(new_docs)
retriever = vectorstore.as_retriever()

# Query with Updated Vectorstore
updated_query = "Explain the role of solar panels in sustainability."
relevant_docs = retriever.get_relevant_documents(updated_query)
print("\n--- Updated Query Results ---")
for doc in relevant_docs:
    print(doc.page_content)
