In [6]:
# !pip install langchain-community pypdf

In [7]:
!pip install -qU langchain-ollama

In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

In [9]:
# 1. Load the EU AI Act PDF
loader = PyPDFLoader("eu_ai_act.pdf")
raw_docs = loader.load()

# 2. Split the loaded documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    separators=["\n\n","."],
    add_start_index=True
)
docs = text_splitter.split_documents(raw_docs)

print(f"Number of chunks: {len(docs)}")


Number of chunks: 270


In [10]:
# 3. Use OllamaEmbeddings
# Note: Ensure Ollama is running: `ollama serve`
# and you have a model like `llama3` available locally.
embeddings = OllamaEmbeddings(model="llama3")

  embeddings = OllamaEmbeddings(model="llama3")


In [14]:
!pip install faiss-cpu



In [21]:
!pip install numpy==1.26.4

Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp39-cp39-macosx_10_9_x86_64.whl (20.6 MB)
Installing collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [22]:
vector_1 = embeddings.embed_query(docs[0].page_content)
vector_2 = embeddings.embed_query(docs[1].page_content)

assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
print(vector_1[:10])

Generated vectors of length 4096

[-1.6067274808883667, 0.021475689485669136, 0.6372528076171875, 0.8379046320915222, -1.8185207843780518, -1.669992446899414, -6.153677940368652, -0.22479216754436493, -4.192078113555908, -0.2850227653980255]


In [24]:
!pip install -qU langchain-community

In [26]:
from langchain_core.vectorstores import InMemoryVectorStore

vector_store = InMemoryVectorStore(embeddings)

In [28]:
ids = vector_store.add_documents(documents=docs)

In [33]:
results = vector_store.similarity_search(
    "According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?"
)

print(results[0])

page_content='. Where the compliance of the AI systems with 
the requirements set out in Chapter 2 of this Title has been demonstrated following 
that conformity assessm ent, the providers shall draw up an EU declaration of 
conformity in accordance with Article 48 and affix the CE marking of conformity in 
accordance with Article 49.  
2. For high-risk AI systems referred to in point 5(b) of Annex III that are placed on the 
market or put into service by providers that are credit institutions regulated by 
Directive 2013/36/EU, the conformity assessment  shall be carried out as part of the 
procedure referred to in Articles 97 to101 of that Directive. 
Article 20 
Automatically generated logs 
1. Providers of high -risk AI systems shall keep the logs automatically generated by 
their high-risk AI systems, to the extent such logs are under their control by virtue of 
a contractual arrangement with the user or otherwise by law. The logs s hall be kept 
for a period that is appropriate i

In [32]:
results = await vector_store.asimilarity_search("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems comply with, and how do these differ from obligations for general-purpose AI systems?")

print(results[0])

page_content='. That guida nce shall be issued 12 months after the entry into force of 
this Regulation, at the latest. 
3. For high-risk AI systems referred to in point 5(b) of Annex III which are placed on 
the market or put into service by providers that are credit institutions regu lated by 
Directive 2013/36/EU and for high -risk AI systems which are safety components of 
devices, or are themselves devices, covered by Regulation (EU) 2017/745 and 
Regulation (EU) 2017/746, the notification of serious incidents or malfunctioning 
shall be limited to those that that constitute a breach of obligations under Union law 
intended to protect fundamental rights.' metadata={'source': 'eu_ai_act.pdf', 'page': 75, 'start_index': 2430}


In [35]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    return vector_store.similarity_search(query, k=1)


retriever.batch(
    [
        "How many distribution centers does Nike have in the US?",
        "When was Nike incorporated?",
    ],
)

[[Document(id='6816a522-768a-4763-a69f-ba5ed3127113', metadata={'source': 'eu_ai_act.pdf', 'page': 85, 'start_index': 1014}, page_content='.” \nArticle 78 \nAmendment to Directive 2014/90/EU \nIn Article 8 of Directive 2014/90/EU, the following paragraph is added: \n“4. For Artificial Intelligence systems  which are safety components in the meaning of \nRegulation (EU) YYY/XX [on Artificial Intelligence]  of the European Parliament and of the \nCouncil*, when carrying out its activities pursuant to paragraph 1 and when adopting \ntechnical specifications and testing standards in accordance with paragraphs 2 and 3, the \nCommission shall take into account the requirements set out in Title III, Chapter 2 of that \nRegulation. \n__________ \n* Regulation (EU) YYY/XX [on Artificial Intelligence] (OJ …).”. \nArticle 79 \nAmendment to Directive (EU) 2016/797 \nIn Article 5 of Directive (EU) 2016/797, the following paragraph is added: \n“12. When adopting delegated acts pursuant to paragraph 

In [2]:
# Step-by-Step Explanation and Script for EU AI Act PDF Analysis

# Install required dependencies
# Run these commands in your terminal or uncomment to install directly via the script:
# !pip install -qU langchain-ollama langchain-community pypdf faiss-cpu numpy

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.runnables import chain
from typing import List
from langchain_core.documents import Document

# Step 1: Load the EU AI Act PDF
# Use PyPDFLoader to load the PDF into raw document objects.
pdf_path = "eu_ai_act.pdf"  # Replace with the path to your PDF file
loader = PyPDFLoader(pdf_path)
raw_docs = loader.load()

# Step 2: Split the loaded documents into manageable chunks
# Use RecursiveCharacterTextSplitter to divide the content into smaller sections for efficient processing.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
    separators=["\n\n", "."],
    add_start_index=True
)
docs = text_splitter.split_documents(raw_docs)
print(f"Number of chunks created from PDF: {len(docs)}")

# Step 3: Use OllamaEmbeddings
# Note: Ensure Ollama is running (`ollama serve`) and a model like `llama3` is available locally.
# Generate embeddings using the Ollama embedding model.
embeddings = OllamaEmbeddings(model="llama3")

# Step 4: Initialize InMemoryVectorStore
# Store and manage the embeddings in an in-memory vector store.
vector_store = InMemoryVectorStore(embeddings)
ids = vector_store.add_documents(documents=docs)
print(f"Documents added to vector store. Total IDs: {len(ids)}")

# Step 5: Perform Similarity Search
# Query the vector store to find the most relevant chunk(s) based on the given question.
query = ("According to the EU AI Act, what transparency obligations must providers of high-risk AI systems "
         "comply with, and how do these differ from obligations for general-purpose AI systems?")
results = vector_store.similarity_search(query, k=1)  # Adjust `k` for more results

print("\n--- Top Matching Document ---")
print(results[0].page_content if results else "No relevant document found.")

# Step 6: Define a retriever function for batch queries
# Process multiple queries in one call to the retriever.
@chain
def retriever(query: str) -> List[Document]:
    return vector_store.similarity_search(query, k=1)

# Batch query example
batch_queries = [
    "What are the transparency obligations in the EU AI Act?",
    "How does the EU AI Act define high-risk AI systems?"
]
batch_results = retriever.batch(batch_queries)
print("\n--- Batch Query Results ---")
for idx, result in enumerate(batch_results):
    print(f"Query {idx + 1}: {batch_queries[idx]}")
    print(f"Answer: {result[0].page_content if result else 'No relevant document found.'}")
    print("-----------")


Number of chunks created from PDF: 270


  embeddings = OllamaEmbeddings(model="llama3")


Documents added to vector store. Total IDs: 270

--- Top Matching Document ---
. That guida nce shall be issued 12 months after the entry into force of 
this Regulation, at the latest. 
3. For high-risk AI systems referred to in point 5(b) of Annex III which are placed on 
the market or put into service by providers that are credit institutions regu lated by 
Directive 2013/36/EU and for high -risk AI systems which are safety components of 
devices, or are themselves devices, covered by Regulation (EU) 2017/745 and 
Regulation (EU) 2017/746, the notification of serious incidents or malfunctioning 
shall be limited to those that that constitute a breach of obligations under Union law 
intended to protect fundamental rights.

--- Batch Query Results ---
Query 1: What are the transparency obligations in the EU AI Act?
Answer: . That guida nce shall be issued 12 months after the entry into force of 
this Regulation, at the latest. 
3. For high-risk AI systems referred to in point 5(b) of A