In [1]:
from dotenv import load_dotenv
import os
import requests
from mistralai import Mistral
import numpy as np

load_dotenv()

True

In [2]:
MISTRAL_API_KEY = os.getenv("MISTRAL_API")

client = Mistral(api_key=MISTRAL_API_KEY)

## Document Parsing


In [None]:
import os, sys, subprocess
import fitz
from pathlib import Path

pdf_path = Path("data") / "ai_act.pdf"
assert pdf_path.exists(), f"PDF not found at {pdf_path}"

with fitz.open(pdf_path) as doc:
    pages_text = []
    for page in doc:
        pages_text.append(page.get_text("text"))
text = "\n\n".join(pages_text)

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", "", ".", "?", "!"],
)
chunks = splitter.split_text(text)

In [None]:
print(f"Total characters: {len(text):,}")
print(
    f"Total pages (approx): {'unknown' if 'pages_text' not in globals() else len(pages_text)}"
)
lengths = [len(c) for c in chunks]
print(
    f"Chunks: {len(chunks)} | avg: {int(np.mean(lengths)) if lengths else 0} | min: {min(lengths) if lengths else 0} | max: {max(lengths) if lengths else 0}"
)
print("Sample chunk 0:\n", chunks[0][:500] if chunks else "<no chunks>")

Total characters: 600,061
Total pages (approx): 144
Chunks: 639 | avg: 1040 | min: 171 | max: 1199
Sample chunk 0:
 REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL
of 13 June 2024
laying down harmonised rules on artificial intelligence and amending Regulations (EC) No 300/2008, 
(EU) No 167/2013, (EU) No 168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and 
Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)
(Text with EEA relevance)
THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,
Having regard to the Treaty on the Functioning 


In [None]:
# Sample the first 100 chunks for inspection
sample_chunks = chunks[:50]
print(f"Sampling first {len(sample_chunks)} chunks out of {len(chunks)} total chunks")
print(
    f"Sample chunk lengths: min={min(len(c) for c in sample_chunks)}, max={max(len(c) for c in sample_chunks)}, avg={int(np.mean([len(c) for c in sample_chunks]))}"
)


Sampling first 50 chunks out of 639 total chunks
Sample chunk lengths: min=398, max=1196, avg=1077


In [12]:
sample_chunks

['REGULATION (EU) 2024/1689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL\nof 13 June 2024\nlaying down harmonised rules on artificial intelligence and amending Regulations (EC) No 300/2008, \n(EU) No 167/2013, (EU) No 168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and \nDirectives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)\n(Text with EEA relevance)\nTHE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION,\nHaving regard to the Treaty on the Functioning of the European Union, and in particular Articles 16 and 114 thereof,\nHaving regard to the proposal from the European Commission,\nAfter transmission of the draft legislative act to the national parliaments,\nHaving regard to the opinion of the European Economic and Social Committee (1),\nHaving regard to the opinion of the European Central Bank (2),\nHaving regard to the opinion of the Committee of the Regions (3),\nActing in accordance with the ordinary legislative procedure (4),\nWh

## Data Ingestion


In [None]:
def get_text_embedding(input):
    embeddings_batch_response = client.embeddings.create(
        model="mistral-embed", inputs=input
    )
    return embeddings_batch_response.data[0].embedding


text_embeddings = np.array([get_text_embedding(chunk) for chunk in sample_chunks])

In [16]:
import faiss

# Build vector search index for semantic similarity
embedding_dimension = text_embeddings.shape[1]
similarity_index = faiss.IndexFlatL2(embedding_dimension)
similarity_index.add(text_embeddings)

## Retrieval Step


In [None]:
# Query: find most relevant document chunks
user_question = "What is biometric categorisation?"
question_embedding = np.array([get_text_embedding(user_question)])

# Search for the 2 most similar chunks using L2 distance
distances, chunk_indices = similarity_index.search(question_embedding, k=3)

# Retrieve the actual text chunks that are most relevant to the question
most_relevant_chunks = [chunks[idx] for idx in chunk_indices[0]]

## Generation


In [28]:
prompt = f""" You are an AI Assistant specialised in Conformity & Regulation. You are given a question and a context. You need to answer the question based on the context.
Context information is below.
---------------------
{most_relevant_chunks}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {user_question}
Answer:
"""

In [None]:
def run_mistral(user_message, model="mistral-large-latest"):
    messages = [{"role": "user", "content": user_message}]
    chat_response = client.chat.complete(model=model, messages=messages)
    return chat_response.choices[0].message.content


answer = run_mistral(prompt)

'Based on the provided context, **biometric categorisation** is defined as the process of assigning natural persons to specific categories based on their biometric data. These categories can include aspects such as:\n\n- Sex\n- Age\n- Hair colour\n- Eye colour\n- Tattoos\n- Behavioural or personality traits\n- Language\n- Religion\n- Membership of a national minority\n- Sexual or political orientation\n\nHowever, the context specifies that this definition does **not** include biometric categorisation systems that are purely ancillary features intrinsically linked to another commercial service, where the feature cannot be used independently of the principal service. Examples of such ancillary features include filters on online marketplaces or social networks that allow users to preview products or modify images, as these are tied to the primary service (e.g., purchasing a product or sharing content).\n\nAdditionally, the context highlights that certain biometric categorisation systems, 

## Full Pipeline


## Evaluation


In [38]:
# Build chunk documents with page-aware metadata for citations
import sys, subprocess
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

pdf_file = Path("data/ai_act.pdf")
loader = PyPDFLoader(str(pdf_file))
page_docs = loader.load()

# Stamp a stable citation string on each page
documents_with_source = []
for d in page_docs:
    d.metadata["source"] = f"{pdf_file.name}#page={d.metadata.get('page', 'NA')}"
    documents_with_source.append(d)

splitter_docs = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", "", ".", "?", "!"],
)
chunk_docs = splitter_docs.split_documents(documents_with_source)

# Create a sample of the first 50 chunks
sample_chunk_docs = chunk_docs[:50]

doc_texts = [d.page_content for d in sample_chunk_docs]
doc_metas = [d.metadata for d in sample_chunk_docs]
doc_sources = [m.get("source", "unknown") for m in doc_metas]

len(sample_chunk_docs), doc_metas[0] if doc_metas else {}

(50,
 {'producer': 'PDFlib+PDI 9.0.7p3 (C++/Win64)',
  'creator': 'Servigistics Arbortext Publishing Engine',
  'creationdate': '2024-07-11T14:47:17+02:00',
  'author': 'Publications Office of the European Union L-2985 Luxembourg LUXEMBOURG',
  'keywords': 'ISSN 1977-0677',
  'moddate': '2024-07-11T15:55:28+02:00',
  'subject': 'I Legislative acts',
  'title': 'Regulation (EU) 2024/1689 of the European Parliament and of the Council of 13 June 2024 laying down harmonised rules on artificial intelligence and amending Regulations (EC) No 300/2008, (EU) No 167/2013, (EU) No 168/2013, (EU) 2018/858, (EU) 2018/1139 and (EU) 2019/2144 and Directives 2014/90/EU, (EU) 2016/797 and (EU) 2020/1828 (Artificial Intelligence Act)Text with EEA relevance.',
  'source': 'ai_act.pdf#page=0',
  'total_pages': 144,
  'page': 0,
  'page_label': '1'})

In [None]:
def get_text_embedding(input):
    embeddings_batch_response = client.embeddings.create(
        model="mistral-embed", inputs=input
    )
    return embeddings_batch_response.data[0].embedding


# Embed each chunk's text content (not the document object itself)
text_embeddings = np.array(
    [get_text_embedding(chunk.page_content) for chunk in sample_chunk_docs]
)

In [40]:
import faiss

# Build vector search index for semantic similarity
embedding_dimension = text_embeddings.shape[1]
similarity_index = faiss.IndexFlatL2(embedding_dimension)
similarity_index.add(text_embeddings)

In [None]:
# Retrieval helper that returns texts + sources
from typing import List, Tuple


def retrieve(query: str, k: int = 4) -> Tuple[List[str], List[str]]:
    q_emb = np.array([get_text_embedding(user_question)])
    distances, idxs = similarity_index.search(q_emb.astype(np.float32), k)
    idxs = idxs[0]
    texts = [doc_texts[i] for i in idxs]
    sources = [doc_sources[i] for i in idxs]
    return texts, sources


retrieve("What is biometric categorisation?", k=3)

(['(16) The notion of ‘biometr ic catego r isation’ refer red to in this Regulation should be def ined as assigning natural persons \nt o specif ic cate gor ies on the basis of their biometr ic data. Such specif ic catego r ies can relate to aspects suc h as sex, \nag e, hair colour , e ye colour , tattoos, behavio ural or personality traits, languag e, religion, membership of a national \nminor ity , sexual or political or ientation. This does not include biometr ic catego r isation systems that are a purely \nancillar y f eature intr insically linked to another commercial ser vice, meaning that the f eature cannot, f or objective \nt echnical reasons, be used without the pr incipal ser vice, and the inte gration of that feature or functionality is not \na means to circum vent the applicability of the r ules of this Regulation. For example, filters cate gor ising f acial or body \nf eatures used on online marke tplaces could constitute suc h an ancillar y f eature as they can be used 

In [None]:
# Generation with inline context and a Sources footer


def answer_with_citations(question: str, model="mistral-large-latest", k: int = 4):
    contexts, sources = retrieve(question, k=k)
    context_block = "\n\n---\n\n".join(contexts)
    prompt = f"""
You are an AI Assistant specialised in Conformity & Regulation.
Answer the user's question concisely using ONLY the provided context. If unsure, say you don't know.

Context:
---------------------
{context_block}
---------------------
Question: {question}
Answer with a short paragraph. Then provide a 'Sources' section listing the citations.
"""
    messages = [{"role": "user", "content": prompt}]
    chat_response = client.chat.complete(model=model, messages=messages)
    answer = chat_response.choices[0].message.content

    # Deduplicate and sort sources for readability
    unique_sources = []
    for s in sources:
        if s not in unique_sources:
            unique_sources.append(s)
    sources_footer = "\n".join(f"- {s}" for s in unique_sources)

    return f"{answer}\n\nSources:\n{sources_footer}"


print(answer_with_citations("What is biometric categorisation?", k=3))

Biometric categorisation refers to the process of assigning natural persons to specific categories based on their biometric data. These categories can include aspects such as sex, age, hair colour, eye colour, tattoos, behavioural or personality traits, language, religion, membership of a national minority, or sexual or political orientation. However, this does not include biometric categorisation systems that are purely ancillary features intrinsically linked to another commercial service, where the feature cannot be used independently for technical reasons.

**Sources:**
- Context (16)

Sources:
- ai_act.pdf#page=4
- ai_act.pdf#page=3
