## Import libraries

In [2]:
%load_ext autoreload
%autoreload 2
import os
from pprint import pprint
from IPython.display import Markdown
import requests

In [3]:
from dotenv import load_dotenv

load_dotenv()  # Load environment variables from .env file

True

## Get the ID list of the last three sessions

In [4]:
url = 'https://data.europarl.europa.eu/api/v2/plenary-session-documents'

params = {
    'work-type': 'CRE_PLENARY',
    'limit': 25,
    'offset':40,
    'format':"application/ld+json"
}
response = requests.get(url, params=params).json()
response

{'data': [{'id': 'eli/dl/doc/CRE-10-2025-03-31',
   'type': 'Work',
   'work_type': 'def/ep-document-types/CRE_PLENARY',
   'identifier': 'CRE-10-2025-03-31',
   'label': 'P10_CRE(2025)03-31'},
  {'id': 'eli/dl/doc/CRE-10-2025-04-01',
   'type': 'Work',
   'work_type': 'def/ep-document-types/CRE_PLENARY',
   'identifier': 'CRE-10-2025-04-01',
   'label': 'P10_CRE(2025)04-01'},
  {'id': 'eli/dl/doc/CRE-10-2025-04-02',
   'type': 'Work',
   'work_type': 'def/ep-document-types/CRE_PLENARY',
   'identifier': 'CRE-10-2025-04-02',
   'label': 'P10_CRE(2025)04-02'},
  {'id': 'eli/dl/doc/CRE-10-2025-04-03',
   'type': 'Work',
   'work_type': 'def/ep-document-types/CRE_PLENARY',
   'identifier': 'CRE-10-2025-04-03',
   'label': 'P10_CRE(2025)04-03'},
  {'id': 'eli/dl/doc/CRE-10-2025-05-05',
   'type': 'Work',
   'work_type': 'def/ep-document-types/CRE_PLENARY',
   'identifier': 'CRE-10-2025-05-05',
   'label': 'P10_CRE(2025)05-05'},
  {'id': 'eli/dl/doc/CRE-10-2025-05-06',
   'type': 'Work',
  

In [5]:
response['data'][-1]['identifier']

'CRE-9-2020-01-16'

In [6]:
id_list = []
for i in range(len(response['data'])):
    id_list.append(response['data'][i]['identifier'])

print(id_list)

['CRE-10-2025-03-31', 'CRE-10-2025-04-01', 'CRE-10-2025-04-02', 'CRE-10-2025-04-03', 'CRE-10-2025-05-05', 'CRE-10-2025-05-06', 'CRE-10-2025-05-07', 'CRE-10-2025-05-08', 'CRE-10-2025-05-21', 'CRE-10-2025-05-22', 'CRE-10-2025-06-16', 'CRE-10-2025-06-17', 'CRE-10-2025-06-18', 'CRE-10-2025-06-19', 'CRE-10-2025-07-07', 'CRE-10-2025-07-08', 'CRE-10-2025-07-09', 'CRE-10-2025-07-10', 'CRE-10-2025-09-08', 'CRE-10-2025-09-09', 'CRE-10-2025-09-10', 'CRE-10-2025-09-11', 'CRE-9-2019-10-23', 'CRE-9-2019-10-24', 'CRE-9-2020-01-16']


## Extract the PDF or XML with the help of the identifier.

In [7]:
url_ep = f'https://www.europarl.europa.eu/doceo/document/{id_list[-3]+"_EN"}.pdf'

In [8]:
for id in id_list:
    url_ep = f'https://www.europarl.europa.eu/doceo/document/{id+"_EN"}.pdf'
    resp = requests.get(url_ep)
    with open(f'EP/{id}_EN.pdf', 'wb') as f:
        f.write(resp.content)

In [9]:
url_ep

'https://www.europarl.europa.eu/doceo/document/CRE-9-2020-01-16_EN.pdf'

## Store all the documents in a chroma vector store

In [10]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")

E0000 00:00:1759240613.318100  243198 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.
E0000 00:00:1759240613.322498  243198 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [11]:
from langchain_community.document_loaders import PyPDFLoader
file_path = f'data/{id_list[0]}.pdf'
loader = PyPDFLoader('metadata.pdf')
pages = []
for page in loader.lazy_load():
    pages.append(page)

In [12]:
pages[0].page_content

'2024-2029\nПЪЛЕН ПРОТОКОЛ НА РАЗИСКВАНИЯТА DEBAŠU STENOGRAMMA\nACTA LITERAL DE LOS DEBATES POSĖDŽIO STENOGRAMA\nDOSLOVNÝ ZÁZNAM ZE ZASEDÁNÍ AZ ÜLÉSEK SZÓ SZERINTI JEGYZŐKÖNYVE\nFULDSTÆNDIGT FORHANDLINGSREFERAT RAPPORTI VERBATIM TAD-DIBATTITI\nAUSFÜHRLICHE SITZUNGSBERICHTE VOLLEDIG VERSLAG VAN DE VERGADERINGEN\nISTUNGI STENOGRAMM PEŁNE SPRAWOZDANIE Z OBRAD\nΠΛΗΡΗ ΠΡΑΚΤΙΚΑ ΤΩΝ ΣΥΖΗΤΗΣΕΩΝ RELATO INTEGRAL DOS DEBATES\nVERBATIM REPORT OF PROCEEDINGS STENOGRAMA DEZBATERILOR\nCOMPTE RENDU IN EXTENSO DES DÉBATS DOSLOVNÝ ZÁPIS Z ROZPRÁV\nTUARASCÁIL FOCAL AR FHOCAL NA N-IMEACHTAÍ DOBESEDNI ZAPISI RAZPRAV\nDOSLOVNO IZVJEŠĆE SANATARKAT ISTUNTOSELOSTUKSET\nRESOCONTO INTEGRALE DELLE DISCUSSIONI FULLSTÄNDIGT FÖRHANDLINGSREFERAT\nЧетвъртък - jueves - Čtvrtek - torsdag - Donnerstag - neljapäev - Πέμπτη - Thursday\njeudi - Déardaoin - četvrtak - giovedì - ceturtdiena - Ketvirtadienis - csütörtök - Il-Ħamis\ndonderdag - czwartek - Quinta-feira - joi - Štvrtok - Četrtek - torstai - torsdag\n10.07.2025\nЕ

In [13]:
# Import the necessary libraries
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain import hub
import getpass
import os

In [14]:
vector_store_c = Chroma(
    collection_name="EU_parliament",
    embedding_function=embeddings,
    persist_directory="./chroma_ep_follower",
)

In [15]:
session_date = id_list[0][-10:]
session_year = int(session_date[:4])
session_month = int(session_date[5:7])
session_day = int(session_date[-2:])
print(session_day, session_month, session_year, session_date)

31 3 2025 2025-03-31


In [16]:
def embed_and_store_fancy(file_path, vector_store, session_date):
    """Load a PDF file, split it into chunks, and store the chunks in a vector store.
    Session_date is added to the metadata of each chunk."""


    # Load the PDF file as a single document
    loader = PyPDFLoader(file_path, mode='single')
    pdf_text = loader.load()

    # Create a text splitter instance
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2_000,  # chunk size (characters)
        chunk_overlap=400,  # chunk overlap (characters)
        add_start_index=True,  # track index in original document
    )

    # Split the document into smaller chunks
    all_splits = text_splitter.transform_documents(pdf_text)

    session_year = int(session_date[:4])
    session_month = int(session_date[5:7])
    session_day = int(session_date[-2:])

    # Add the session_date to the metadata
    for split in all_splits:
        split.metadata['session_day'] = session_day
        split.metadata['session_month'] = session_month
        split.metadata['session_year'] = session_year

    # Add the chunks to the vector store
    document_ids = vector_store.add_documents(documents=all_splits)

    return document_ids

In [17]:
id_list

['CRE-10-2025-03-31',
 'CRE-10-2025-04-01',
 'CRE-10-2025-04-02',
 'CRE-10-2025-04-03',
 'CRE-10-2025-05-05',
 'CRE-10-2025-05-06',
 'CRE-10-2025-05-07',
 'CRE-10-2025-05-08',
 'CRE-10-2025-05-21',
 'CRE-10-2025-05-22',
 'CRE-10-2025-06-16',
 'CRE-10-2025-06-17',
 'CRE-10-2025-06-18',
 'CRE-10-2025-06-19',
 'CRE-10-2025-07-07',
 'CRE-10-2025-07-08',
 'CRE-10-2025-07-09',
 'CRE-10-2025-07-10',
 'CRE-10-2025-09-08',
 'CRE-10-2025-09-09',
 'CRE-10-2025-09-10',
 'CRE-10-2025-09-11',
 'CRE-9-2019-10-23',
 'CRE-9-2019-10-24',
 'CRE-9-2020-01-16']

### Idea: Extract the dates from the ID, and insert filtering here, retaining only interesting sessions in the db

In [18]:
for id in id_list:
    sess_date = id[-10:]
    file_path = f'EP/{id}_EN.pdf'
    embed_and_store_fancy(file_path, vector_store_c, sess_date)

# embed_and_store_fancy('EP/CRE-10-2025-03-31_EN.pdf', vector_store_c, session_date="2025-03-31")

## Use a LLM to query our chroma vector store

In [19]:
import getpass
import os

if not os.environ.get("GOOGLE_API_KEY"):
  os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter API key for Google Gemini: ")

from langchain.chat_models import init_chat_model

model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

E0000 00:00:1759241212.749692  243198 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [27]:
def answer_filtered(query, vector_store, llm, prompt_template=None, session_year = 2025, session_month = None):
    """Answer a query using the vector store and the language model."""
    # Retrieve similar documents from the vector store
    if session_month:
        filter = {"$and":[{"session_year": {"$eq": session_year}},
                  {"session_month": {"$eq": session_month}}]
                }
        retrieved_docs = vector_store.similarity_search(query, k=6, filter=filter)
    else:
        filter = {"session_year": session_year}
        retrieved_docs = vector_store.similarity_search(query, k=6, filter=filter)

    # Create the prompt
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)

    # If no prompt template is provided, use the default one
    if not prompt_template:
        prompt_template = hub.pull("rlm/rag-prompt")

    prompt = prompt_template.invoke(
        {"context": docs_content, "question": query}
    )

    # Get the answer from the language model
    answer = llm.invoke(prompt)

    return answer.content

In [31]:
query = "Summarize the discussion on water quality"
answer_filtered(query, vector_store_c, model)

'The discussion emphasizes the importance of water resilience, addressing both quantity and quality. Participants highlighted the need to protect water quality from pollution, including "forever chemicals" like PFAS. There\'s also a call for stronger enforcement against pollution and updating standards to reflect current realities.'