In [1]:
import os
import PyPDF2
import pandas as pd
from pathlib import Path
import numpy as np
from docx import Document


### User Variables

In [3]:
"""
USER: Specify these variables
"""

# Folder that is scanned for the input documents.
folder_path = "tweede_pdfs2024"
# Value written to the "year" column of every row.
file_year = 2024
# Base name of the CSV outputs; ".csv" is appended when the frames are saved.
csv_output_name = "tweede_2024"

"""
OPTION: Specify if you would like to reduce the dataset to a specific speech type. 
See the possible attributes here: https://opendata.tweedekamer.nl/documentatie/document
The code will search the first 10 lines of each document (header + a bit extra) and drop those that do not contain this string.
"""
# Speech type label; it is appended to csv_output_name to name the filtered CSV.
speech_type = "tweeminutendebat"

### Functions

In [2]:
def list_files(folder_path):
    """Return the names of the regular files directly inside ``folder_path``.

    Sub-directories are skipped and the listing is not recursive. The names
    are returned in sorted order: ``os.listdir`` gives no ordering guarantee,
    so sorting keeps the row order of everything built from this list
    reproducible across machines and runs.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        Bare file names (not full paths), sorted ascending.
    """
    return sorted(
        entry
        for entry in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, entry))
    )

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Each page that yields text contributes that text followed by a newline.
    Extraction is best-effort: any exception is reported on stdout, together
    with the path of the offending file, and whatever text was collected
    before the failure is returned. That is an empty string when the file
    cannot be opened or parsed at all.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        Concatenated page texts, or "" if nothing could be extracted.
    """
    page_texts = []
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text + "\n")
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort a long batch
        # run. Naming the file makes the failure traceable afterwards.
        print(f"An error occurred while reading {pdf_path}: {e}")
    return "".join(page_texts)

def extract_text_from_docx(docx_path):
    """Extract the text of every non-empty paragraph of a .docx file.

    Each paragraph with text contributes that text followed by a newline.
    Extraction is best-effort: any exception is reported on stdout, together
    with the path of the offending file, and whatever text was collected
    before the failure is returned. That is an empty string when the document
    cannot be opened at all.

    Parameters
    ----------
    docx_path : str or path-like
        Location of the file to read.

    Returns
    -------
    str
        Concatenated paragraph texts, or "" if nothing could be extracted.
    """
    paragraph_texts = []
    try:
        doc = Document(docx_path)
        for para in doc.paragraphs:
            if para.text:
                paragraph_texts.append(para.text + "\n")
    except Exception as e:
        # Deliberately broad: one unreadable file must not abort the batch.
        # Naming the file makes the failure traceable afterwards.
        print(f"An error occurred while reading {docx_path}: {e}")
    return "".join(paragraph_texts)

def get_first_ten_lines(text, n=10):
    """Return the first ``n`` lines of ``text`` (10 by default) as one string.

    Lines are split with ``str.splitlines`` and re-joined with a single
    newline character, so other line endings (for example CRLF) are
    normalised and a trailing newline is not kept.

    Parameters
    ----------
    text : str
        Text to truncate.
    n : int, optional
        Number of leading lines to keep. Values of zero or less give "".

    Returns
    -------
    str
        The first ``n`` lines, or all of ``text``'s lines if it has fewer.
    """
    if n <= 0:
        # A negative slice bound would silently keep "all but the last |n|"
        # lines, which is never what the caller means here.
        return ""
    return "\n".join(text.splitlines()[:n])



### Code

In [4]:
# Names of all regular files found in the input folder
file_names = list_files(folder_path)

# Empty frame that will hold one row per document:
# its file name, the configured year, and the leading lines of its text.
speech_columns = ["filename", "year", "text"]
speech_info = pd.DataFrame(columns=speech_columns)

In [5]:
# Fill the df with info from the documents
#
# This cell can take 15 minutes or more to run and prints one
# 'EOF marker not found' error per unreadable file (227 for the 2024 data) -
# which is fine. The documents that generate these errors are .docx files
# mislabeled as PDFs.

# Collect one record per file and build the DataFrame once at the end.
# Calling pd.concat inside the loop re-copies the whole frame on every
# iteration, and appending to the existing frame duplicated every row
# whenever this cell was run a second time.
records = []
for name in file_names:
    temp_path = os.path.join(folder_path, name)

    extracted_text = extract_text_from_pdf(temp_path)
    text_ten = get_first_ten_lines(extracted_text)

    records.append({
        "filename": name,
        "year": file_year,
        "text": text_ten})

# Passing the columns explicitly keeps the expected layout even when the
# folder is empty and there are no records.
speech_info = pd.DataFrame(records, columns=["filename", "year", "text"])

# Display the DataFrame
speech_info

An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found
An error occurred: EOF marker not found


Unnamed: 0,filename,year,text
0,000299a9-e3b9-4997-9a25-4e2b00e61dca.pdf,2024,26 \nStemming motie Democr atisc he zeg- \ng...
1,00035a28-9a80-448e-86c3-9886aa73a9b8.pdf,2024,8 \nUitvoeringsw et digitaledienst enveror- \n...
2,003d76b1-e231-4b4c-b3c2-abbb15c2e1dc.pdf,2024,6 \nRegeling van werkzaamheden (stem- \nming...
3,005e351c-9a1d-416c-ba59-35460a136de4.pdf,2024,15 \nStemming en moties Vreemdeling en- \nen ...
4,006ea61f-8c52-4d91-87c5-ef5da4411a4d.pdf,2024,17 \nStemming Verdere behandeling van \naanh...
...,...,...,...
1443,feb6b77d-74af-4d50-8a5f-f58f73a6b7a7.pdf,2024,7 \nStemming en moties Raad Buitenlandse \nZ...
1444,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,
1445,ff5979bb-7db1-4c1f-8d01-e5127f1d7528.pdf,2024,11 \nBelastingdienst \nVoorzit ter: Van Vroonh...
1446,ff887799-5328-420f-a0f7-9c0f6fca0951.pdf,2024,24 \nStemming en moties Wet leeruitkom- \nste...


In [9]:
# Blank or whitespace-only text counts as missing: turn it into NaN
speech_info['text'] = speech_info['text'].replace(r'^\s*$', np.nan, regex=True)

# Rows without text are the files that failed PDF extraction; split them off
# into their own frame and keep only the rows that do have text.
text_is_missing = speech_info['text'].isnull()
print("nulls in speech df before: ", text_is_missing.sum())
print()

df_none = speech_info[text_is_missing].copy()
speech_info = speech_info.dropna(subset=['text'])

null_count_speech = speech_info['text'].isnull().sum()
null_count_docx = df_none['text'].isnull().sum()
print("nulls in speech df after: ", null_count_speech)
print("nulls in .docx df after: ", null_count_docx)
print()
df_none

nulls in speech df before:  227

nulls in speech df after:  0
nulls in .docx df after:  227



Unnamed: 0,filename,year,text
16,01aeb7a7-eef8-42a9-87fd-a088518abd3a.docx,2024,
20,03014aa5-ab10-4625-a168-57a66c4b479f.docx,2024,
24,04151a06-f5b4-440e-a7dc-a06c1a10114a.docx,2024,
25,041c51b3-eba0-4688-8498-534ce53a088b.docx,2024,
34,05fcde88-4beb-4168-9f6c-08097fba4d34.docx,2024,
...,...,...,...
1421,fb376dc5-23b8-4cfd-b019-7f53bdd06b24.docx,2024,
1424,fbbdb407-23c8-4d04-a3b2-b3368b81457c.docx,2024,
1429,fc3083e6-de8c-41be-a12b-331971fd1708.docx,2024,
1444,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,


In [None]:
# Change the file names in df_none to .docx.
# Only a trailing ".pdf" extension is swapped; a ".pdf" elsewhere in the name
# is left alone, and names that already end in ".docx" are not touched.
needs_docx_name = df_none['text'].isnull() & df_none['filename'].str.endswith('.pdf')
df_none.loc[needs_docx_name, 'filename'] = (
    df_none.loc[needs_docx_name, 'filename'].str[:-len('.pdf')] + '.docx')


# list the file names
files_to_change = df_none['filename'].tolist()

# build file paths (old and new file names) and rename the docx files
# NOTE(review): every file without extractable text is treated as a mislabeled
# .docx; a PDF with no text layer would be renamed too - confirm this is intended.
for file_name in files_to_change:

    base_name = os.path.splitext(file_name)[0]
    old_file_name = base_name + '.pdf'

    new_path = os.path.join(folder_path, file_name)
    old_path = os.path.join(folder_path, old_file_name)

    # Rename only when there is still a .pdf to rename and the target is free.
    # This makes the cell safe to re-run: on a second run the files already
    # carry the .docx name and os.rename would raise FileNotFoundError.
    if os.path.exists(old_path) and not os.path.exists(new_path):
        os.rename(old_path, new_path)

df_none

In [12]:
# Create the list of docx file names and paths
docx_names = df_none["filename"].tolist()

# Get the info on the docx files, incl. their first ten lines, and add them
# back to the speech df as new rows. The records are collected first and
# concatenated once, because pd.concat inside the loop re-copies the whole
# frame on every iteration.
docx_records = []
for name in docx_names:
    temp_path = os.path.join(folder_path, name)

    extracted_text = extract_text_from_docx(temp_path)
    text_ten = get_first_ten_lines(extracted_text)

    docx_records.append({
        "filename": name,
        "year": file_year,
        "text": text_ten})

if docx_records:
    docx_rows = pd.DataFrame(docx_records, columns=["filename", "year", "text"])
    # Drop rows added for these files by an earlier run of this cell, so that
    # re-running it does not append the same documents twice.
    already_added = speech_info["filename"].isin(docx_names)
    speech_info = pd.concat([speech_info[~already_added], docx_rows], ignore_index=True)

print("nulls in speech df w docx info: ", speech_info['text'].isnull().sum())
speech_info.tail(10)

nulls in speech df w docx info:  0


Unnamed: 0,filename,year,text
1438,f4c216cf-e27d-4859-a0ba-8379733f6c01.docx,2024,"\nTweede Kamer, Wetsvoorstel afschaffen gronds..."
1439,f54b797d-ef74-4052-86e6-e5d20b5a2caa.docx,2024,"Stikstof, NPLG en natuur\nStikstof, NPLG en na..."
1440,f77123d0-8453-46fb-ba79-26a2a46290bd.docx,2024,Dakloosheid\nDakloosheid\n\nAan de orde is het...
1441,f7e6bea8-db93-46cc-836f-fdd4fce221b4.docx,2024,Regeling van werkzaamheden\nDe voorzitter:\nIk...
1442,fa9dd2bb-2b8c-433d-89da-0db24a3fc180.docx,2024,Begroting Sociale Zaken en Werkgelegenheid 202...
1443,fb376dc5-23b8-4cfd-b019-7f53bdd06b24.docx,2024,Carbon Capture & Storage (CCS)\nCarbon Capture...
1444,fbbdb407-23c8-4d04-a3b2-b3368b81457c.docx,2024,Exportkredietverzekeringen\nExportkredietverze...
1445,fc3083e6-de8c-41be-a12b-331971fd1708.docx,2024,"\nTweede Kamer, Mensenrechtenbeleid\n\nVERSLAG..."
1446,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,Regeling van werkzaamheden (stemmingen)\nRegel...
1447,ffd70df6-f24c-458b-bdcf-ce3a88c4b3e3.docx,2024,Raad Algemene Zaken d.d. 17 december 2024 en E...


In [19]:
# Write the complete frame to "<csv_output_name>.csv", without the index column
full_csv_file = f"{csv_output_name}.csv"
speech_info.to_csv(full_csv_file, index=False)

# OPTION: Reduce dataset to just the specified speech type

In [15]:
# Drop rows that do not contain the designated speech type (the speech_type
# variable from the user-variables cell) in the first 10 lines of the document.
# speech_type is used instead of a hard-coded copy of the string, so changing
# the setting actually changes the filter. regex=False makes the match a plain
# substring test, so characters such as "(" or "." in the setting are taken
# literally. Rows with missing text never match (na=False).
speech_info = speech_info[speech_info['text'].str.contains(speech_type, regex=False, na=False)]
print(speech_info.shape)
speech_info

(247, 3)


Unnamed: 0,filename,year,text
7,00b786eb-afed-48d1-a170-fd385ec7942b.pdf,2024,"6 \nLandbouw , klimaat en voedsel \nLandbouw ..."
9,00e8a26a-ee73-427d-9a0c-09d3f58c8662.pdf,2024,4 \nNationaal Plan Ener giesyst eem \nNationa...
22,04318c33-4aee-4780-965a-703f179cbc09.pdf,2024,6 \nPersonen- en familier echt \nPersonen- e...
24,04d68bb3-e423-4f57-83ca-4322259c58e8.pdf,2024,27 \nGevangenisw ezen en tbs \nGevangeniswez e...
26,04f304bd-484b-4002-92b5-88a8fb96e09e.pdf,2024,6 \nWoningbouw opgave en koopsect or \nWoningb...
...,...,...,...
1439,f54b797d-ef74-4052-86e6-e5d20b5a2caa.docx,2024,"Stikstof, NPLG en natuur\nStikstof, NPLG en na..."
1443,fb376dc5-23b8-4cfd-b019-7f53bdd06b24.docx,2024,Carbon Capture & Storage (CCS)\nCarbon Capture...
1444,fbbdb407-23c8-4d04-a3b2-b3368b81457c.docx,2024,Exportkredietverzekeringen\nExportkredietverze...
1446,fedb0619-f944-4a0f-8ab5-3c7245837955.docx,2024,Regeling van werkzaamheden (stemmingen)\nRegel...


In [18]:
# Write the filtered frame to "<csv_output_name><speech_type>.csv", without the index column
temp_name = f"{csv_output_name}{speech_type}"
speech_info.to_csv(f"{temp_name}.csv", index=False)