# Scraping: récupération des PDFs des bulletins AVB

## Imports

In [8]:
import os
import re
import time
import sys
import requests

## Créer un répertoire pour stocker tous les fichiers PDF

In [13]:
pdf_path = '../data/pdf'
# Créer le dossier s'il n'existe pas
if not os.path.exists(pdf_path):
    os.makedirs(pdf_path)

## Récupérer les URLs des pdf de tous les bulletins communaux

https://archives.bruxelles.be/bulletins/date

In [10]:
root_url = "https://archives.bruxelles.be/bulletins/date"
resp = requests.get(root_url)
print(f"Status: {resp.status_code}")
print(f"Encoding: {resp.encoding}")
html = resp.text
print(f"Text length: {len(html)}")

pattern = r"https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/.*\.pdf"
urls = re.findall(pattern, html)
print(f"{len(urls)} PDF files found")

Status: 200
Encoding: utf-8
Text length: 821446
2833 PDF files found


In [11]:
# Impression des 10 premières URLs
print(urls[:10])

['https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_1.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_2.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_3.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_4.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1847_Tome_I1_Part_5.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1848_Tome_I1_Part_1.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1848_Tome_I1_Part_2.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1848_Tome_I1_Part_3.pdf', 'https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1849_Tome_I1_Part_1.pdf', 'https://archief.brussel.be/Colossus/Bulletin

## Télécharger tous les PDFs

In [5]:
start_offset = 0
end_offset = len(urls)
for url in urls[start_offset:end_offset]:
    filename = url.split("/")[-1]
    if not os.path.exists(os.path.join(pdf_path, filename)):
        print(f"Downloading {filename}...")
        start_time = time.time()
        response = requests.get(url)
        print(f"   done in {(time.time() - start_time):.1f} seconds")
        output_file = open(os.path.join(pdf_path, filename), "wb")
        output_file.write(response.content)
    else:
        print(f"{filename} already downloaded")
print("Done")

Bxl_1847_Tome_I1_Part_1.pdf already downloaded
Bxl_1847_Tome_I1_Part_2.pdf already downloaded
Bxl_1847_Tome_I1_Part_3.pdf already downloaded
Bxl_1847_Tome_I1_Part_4.pdf already downloaded
Bxl_1847_Tome_I1_Part_5.pdf already downloaded
Bxl_1848_Tome_I1_Part_1.pdf already downloaded
Bxl_1848_Tome_I1_Part_2.pdf already downloaded
Bxl_1848_Tome_I1_Part_3.pdf already downloaded
Bxl_1849_Tome_I1_Part_1.pdf already downloaded
Bxl_1849_Tome_I1_Part_2.pdf already downloaded
Bxl_1849_Tome_I1_Part_3.pdf already downloaded
Bxl_1849_Tome_I1_Part_4.pdf already downloaded
Bxl_1849_Tome_I1_Part_5.pdf already downloaded
Bxl_1849_Tome_II1_Part_1.pdf already downloaded
Bxl_1849_Tome_II1_Part_2.pdf already downloaded
Bxl_1849_Tome_II1_Part_3.pdf already downloaded
Bxl_1849_Tome_II1_Part_4.pdf already downloaded
Bxl_1849_Tome_II1_Part_5.pdf already downloaded
Bxl_1849_Tome_II1_Part_6.pdf already downloaded
Bxl_1849_Tome_II1_Part_7.pdf already downloaded
Bxl_1850_Tome_I1_Part_1.pdf already downloaded
Bxl_18

ConnectionError: HTTPSConnectionPool(host='archief.brussel.be', port=443): Max retries exceeded with url: /Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1884_Tome_I2_Part_7.pdf (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f818407c400>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

## Vérifier que tous les PDFs ont été téléchargés


Si ce n'est pas le cas, vous pouvez relancer l'étape de téléchargement (elle ignorera les documents déjà téléchargés)

In [21]:
ok_count = 0
for url in urls:
    filename = url.split("/")[-1]
    downloads = os.listdir(pdf_path)
    if filename not in downloads:
        print(f"{filename} is missing!")
    else:
        ok_count += 1
print(f"{ok_count} PDFs found on {len(urls)}!")

Bxl_1866_Tome_II1_Part_5.pdf is missing!
Bxl_1866_Tome_II1_Part_6.pdf is missing!
Bxl_1866_Tome_II1_Part_7.pdf is missing!
Bxl_1867_Tome_I1_Part_1.pdf is missing!
Bxl_1867_Tome_I1_Part_2.pdf is missing!
Bxl_1867_Tome_I1_Part_3.pdf is missing!
Bxl_1867_Tome_I1_Part_4.pdf is missing!
Bxl_1867_Tome_II1_Part_1.pdf is missing!
Bxl_1867_Tome_II1_Part_2.pdf is missing!
Bxl_1867_Tome_II1_Part_3.pdf is missing!
Bxl_1867_Tome_II1_Part_4.pdf is missing!
Bxl_1867_Tome_II1_Part_5.pdf is missing!
Bxl_1867_Tome_II1_Part_6.pdf is missing!
Bxl_1868_Tome_I1_Part_1.pdf is missing!
Bxl_1868_Tome_I1_Part_2.pdf is missing!
Bxl_1868_Tome_I1_Part_3.pdf is missing!
Bxl_1868_Tome_II1_Part_1.pdf is missing!
Bxl_1868_Tome_II1_Part_2.pdf is missing!
Bxl_1868_Tome_II1_Part_3.pdf is missing!
Bxl_1868_Tome_II1_Part_4.pdf is missing!
Bxl_1868_Tome_II1_Part_5.pdf is missing!
Bxl_1868_Tome_II1_Part_6.pdf is missing!
Bxl_1869_Tome_I1_Part_1.pdf is missing!
Bxl_1869_Tome_I1_Part_2.pdf is missing!
Bxl_1869_Tome_I1_Part_3.p

## Pour en savoir plus

- Le web scraping avec Python: https://realpython.com/beautiful-soup-web-scraper-python/
- Tutoriel sur les expressions régulières: https://www.w3schools.com/python/python_regex.asp