In [63]:
import os
from bs4 import BeautifulSoup as bs
import requests
import json
import re
import datetime
import random
import time
from urllib.parse import urljoin

In [51]:
## URL template for the resume search.
## The two empty placeholders are filled with the start and end date;
## the page size is fixed at 200 hits per page.

SEARCH_URL_FORMAT = (
    "https://www.ft.dk/da/dokumenter/dokumentlister/referater"
    "?startDate={}&endDate={}&pageSize=200"
)

In [52]:
## Set dates

startdate = "20200101" ## format YYYYMMDD
enddate = "20220130" ## format YYYYMMDD

# Fail fast on a malformed or reversed date range: a typo here would
# otherwise only show up later as an empty or misleading result list.
for _label, _value in (("startdate", startdate), ("enddate", enddate)):
    if len(_value) != 8 or not _value.isdigit():
        raise ValueError("{} must be 8 digits (YYYYMMDD), got {!r}".format(_label, _value))
    # strptime rejects impossible calendar dates such as 20200231.
    datetime.datetime.strptime(_value, "%Y%m%d")

# YYYYMMDD strings sort chronologically, so plain string comparison is enough.
if startdate > enddate:
    raise ValueError("startdate {} is after enddate {}".format(startdate, enddate))

In [53]:
## Build the search URL for the chosen dates, plus a template
## that appends a page number to that same URL.

search_url = SEARCH_URL_FORMAT.format(startdate, enddate)
PAGE_URL_FORMAT = "".join((search_url, "&pageNumber={}"))

In [70]:
## Results

# Fetch the first page of search results. `soup` is reused by the
# following cells, so the name is kept.
r = requests.get(search_url, timeout = 30)  # without a timeout a stalled server blocks the kernel forever
r.raise_for_status()  # stop here instead of parsing an HTTP error page
soup = bs(r.text, "html.parser")

# The hit counter is looked up by its CSS class; report its absence
# explicitly rather than failing with an AttributeError on None.
results_tag = soup.find("span", class_ = "results")
if results_tag is None:
    print("No result counter found on the search page")
else:
    print(results_tag.get_text(strip = True))

302resultater


In [71]:
## Number of pages

# The text of the last <li> in the pagination list is read as the total
# number of result pages.
pagination = soup.find("ul", class_ = "pagination pagination-centered text-center")
page_items = pagination.find_all("li") if pagination is not None else []

if page_items:
    more_pages = True
    no_pages = int(page_items[-1].get_text())
    pagenumbers = list(range(2, no_pages + 1)) # Excluding first page as it corresponds to search_url
else:
    # No pagination list found. NOTE(review): presumably this means all
    # hits fit on a single page - confirm against the site.
    more_pages = False
    no_pages = None
    pagenumbers = []

In [72]:
## Get resume links

def _extract_resume_links(page_soup):
    """Return the href of the first <a> in every "Mødedato, -tid og samling" cell.

    Cells that contain no link, or a link without an href, are skipped.
    """
    hrefs = []
    for cell in page_soup.find_all("td", attrs = {"data-title": "Mødedato, -tid og samling"}):
        anchor = cell.find('a')
        if anchor is not None and anchor.get('href'):
            hrefs.append(anchor['href'])
    return hrefs


# First page: `soup` is the parsed response of the initial search request.
resume_links = _extract_resume_links(soup)

if more_pages:
    for pagenumber in pagenumbers:
        # Random pause before each request to keep the load on the server low.
        time.sleep(random.uniform(0.5, 1.5))
        page_url = PAGE_URL_FORMAT.format(str(pagenumber))
        page_response = requests.get(page_url, timeout = 30)
        page_response.raise_for_status()  # a failed page would otherwise silently contribute zero links
        # Parsed into its own name: overwriting `soup` would make a re-run
        # of this cell read the "first page" links from the last page.
        page_soup = bs(page_response.text, "html.parser")

        resume_links.extend(_extract_resume_links(page_soup))

# Resolve the collected hrefs against the search URL to get absolute URLs.
resume_links = [urljoin(search_url, resume_link) for resume_link in resume_links]

In [73]:
# Number of resume URLs collected above.
len(resume_links)

302

In [74]:
# Show the first collected URL as a sample.
resume_links[0]

'https://www.ft.dk/forhandlinger/20211/20211M053_2022-01-28_1000.htm'

In [94]:
## Resume scraper

# Class filter for agenda paragraphs, compiled once instead of on every call.
# BeautifulSoup applies a compiled pattern to each CSS class with search().
_AGENDA_CLASS_RE = re.compile('Dagsorden.*')


def _first_paragraph_text(soup, css_class):
    """Return the text of the first <p> carrying `css_class`, or "" if there is none."""
    tag = soup.find("p", class_ = css_class)
    return tag.get_text() if tag is not None else ""


def resume_scraper(url, timeout = 30):
    """Download one resume page and return its main parts as a dict.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before requests gives up. Without a
        timeout a stalled connection would block the whole scrape.

    Returns
    -------
    dict
        'url'      - the requested url
        'title'    - text of the first <p class="Titel">, "" if missing
        'subtitle' - text of the first <p class="UnderTitel">, "" if missing
        'agenda'   - newline-joined text of all <p> tags whose class matches
                     the "Dagsorden" pattern, "" if there are none
        'text'     - the full text of the page

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    r = requests.get(url, timeout = timeout)
    # Decode with the encoding detected from the content rather than the
    # one declared in the response headers.
    r.encoding = r.apparent_encoding
    # NOTE(review): HTTP error pages are parsed like any other page;
    # consider checking r.status_code if such records must be excluded.
    soup = bs(r.text, "html.parser")

    # find_all always returns a list, so an empty match simply joins to "".
    agenda = '\n'.join(tag.get_text() for tag in soup.find_all("p", class_ = _AGENDA_CLASS_RE))

    return {
        'url': url,
        'title': _first_paragraph_text(soup, "Titel"),
        'subtitle': _first_paragraph_text(soup, "UnderTitel"),
        'agenda': agenda,
        'text': soup.get_text(),
    }

In [96]:
## Scrape every collected link, with a text progress bar

resumes = []
n_links = len(resume_links)

for c, resume_link in enumerate(resume_links, start = 1):
    resumes.append(resume_scraper(resume_link))

    # 50-character bar plus percentage; "\r" makes each print overwrite the previous one.
    fraction = c / n_links
    bar = ("=" * int(fraction * 50)).ljust(50)
    progress = "|{0}| {1:.2f} %".format(bar, fraction * 100)
    print(progress, end = "\r")

    # Random pause between requests.
    time.sleep(random.uniform(0.5, 1))



In [97]:
# Number of scraped resumes; compare with len(resume_links) above.
len(resumes)

302

In [103]:
# Output location: ../data/testdata_20220210.json relative to the working directory.
out_f = os.path.join("..", "data")
out_n = "testdata_20220210.json"
out_p = os.path.join(out_f, out_n)

# Create the target folder if needed; open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(out_f, exist_ok = True)

# Explicit encoding so the file does not depend on the platform default.
with open(out_p, "w", encoding = "utf-8") as f:
    json.dump(resumes, f)