# Course on webscraping

*By Olav ten Bosch, Darius Keijdener, Dick Windmeijer*

In [2]:
# Imports:
import requests                  # for issuing HTTP requests
from bs4 import BeautifulSoup    # for parsing and navigating HTML results
import time                      # for sleeping between multiple requests
# Custom user-agent header, passed along with the HTTP requests made in this notebook
headers = {'user-agent': 'scrapingCourseBot'}

#### Documentation:
- [Requests](https://requests.readthedocs.io)
- [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
- [CSS selectors](https://www.w3schools.com/cssref/css_selectors.asp)

### Soup with CSS selectors (`select`):

In [3]:
# Get page in soup:
# Request the CBS English homepage, sending the custom user-agent header.
# A timeout is given explicitly: without one, requests waits indefinitely
# when the server accepts the connection but never answers.
r5 = requests.get('https://www.cbs.nl/en-gb', headers=headers, timeout=10)
# Show the final URL and the HTTP status code of the response.
print(r5.url, r5.status_code)
# Parse the response body with the lxml parser so it can be queried as `soup`.
soup = BeautifulSoup(r5.text, 'lxml')

https://www.cbs.nl/en-gb 200


In [4]:
# Collect every element that has a given tag name.

# The find_all way would be:
#print(soup.find_all("h2"))
#print("")
#print(soup.find_all("h3"))

# The CSS way: a bare tag name is itself a selector.
h2_elements = soup.select("h2")
h3_elements = soup.select("h3")
print(h2_elements)
print("")
print(h3_elements)

[<h2 class="teaser-header mt-3">Household consumption over 5 percent up in June</h2>, <h2 class="sr-only">Recent news</h2>, <h2>Expected</h2>, <h2>Featured</h2>, <h2>CBS video</h2>, <h2 class="title">Services</h2>, <h2 class="title">About us</h2>, <h2 class="title">About this site</h2>, <h2 class="sr-only">Follow Statistics Netherlands</h2>]

[<h3>Economic growth 2.6 percent in Q2 2022</h3>, <h3>Exports up by almost 4 percent in June</h3>, <h3>Business confidence down as of Q3</h3>, <h3>Fewer bankruptcies in July </h3>, <h3>Manufacturing output over 6 percent up in June</h3>, <h3>Aviation not yet at 2019 level in Q2</h3>, <h3>Excess mortality in all weeks of July</h3>, <h3>Inflation rate up to 10.3 percent in July</h3>, <h3>Population up by nearly 120 thousand in first half of 2022</h3>, <h3>More asylum applications in Q2 2022</h3>, <h3>Economic outlook virtually unchanged</h3>, <h3>HICP 11.6 percent higher in July</h3>, <h3>Labour market</h3>, <h3>Consumer confidence</h3>, <h3>House p

In [5]:
# Collect every element that belongs to a given class.

# The find_all way would be:
#print(soup.find_all("a", class_="thumbnail"))

# The CSS way: "tag.classname" matches <a> elements carrying the class "thumbnail".
thumbnail_anchors = soup.select("a.thumbnail")
print(thumbnail_anchors)

[<a class="thumbnail row" href="https://www.cbs.nl/en-gb/news/2022/33/economic-growth-2-6-percent-in-q2-2022">
<div class="col-4 col-lg-12 pr-0">
<div class="cbs-image-container">
<svg version="1.1" viewbox="0 0 600 400" width="100%" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"><defs><!-- --></defs><g><rect fill="#d9328a" height="400" stroke="none" width="600" x="0" y="0"></rect></g><g></g><g><g><text fill="#FFF" font-family="Akko, Akko W01 Regular" font-size="70" font-weight="500" text-anchor="end" x="561" y="257"><tspan>+</tspan><tspan class="chart-banner-number-wylZboUYXT" data-number="2">2</tspan><tspan>.</tspan><tspan class="chart-banner-number-wylZboUYXT" data-number="6">6</tspan><tspan>%</tspan></text></g></g><g><g opacity="1"><svg preserveaspectratio="xMinYMin" viewbox="0 0 1000 1000" width="280" x="70" xmlns="http://www.w3.org/2000/svg" y="70"><defs><style></style></defs><path d="M47.05,927.27c-14.12,0-20.1,0-20.1-13.41V877.39c0-13.4,6-14.88,17

In [6]:
# Collect every element that has a given id.

# The find_all way would be:
#print(soup.find_all("section", id="aside-main"))

# The CSS way: "tag#id" matches the <section> whose id is "aside-main".
aside_sections = soup.select("section#aside-main")
print(aside_sections)

[<section class="col-12 col-lg-3" id="aside-main">
<aside class="sidebar-pink col-12" role="complementary">
<div class="aside-content">
<h2>Expected</h2>
<a href="https://www.cbs.nl/en-gb/publication-calendar" title="Publication calendar">
<h3>Labour market</h3>
<p class="date"><time datetime="2022-08-18T13:00:00+00:00">18/08/2022 15:00</time></p> <h3>Consumer confidence</h3>
<p class="date"><time datetime="2022-08-22T13:00:00+00:00">22/08/2022 15:00</time></p> <h3>House prices</h3>
<p class="date"><time datetime="2022-08-22T13:00:00+00:00">22/08/2022 15:00</time></p> <h3>Investments</h3>
<p class="date"><time datetime="2022-08-22T13:00:00+00:00">22/08/2022 15:00</time></p> </a>
</div>
</aside>
<aside class="sidebar-red col-12" role="complementary">
<div class="aside-content">
<h2>Featured</h2>
<a href="https://www.cbs.nl/en-gb/publication/2022/20/monitor-of-well-being-the-sustainable-development-goals-2022">
<p class="featured-title">Monitor of Well-being &amp; the Sustainable Develop

In [7]:
# CSS selectors can descend the tree in a single statement: a space between
# two selectors matches every <h3> nested inside an <a class="thumbnail">.
headlines = soup.select("a.thumbnail h3")
# Print the text content of each headline, one per line.
for headline in headlines:
    print(headline.get_text())

Economic growth 2.6 percent in Q2 2022
Exports up by almost 4 percent in June
Business confidence down as of Q3
Fewer bankruptcies in July 
Manufacturing output over 6 percent up in June
Aviation not yet at 2019 level in Q2
Excess mortality in all weeks of July
Inflation rate up to 10.3 percent in July
Population up by nearly 120 thousand in first half of 2022
More asylum applications in Q2 2022
Economic outlook virtually unchanged
HICP 11.6 percent higher in July


In [8]:
# Select every <a class="thumbnail"> element and print the URL it links to.
thumbnails = soup.select("a.thumbnail")
for thumbnail in thumbnails:
    # Subscripting a tag reads one of its attributes; here the link target.
    print(thumbnail["href"])

https://www.cbs.nl/en-gb/news/2022/33/economic-growth-2-6-percent-in-q2-2022
https://www.cbs.nl/en-gb/news/2022/33/exports-up-by-almost-4-percent-in-june
https://www.cbs.nl/en-gb/news/2022/33/business-confidence-down-as-of-q3
https://www.cbs.nl/en-gb/news/2022/32/fewer-bankruptcies-in-july
https://www.cbs.nl/en-gb/news/2022/32/manufacturing-output-over-6-percent-up-in-june
https://www.cbs.nl/en-gb/news/2022/31/aviation-not-yet-at-2019-level-in-q2
https://www.cbs.nl/en-gb/news/2022/31/excess-mortality-in-all-weeks-of-july
https://www.cbs.nl/en-gb/news/2022/31/inflation-rate-up-to-10-3-percent-in-july
https://www.cbs.nl/en-gb/news/2022/30/population-up-by-nearly-120-thousand-in-first-half-of-2022
https://www.cbs.nl/en-gb/news/2022/30/more-asylum-applications-in-q2-2022
https://www.cbs.nl/en-gb/news/2022/30/economic-outlook-virtually-unchanged
https://www.cbs.nl/en-gb/news/2022/30/hicp-11-6-percent-higher-in-july


In [9]:
# Select every <h3> nested inside the <section id="aside-main"> element
# and print the text of each one.
aside_H3s = soup.select("section#aside-main h3")
for aside_heading in aside_H3s:
    print(aside_heading.text)

Labour market
Consumer confidence
House prices
Investments
Population forecast.
