# Examples, course on webscraping

-- *By Olav ten Bosch and Dick Windmeijer*

In [1]:
# Retrieving the English home page of Statistics Netherlands:
import requests
from bs4 import BeautifulSoup
import time

# Always pass a timeout: without one, requests may wait forever on an unresponsive server.
r1 = requests.get('https://www.cbs.nl/en-gb', timeout=10)

# Show the HTTP status code, the declared content type and the encoding used to decode r1.text.
# headers.get() returns None instead of raising KeyError when the header is missing.
print(r1.status_code, r1.headers.get('content-type'), r1.encoding)

200 text/html; charset=utf-8 utf-8


In [2]:
# Print the body of the response r1 (the retrieved document) as decoded text:
print(r1.text)


<!DOCTYPE html>
<html xml:lang="en-GB" lang="en-GB">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <meta http-equiv=X-UA-Compatible content="IE=edge" />
    <title>CBS - Statistics Netherlands</title>
    <meta name="DCTERMS:identifier" title="XSD.anyURI" content="https://www.cbs.nl/en-gb" />
    <meta name="DCTERMS.title" content="CBS - Statistics Netherlands" />
    <meta name="DCTERMS.type" title="OVERHEID.Informatietype" content="webpagina" />
    <meta name="DCTERMS.language" title="XSD.language" content="en-GB" />
    <meta name="DCTERMS.authority" title="OVERHEID.Organisatie" content="Statistics Netherlands" />
    <meta name="DCTERMS.creator" title="OVERHEID.Organisatie" content="Statistics Netherlands" />
    <meta name="DCTERMS.modified" title="XSD.dateTime" content="16-08-2018T10:41:02" />
    <meta name="DCTERMS.temporal" content="" />
    <meta name="DCTERMS.spatial" title="OVERHEID:Konin

In [11]:
# Issue a request with parameters:
# requests URL-encodes the dict and appends it to the URL as the query string.
pars = {'products': 2, 'years': 2}
# Always pass a timeout: without one, requests may wait forever on an unresponsive server.
r2 = requests.get('http://testing-ground.scraping.pro/table?', params=pars, timeout=10)

# Show the final URL that was requested, including the encoded parameters.
print(r2.url)

http://testing-ground.scraping.pro/table?products=2&years=2


In [18]:
# Print the body of the response r2 (the retrieved document) as decoded text:
print(r2.text)

<!DOCTYPE html>
<!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
    <head>
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
        <title>Web Scraper Testing Ground</title>
        <meta name="description" content="">
        <meta name="viewport" content="width=device-width">
        <link rel="stylesheet" href="/css/normalize.css">
        <link rel="stylesheet" href="/css/main.css">
        <script src="/js/vendor/modernizr-2.6.1.min.js"></script>
        <script src="/js/vendor/jquery-1.9.1.min.js"></script>
        <script src="/js/vendor/jquery-ui-1.10.2.min.js"></script>
        <script src="/js/plugins.js"></script>
        <script src="/js/main.js"></script>

        <link rel="stylesheet" href="/css/Q

In [3]:
# Retrieving English home page of Statistics Netherlands:
# (a timeout is passed so this cell cannot hang forever on an unresponsive server)
r3 = requests.get('https://www.cbs.nl/en-gb', timeout=10)
print(r3.text)


<!DOCTYPE html>
<html xml:lang="en-GB" lang="en-GB">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <meta http-equiv=X-UA-Compatible content="IE=edge" />
    <title>CBS - Statistics Netherlands</title>
    <meta name="DCTERMS:identifier" title="XSD.anyURI" content="https://www.cbs.nl/en-gb" />
    <meta name="DCTERMS.title" content="CBS - Statistics Netherlands" />
    <meta name="DCTERMS.type" title="OVERHEID.Informatietype" content="webpagina" />
    <meta name="DCTERMS.language" title="XSD.language" content="en-GB" />
    <meta name="DCTERMS.authority" title="OVERHEID.Organisatie" content="Statistics Netherlands" />
    <meta name="DCTERMS.creator" title="OVERHEID.Organisatie" content="Statistics Netherlands" />
    <meta name="DCTERMS.modified" title="XSD.dateTime" content="16-08-2018T10:41:02" />
    <meta name="DCTERMS.temporal" content="" />
    <meta name="DCTERMS.spatial" title="OVERHEID:Konin

In [4]:

# Parse the retrieved HTML with Python's built-in parser and show the text of its <title> element:
soup3 = BeautifulSoup(r3.text, 'html.parser')
page_title = soup3.title
print(page_title.text)

CBS - Statistics Netherlands


In [5]:
# Get the URLs to all news articles of CBS:
# each <div class="thumbnail"> on the page is treated as one news item.
articles = soup3.find_all("div", class_='thumbnail')
for article in articles:
    anchor = article.find("a")  # find() returns None when the thumbnail contains no <a>
    # Tag.get() returns None (instead of raising KeyError) when the anchor has no href.
    link = anchor.get('href') if anchor is not None else None
    if link:
        print(link)

/en-gb/news/2018/39/producer-confidence-hardly-changes
/en-gb/news/2018/39/1-5-of-manufacturing-exports-are-carry-along-trade
/en-gb/news/2018/37/1-in-5-obese-adults-satisfied-with-body-weight
/en-gb/news/2018/38/budget-surplus-up-to-1-9-percent-of-gdp-in-mid-2018
/en-gb/news/2018/38/house-prices-over-9-percent-higher-in-august
/en-gb/news/2018/38/gdp-growth-rate-0-8-percent-in-q2-2018
/en-gb/news/2018/38/mobile-phones-less-often-secure-than-computers
/en-gb/news/2018/38/economic-situation-unchanged
/en-gb/news/2018/38/investments-over-6-percent-up-in-july
/en-gb/news/2018/38/employment-rate-virtually-the-same-for-six-months
/en-gb/news/2018/38/household-consumption-nearly-3-percent-up-in-july


In [19]:
# Collect the <h3> elements inside every <div class="thumbnail"> using a CSS selector:
headline_selector = "div.thumbnail h3"
headlines = soup3.select(headline_selector)

# Printing the result shows the matched tags including their markup.
print(headlines)

[<h3>Producer confidence hardly changes</h3>, <h3>1/5 of manufacturing exports are carry-along trade</h3>, <h3>1 in 5 obese adults satisfied with body weight</h3>, <h3>Budget surplus up to 1.9 percent of GDP in mid-2018</h3>, <h3>House prices over 9 percent higher in August</h3>, <h3>GDP growth rate 0.8 percent in Q2 2018</h3>, <h3>Mobile phones less often secure than computers</h3>, <h3>Economic situation unchanged</h3>, <h3>Investments over 6 percent up in July</h3>, <h3>Employment rate virtually the same for six months</h3>, <h3>Household consumption nearly 3 percent up in July</h3>]


In [6]:
# Get all texts of news articles of CBS:
# step 1: collect the (site-relative) link of every news thumbnail on the home page.
articles = soup3.find_all("div", class_='thumbnail')
links3 = []
for article in articles:
    anchor = article.find("a")  # find() returns None when the thumbnail contains no <a>
    href = anchor.get('href') if anchor is not None else None
    if href:
        links3.append(href)

# step 2: download each article and print its lead text.
for link in links3:
    # Always pass a timeout: without one, requests may wait forever on an unresponsive server.
    r = requests.get('https://www.cbs.nl'+link, timeout=10)
    soup = BeautifulSoup(r.text, 'html.parser')
    lead = soup.find('section', class_='leadtext')
    if lead is None:
        # Page has no <section class="leadtext"> (e.g. error page or different layout):
        # report it and carry on instead of crashing with AttributeError.
        print('No lead text found for', link)
    else:
        print(lead.text)
    time.sleep(1) # in robots.txt CBS advises a delay of 1 second


                According to Statistics Netherlands (CBS), producer confidence among Dutch manufacturers has hardly changed. Confidence stood at 5.7 in September, down from 5.9 in August. In September, manufacturers were less positive about their future output than in the previous month.
            

                One-fifth of all exports by Dutch manufacturers consist of products that are not self-manufactured, but supplied along with their own products, a phenomenon referred to as carry-along trade. This is reported by Statistics Netherlands (CBS) in the latest edition of the Internationalisation Monitor, based on new research into this phenomenon.
            

                Almost half of the total Dutch adult population are moderately or severely overweight. In the period 2015-2017, two in five severely overweight (obese) people indicated that they were dissatisfied with their body weight, while one in five said that they were satisfied. This is reported by Statistics Nether