# Course on web scraping

*By Olav ten Bosch, Darius Keijdener*

In [None]:
# Imports:
import requests                     # for issuing HTTP requests
from bs4 import BeautifulSoup       # for parsing and navigating HTML results
import time                         # for sleeping between multiple requests

#### Documentation:
- [Requests.py](https://requests.readthedocs.io)
- [Beautifulsoup.py](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

### Request, headers, user-agent, parameters, sleeping:

In [None]:
# Retrieve the home page of Statistics Netherlands (CBS) with a plain GET request:
r1 = requests.get('https://www.cbs.nl/en-gb')

# Print the HTTP status code, the content-type response header and the
# encoding attribute of the response:
#r1.headers['content-type']
print(r1.status_code, r1.headers['content-type'], r1.encoding)
# Uncomment to inspect all response headers or the full response body:
#print(r1.headers)
#print(r1.text)

In [None]:
# Retrieve the same home page, this time passing a custom user-agent string.
# The `headers` dict defined here is reused by the requests in later cells.
headers = {'user-agent': 'scrapingCourseBot'}
r2 = requests.get('https://www.cbs.nl/en-gb', headers=headers)

# Headers of the request that was sent:
print(r2.request.headers)

# Headers of the response (uncomment to inspect):
#print(r2.headers)

In [None]:
# Send a GET request with a query parameter passed through `params`:
pars = dict(q='A2')
#pars = dict(q='A20')
#pars = dict(q='A13')
r3 = requests.get(
    'https://www.anwb.nl/verkeer/filelijst?',
    headers=headers,
    params=pars,
)
# Show the URL that was actually requested, including the parameter:
print(r3.url)
#print(r3.text)

In [None]:
### In a loop, always add some idle time (time.sleep) to not overload the server:
for road in ["A13", "A10", "A12", "A16", "A2", "A4"]:
    pars = {'q': road}
    # Without a timeout, requests waits indefinitely on an unresponsive server,
    # which would stall the whole loop; give up after 10 seconds instead.
    r4 = requests.get('https://www.anwb.nl/verkeer/filelijst?', params=pars, headers=headers, timeout=10)
    print(r4.url, r4.status_code)

    # WAIT IN BETWEEN REQUESTS!:
    time.sleep(1)

In [None]:
# Requests is not limited to HTML pages; other types of files can be fetched too.
# For example one picture from a website:
r5 = requests.get("https://cdn.cbs.nl/images/445343647647317a4f6c5166546d424a7778375475413d3d/900x450.jpg", headers=headers)
# For textual data use .text (the body as a Unicode string).
# NOTE(review): the URL points to a .jpg, so .text appears here only for contrast;
# presumably its value is not shown because a notebook cell displays only its last expression.
r5.text
# For binary data such as this picture use .content (the raw bytes):
r5.content

In [None]:
# The .content attribute returns the raw bytes of the response, which can be interpreted by other functions.
from IPython.display import Image   # for viewing images in a python notebook
# Build an image object from the downloaded bytes so the notebook can display it:
Image(r5.content)

In [None]:
# Or an Excel file, linked
# from page: https://www.cbs.nl/nl-nl/maatwerk/2022/26/tabellen-tozo-definitief-juli-t-m-september-2021
r6 = requests.get("https://www.cbs.nl/-/media/_excel/2022/26/bus-tozo-2021-q3-definitief.xlsx", headers=headers)
# Raw bytes of the downloaded workbook:
r6.content

In [None]:
# Store the downloaded bytes in a file on disk.
# The "b" in the mode opens the file in binary mode; leave it out for textual data.
# Always be mindful when you download and save files, especially binary ones!
# NOTE(review): the download URL ends in .xlsx while the file is stored as .xls —
# confirm the intended extension.
if r6.status_code == 200:
    # Only write the file when the download succeeded (HTTP 200).
    with open("bestand_tozo.xls", mode="wb") as xls_file:
        xls_file.write(r6.content)

In [None]:
# This works only when you run the notebook locally: pandas is not installed on the Binder machines.
import pandas as pd
# Read the file that the previous cell wrote to disk:
pd.read_excel("bestand_tozo.xls")

### Beautiful Soup using the commands *find, find_all*:

In [None]:
# Fetch the CBS home page with requests; its HTML is parsed with BeautifulSoup in the next cell:
r4 = requests.get('https://www.cbs.nl/en-gb', headers=headers)
print(r4.url, r4.status_code)
# Uncomment to inspect the request headers or the full HTML:
#print(r4.request.headers)
#print(r4.text)

In [None]:
# Parse the HTML text of the page into a BeautifulSoup object:
soup = BeautifulSoup(r4.text, 'lxml')              # use lxml, it is faster and more lenient in parsing
# find returns the first matching element:
print(soup.find("h2"))
# Uncomment to print only the text of the first h2 / h3 element:
#print(soup.find("h2").text)
#print(soup.find("h3").text)

In [None]:
# Find the first <a> element that belongs to the class "thumbnail":
part = soup.find("a", class_="thumbnail")
print(part)

In [None]:
# Find the first <section> element whose id is "aside-main":
aside = soup.find("section", id="aside-main")
print(aside)

In [None]:
# You can call find on the result of an earlier find; it then searches inside that element only.
# Both lines print the text of the first <h3> inside the first "thumbnail" link:
print(part.find("h3").text)
print(soup.find("a", class_="thumbnail").find("h3").text)

In [None]:
# How to get the URL from an <a> tag: read its href attribute with dictionary-style indexing:
print(part['href'])

In [None]:
# find_all returns a list with every matching element.
# Print all h2 elements, an empty line, and then all h3 elements:
print(soup.find_all("h2"), end="\n\n")
print(soup.find_all("h3"))

In [None]:
# The list returned by find_all can be iterated like any other list.
# Print the URLs of all CBS news articles, one per line:
articles = soup.find_all("a", class_='thumbnail')
for article in articles:
    # the URL is stored in the href attribute of the a tag
    print(article['href'])

In [None]:
# Follow the links and print the lead text of every news article:
from urllib.parse import urljoin    # standard library; completes relative URLs

articles = soup.find_all("a", class_='thumbnail')
# urljoin leaves absolute URLs untouched and completes relative ones against
# the site URL, so the requests below also work when an href is relative.
links3 = [urljoin('https://www.cbs.nl/en-gb', article['href']) for article in articles]

for link in links3:
    # Use a timeout so a single unresponsive page cannot stall the whole loop.
    r = requests.get(link, headers=headers, timeout=10)
    #print(r.url)
    soup2 = BeautifulSoup(r.text, 'lxml')
    leadtext = soup2.find('section', class_='leadtext')
    # Not every page has a lead text section; only print it when present.
    if leadtext is not None:
        print(leadtext.text)
    # Sleep after EVERY request, also when the page had no lead text, so that
    # consecutive requests are always spaced out.
    time.sleep(1) # in robots.txt CBS advises a delay of 1 second