# Course on webscraping

*By Olav ten Bosch, Dick Windmeijer and Marijn Detiger*

In [None]:
# Imports:
import requests                  # for issuing HTTP requests
from bs4 import BeautifulSoup    # for parsing and navigating HTML results
import time                      # for sleeping between multiple requests

#### Documentation:
- [Requests.py](http://docs.python-requests.org)
- [Beautifulsoup.py](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

### Request, headers, user-agent, parameters, sleeping:

In [None]:
# Retrieve the home page of Statistics Netherlands.
# Always pass a timeout: without one, requests can wait indefinitely on a server that never answers.
r1 = requests.get('https://www.cbs.nl/en-gb', timeout=10)

# Print the status code, the content-type response header and the encoding attribute of the response:
#r1.headers['content-type']
print(r1.status_code, r1.headers['content-type'], r1.encoding)
#print(r1.headers)
#print(r1.text)

In [None]:
# Retrieve the home page of Statistics Netherlands, identifying ourselves with a user-agent string:
headers = {'user-agent': 'scrapingCourseBot'}
# The timeout prevents the call from blocking indefinitely if the server does not respond.
r2 = requests.get('https://www.cbs.nl/en-gb', headers=headers, timeout=10)

# Headers of the request (this is where our user-agent shows up):
print(r2.request.headers)

# Headers of the response:
print(r2.headers)

In [None]:
# Issue a request with parameters; requests encodes the dict into the query string of the URL:
pars = {'products': 2, 'years': 2}
# The timeout prevents the call from blocking indefinitely if the server does not respond.
r3 = requests.get('http://testing-ground.scraping.pro/table?', params=pars, headers=headers, timeout=10)
#print(r3.url)
#print(r3.text)

In [None]:
# In a loop, always add some idle time (time.sleep) so the server is not overloaded:
for products in range(1, 6):
    for years in range(1, 6):
        pars = {'products': products, 'years': years}
        # The timeout keeps a single unresponsive request from stalling the whole loop.
        r4 = requests.get('http://testing-ground.scraping.pro/table?', params=pars, headers=headers, timeout=10)
        print(r4.url, r4.status_code)
        time.sleep(1)

### Soup with native syntax (find, find_all):

In [None]:
# Using soup find and find_all to access parts of a page.
# The timeout prevents the request from blocking indefinitely if the server does not respond.
r4 = requests.get('https://www.cbs.nl/en-gb', timeout=10)
soup = BeautifulSoup(r4.text, 'lxml')              # use lxml, it is faster and more relaxed in parsing

# find returns the first matching element:
print(soup.find("h2"))
#print(soup.find("h2").text)
#print(soup.find("h3").text)

In [None]:
# Look up the first <div> whose class is "thumbnail" (the attrs dict is the long form of class_=...):
part = soup.find("div", attrs={"class": "thumbnail"})
print(part)

In [None]:
# Look up the first <section> carrying the id "aside-main":
aside = soup.find("section", attrs={"id": "aside-main"})
#print(aside)

In [None]:
# A find result is itself searchable, so find calls can be nested.
# First via the element stored earlier in `part`:
part_heading = part.find("h3")
print(part_heading.get_text())

# Then the same lookup written out step by step from the soup:
first_thumbnail = soup.find("div", class_="thumbnail")
print(first_thumbnail.find("h3").get_text())

In [None]:
# Getting the URL out of an <a> tag: attributes are read with dictionary-style indexing.
first_anchor = part.find("a")
print(first_anchor['href'])

In [None]:
# find_all returns a list of every matching element:
h2_elements = soup.find_all("h2")
h3_elements = soup.find_all("h3")
# Print both lists with an empty line in between:
print(h2_elements, "", h3_elements, sep="\n")

In [None]:
# The list returned by find_all can be iterated.
# Here: print the URL of every CBS news article by combining find_all and find.
articles = soup.find_all("div", attrs={"class": "thumbnail"})
for article in articles:
    anchor = article.find("a")   # first <a> tag inside this thumbnail
    link = anchor['href']        # its href attribute holds the URL
    print(link)

In [None]:
# Follow the links and get all texts of the news articles:
from urllib.parse import urljoin   # resolves an href against the site root

# Collect the href of the first <a> tag in every thumbnail:
articles = soup.find_all("div", class_='thumbnail')
links3 = [article.find("a")['href'] for article in articles]

for link in links3:
    # urljoin leaves an already-absolute href untouched and inserts the missing
    # slash for relative ones; plain string concatenation would mangle both.
    r = requests.get(urljoin('https://www.cbs.nl', link), timeout=10)
    #print(r.url)
    soup2 = BeautifulSoup(r.text, 'lxml')
    leadtext = soup2.find('section', class_='leadtext')
    if leadtext is not None:   # find returns None when the page has no such section
        print(leadtext.text)
    # in robots.txt CBS advises a delay of 1 second; sleep after EVERY request,
    # also when the page had no lead text to print
    time.sleep(1)

### Soup with CSS selectors (select):

In [None]:
# Get the page and parse it into a soup.
# The timeout prevents the request from blocking indefinitely if the server does not respond.
r5 = requests.get('https://www.cbs.nl/en-gb', timeout=10)
soup = BeautifulSoup(r5.text, 'lxml')

In [None]:
# Selecting every element with a given tag name.

# The find_all way:
#print(soup.find_all("h2"))
#print(soup.find_all("h3"))

# The CSS way: a bare tag name is already a valid selector.
for tag_name in ("h2", "h3"):
    print(soup.select(tag_name))

In [None]:
# Selecting every element that belongs to a class.

# The find_all way:
#print(soup.find_all("div", class_="thumbnail"))

# The CSS way: tag.classname
thumbnail_divs = soup.select("div.thumbnail")
print(thumbnail_divs)

In [None]:
# Selecting every element with a given id.

# The find_all way:
#print(soup.find_all("section", id="aside-main"))

# The CSS way: tag#id
aside_sections = soup.select("section#aside-main")
print(aside_sections)

In [None]:
# CSS selectors shine because one statement can walk down the tree.
# "div.thumbnail h3" matches every <h3> nested inside a <div class="thumbnail">,
# which gives us all headlines at once:
headlines = soup.select("div.thumbnail h3")
for headline in headlines:
    print(headline.get_text())

In [None]:
# All links inside thumbnails via CSS: every <a> nested in a <div class="thumbnail">.
thumbnails = soup.select("div.thumbnail a")
for thumbnail in thumbnails:
    href = thumbnail['href']   # URL stored in the href attribute
    print(href)

In [None]:
# All <h3> elements nested inside the section with id "aside-main":
aside_H3s = soup.select("section#aside-main h3")
for heading in aside_H3s:
    print(heading.get_text())