# Course on webscraping

*By Olav ten Bosch, Darius Keijdener, Dick Windmeijer*

In [None]:
# Imports:
import requests                     # for issuing HTTP requests
from bs4 import BeautifulSoup       # for parsing and navigating HTML results
import time                         # for sleeping between multiple requests

# Not related to webscraping
from IPython.display import Image   # for viewing images in a python notebook

#### Documentation:
- [Requests.py](http://docs.python-requests.org)

### Request, headers, user-agent, parameters, sleeping:

In [None]:
# Retrieve the home page of Statistics Netherlands.
# The timeout makes the call fail after 10 seconds instead of waiting forever on a silent server.
r1 = requests.get('https://www.cbs.nl/en-gb', timeout=10)

# Print the HTTP status code, the content type header and the encoding used for r1.text.
# The commented-out lines can be enabled to explore the response further.
#r1.headers['content-type']
print(r1.status_code, r1.headers['content-type'], r1.encoding)
#print(r1.headers)
#print(r1.text)

In [None]:
# Retrieve the home page of Statistics Netherlands with a user-agent string,
# so the site owner can see which client is issuing the requests.
headers = {'user-agent': 'scrapingCourseBot'}
# The timeout makes the call fail after 10 seconds instead of waiting forever on a silent server.
r2 = requests.get('https://www.cbs.nl/en-gb', headers=headers, timeout=10)

# Headers of the request (what we sent, including our user-agent):
print(r2.request.headers)

# Headers of the response (what the server sent back):
print(r2.headers)

In [None]:
# Issue a request with parameters.
# Requests builds the query string from `params` itself, so the URL needs no trailing "?".
pars = {'products': 2, 'years': 2}
# The timeout makes the call fail after 10 seconds instead of waiting forever on a silent server.
r3 = requests.get('http://testing-ground.webscraping.pro/table-middle.html', params=pars, headers=headers, timeout=10)
# Show the final URL, including the encoded parameters:
print(r3.url)
#print(r3.text)

In [None]:
# In a loop, always add some idle time (time.sleep) so the server is not overloaded.
# Every combination of products (1-3) and years (1-2) is requested once.
for products in range(1, 4):
    for years in range(1, 3):
        pars = {'products': products, 'years': years}
        # Requests builds the query string from `params`, so the URL needs no trailing "?".
        # The timeout keeps one unresponsive request from stalling the whole loop.
        r4 = requests.get('http://testing-ground.webscraping.pro/table-middle.html', params=pars, headers=headers, timeout=10)
        print(r4.url, r4.status_code)
        # Pause one second between consecutive requests.
        time.sleep(1)

In [None]:
# We can also query other types of files, for example a PNG image.
# The timeout makes the call fail after 10 seconds instead of waiting forever on a silent server.
r5 = requests.get("http://testing-ground.webscraping.pro/img/logo.png", headers=headers, timeout=10)
# Display the response body decoded as text:
r5.text

In [None]:
# This is not just text: .content gives the response body as raw bytes instead of decoded text.
r5.content

In [None]:
# The .content attribute returns a binary string (bytes), which can be interpreted by other functions.
# Here the bytes are passed to IPython's Image to view the picture in the notebook.
Image(r5.content)

In [None]:
# Request an Excel file (.xls) from the Statistics Netherlands website.
# The timeout makes the call fail after 10 seconds instead of waiting forever on a silent server.
r6 = requests.get("https://www.cbs.nl/-/media/_excel/2021/34/tozo_reg_2021_juni.xls", headers=headers, timeout=10)
# Display the response body decoded as text:
r6.text

In [None]:
# Note that we write the binary content. If you are merely writing text, save response.text and omit the "b" option.
# Always be mindful when you download and save files, especially in binary!
if r6.status_code == 200:
    with open("bestand_tozo.xls", "wb") as file:
        file.write(r6.content)
else:
    # Report the failure instead of skipping silently, so a missing or stale file is easy to explain later.
    print("Download failed, nothing was saved. Status code:", r6.status_code)

In [None]:
# This works only if you are running this locally; pandas is not installed on the Binder machines.
# Reads the file "bestand_tozo.xls" from the working directory and displays the result.
# NOTE(review): reading the legacy .xls format presumably needs an extra engine such as xlrd - confirm.
import pandas as pd
pd.read_excel("bestand_tozo.xls")