# Programming for Data Science Summary
## Chapter 11 - Web Scraping

In [49]:
import requests as re
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

### Requests
Motivation: Get raw text data from websites, or post data to websites

In [10]:
url = r'https://www.google.com'

r = re.get(url) # Make a GET request from an url
r = re.post(url, data={'test':'test'}) # Make a POST requests containing data

r.status_code # Obtain status code of any type of requests
r.text # Obtain text data from the request
r.json # Get response in JSON format, works when APIs return JSON data structures

<bound method Response.json of <Response [405]>>

### BeautifulSoup
Motivation: Parse and explore raw HTML text data, usually the crucial step in web scraping

In [40]:
url = r'https://en.wikipedia.org/wiki/Web_scraping'
r = re.get(url)
raw_html = r.text

soup = BS(raw_html, 'html.parser') # Instead of html.parser you can put xml.parser, if raw text is XML structured

e = soup.find('h1', {'id': 'firstHeading'}) # Find first element of h1 such that its id is firstHeading (or any other specified parameter)
e_down = e.find('span') # Find first element of type span "inside" the previously found element (navigate downwards)
e_up = e_down.find_parent('h1') # Find parent of element with such type

e = soup.find('a')
e_next = e.find_next('a') # Find next element <a> (navigate sideways)
e_prev = e.find_previous('a') # Find previous element <a>

elements = soup.find_all('p') # Return a list of elements type p

e.text # Retrieve text content from an element
e.get('href') # Get an attribute
e['href'] # Equivalent but will raise an error if it does not exist

display()

### Selenium
Motivation: Interact with dynamic websites

In [None]:
url = r'https://www.novaims.unl.pt/'

driver = webdriver.Firefox() # Opens a Firefox window
    # Other browsers can be used, as long as they support selenium

driver.get(url) # Makes a GET request
driver.page_source # Gets current HTML source

time.sleep(1)

xpath = r'/html/body/div[2]/div/div/div/div/div[3]/div[1]/button[3]'
e = driver.find_element(By.XPATH, xpath)
    # Other options for by: By.NAME, By.TAGNAME, ...
e.text # Get text from element
e.click() # Click button
    # Other options: e.send_keys(s), e.get_attribute(attribute), ...

time.sleep(1)

driver.get('https://www.google.com')
driver.back() # Go backward in browser history
driver.forward() # Go forward

# You can also switch between windows and frames, but we won't cover that...

driver.quit() # Closes the browser window
