## Section 5: Parsing And Extraction

### BeautifulSoup

In [83]:
# !pip install beautifulsoup4

In [84]:
import requests
from bs4 import BeautifulSoup

In [85]:
url = "https://books.toscrape.com"
# requests has no default timeout, so an unreachable host would hang this cell
# forever; raise_for_status() surfaces HTTP errors here rather than letting
# later cells parse an error page.
resp = requests.get(url, timeout=10)
resp.raise_for_status()
resp

In [86]:
soup = BeautifulSoup(resp.content, "html.parser")

### Tags

In [87]:
soup.title

In [88]:
print(soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en-us">
 <!--<![endif]-->
 <head>
  <title>
   All products | Books to Scrape - Sandbox
  </title>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="24th Jun 2016 09:29" name="created"/>
  <meta content="" name="description"/>
  <meta content="width=device-width" name="viewport"/>
  <meta content="NOARCHIVE,NOCACHE" name="robots"/>
  <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
  <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->
  <link href="static/oscar/favicon.ico" rel="shortcut icon"/>
  <link href="static/oscar/css/styles.css" rel="stylesheet" type="tex

In [89]:
soup.h1

In [90]:
first_div = soup.div
first_div

In [91]:
type(first_div)

In [92]:
first_div.attrs

In [93]:
first_div.div.div.attrs

### Parents, Children, And Descendants

In [94]:
ul = soup.ul
ul

In [95]:
list(ul.children)

In [96]:
from bs4.element import NavigableString


def no_nav_strings(iterable):
    """Return a list of the items in ``iterable`` that are not navigable strings.

    Intended for iterators such as ``tag.children`` / ``tag.descendants``,
    which interleave tags with the text nodes between them.

    ``isinstance`` is used instead of an exact ``type(...)`` comparison so that
    subclasses of ``NavigableString`` are filtered out as well.
    """
    return [node for node in iterable if not isinstance(node, NavigableString)]

In [97]:
no_nav_strings(ul.children)

In [98]:
ul_descendants = no_nav_strings(ul.descendants)
ul_descendants

In [99]:
ul_descendants[1]

In [100]:
ul_descendants[1].parent

### Siblings

In [101]:
ul

In [102]:
ul.li.next_sibling

In [103]:
ul.li.next_sibling.next_sibling

In [104]:
ul.li.next_sibling.next_sibling.previous_sibling

### Extracting text

In [105]:
soup.ul

In [106]:
soup.ul.text

In [107]:
soup.ul.string  # .string is None when a tag has more than one child (as this <ul> does); use .text / get_text() to collect descendant text

In [108]:
soup.ul.a.string, type(soup.ul.a.string)

In [109]:
soup.ul.text

In [110]:
soup.ul.get_text()

In [111]:
soup.ul.get_text(strip=True)

In [112]:
soup.ul.get_text(separator="==>")

In [113]:
soup.ul.get_text(separator="==>", strip=True)

### All strings

In [114]:
all_strings = list(soup.strings)
all_strings[:5]

In [115]:
soup.stripped_strings

In [116]:
all_strings = list(soup.stripped_strings)
all_strings[:5]

### Search

In [117]:
len(soup.find_all())

In [118]:
len(soup.find_all("h1"))

In [119]:
len(soup.find_all("h3"))

In [120]:
len(soup.find_all(["h1", "h3"]))

In [121]:
price_tags = soup.find_all("p", attrs={"class": "price_color"})
price_tags

In [122]:
[price_tag.text for price_tag in price_tags]

In [123]:
soup.find_all("p", class_="price_color")  # instead of attrs

In [124]:
rating_stars = soup.find_all("p", attrs={"class": "star-rating"})
len(rating_stars)

In [125]:
rating_stars = soup.find_all("p", attrs={"class": "star-rating Four"})
len(rating_stars)

In [126]:
rating_stars = soup.find_all("p", attrs={"class": "star-rating Five"})
len(rating_stars)

In [127]:
rating_stars = soup.find_all("p", attrs={"class": lambda x: x in ("Four", "Five")})
len(rating_stars)

### Challenge

- Extract the following elements from the 1st page of books.toscrape.com:

  * full book title 
  * price as float
  * rating as int  

- Data should be stored as a Python list of dictionaries, where each book is a dictionary  

-  e.g.
```python  
  {
    'title': 'Mesaerion: The Best Science Fiction Stories 1800-1849',
    'price': 37.59,
    'rating': 1
  }
``` 

### Solution

In [128]:
def fetch(url="https://books.toscrape.com/", timeout=10):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Page to download; defaults to the first page of books.toscrape.com.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``BeautifulSoup`` object built from the response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    resp = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently parsing an error page into zero books.
    resp.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup guesses (and warns),
    # so results could differ between environments.
    return BeautifulSoup(resp.content, "html.parser")


# The star rating is encoded as a CSS class word on the rating <p> element.
_RATING_WORDS = {"One": 1, "Two": 2, "Three": 3, "Four": 4, "Five": 5}


def _price_to_float(raw_price):
    """Convert a price label such as '£51.77' to a float, dropping every non-numeric character."""
    return float("".join(ch for ch in raw_price if ch in "0123456789."))


def parse(soup):
    """Extract title, price and rating for every book listed in ``soup``.

    Args:
        soup: Parsed listing page; each book is an ``<article class="product_pod">``.

    Returns:
        A list of dicts with keys ``title`` (str), ``price`` (float) and
        ``rating`` (int 1-5, or ``None`` when no known rating word is present).

    Raises:
        ValueError: If a price label contains no parsable number.
    """
    items = []
    for article in soup.find_all("article", class_="product_pod"):
        # The visible link text is truncated; the full title is in the attribute.
        title = article.find("h3").find("a")["title"]
        price = _price_to_float(article.find("p", class_="price_color").text)
        # Look for the rating word among all classes rather than assuming it is
        # the last one, so the order of the class attribute does not matter.
        rating_classes = article.find("p", class_="star-rating")["class"]
        rating = next(
            (_RATING_WORDS[name] for name in rating_classes if name in _RATING_WORDS),
            None,
        )
        items.append({"title": title, "price": price, "rating": rating})
    return items

In [129]:
soup = fetch()
books = parse(soup)
books

In [130]:
# alternate approach to cleaning price, using regex
# advantage: works with different currencies
article = soup.find_all("article", class_="product_pod")[0]
raw_price = article.find("p", class_="price_color").text
raw_price

In [131]:
import re

price = float(re.sub("[^0-9.]", "", raw_price))
price

### pandas

In [132]:
# !pip install pandas

In [133]:
import pandas as pd

In [134]:
df = pd.DataFrame(books)
df

Unnamed: 0,title,price,rating
0,A Light in the Attic,51.77,3
1,Tipping the Velvet,53.74,1
2,Soumission,50.1,1
3,Sharp Objects,47.82,4
4,Sapiens: A Brief History of Humankind,54.23,5
5,The Requiem Red,22.65,1
6,The Dirty Little Secrets of Getting Your Dream...,33.34,4
7,The Coming Woman: A Novel Based on the Life of...,17.93,3
8,The Boys in the Boat: Nine Americans and Their...,22.6,4
9,The Black Maria,52.15,1


In [135]:
# average of all prices
df.price.mean()

In [136]:
# books where price is less than 20
df[df.price < 20].title.values

In [137]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_csv.html
df.to_csv("books.csv", index=False)

In [138]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_json.html
df.to_json("books.json", orient="records")

### orient options for to_json

In [139]:
import os

base = "to_json_orient_options"
os.makedirs(base, exist_ok=True)

In [140]:
orient_options = ["split", "records", "index", "columns", "values"]
books_sample = df[:5]

In [141]:
for orient_option in orient_options:
    books_sample.to_json(f"{base}/{orient_option}.json", orient=orient_option)

### Functional Search Patterns

In [142]:
soup.find_all(id="messages")

In [143]:
soup.find_all(attrs={"id": "messages"})

In [144]:
len(soup.find_all(attrs={"id": lambda x: x is not None}))

In [145]:
len(soup.find_all(id=lambda x: x is not None))

In [146]:
len(soup.find_all(lambda x: x.has_attr("id")))

In [147]:
def fiction_category_anchor(tag):
    """Return True for ``<a>`` tags that link to a category and mention "Fiction".

    Meant to be passed to ``find_all`` as a filter function, which calls it
    once per tag in the document.
    """
    # .get() with a default keeps an <a> without an href from raising KeyError
    # and aborting the whole search.
    return (
        tag.name == "a"
        and "category" in tag.get("href", "")
        and "Fiction" in tag.text
    )

In [148]:
fiction = soup.find_all(fiction_category_anchor)
fiction

In [149]:
len(fiction)

### Text Search

In [150]:
soup.find_all(string="Fiction")

In [151]:
import re

re.compile("Fiction", re.I)

In [152]:
soup.find_all(string=re.compile("Fiction", re.I))

In [153]:
text_matches = soup.find_all(string=re.compile("Fiction", re.I))

In [154]:
[text.strip() for text in text_matches]

### Searching By CSS

In [155]:
book_tags = soup.find_all("article", attrs={"class": "product_pod"})

In [156]:
titles = []
for tag in book_tags:
    title = tag.find("h3").find("a")["title"]
    titles.append(title)

titles

In [157]:
title_tags = soup.select("article.product_pod > h3 > a")
titles = [tag["title"] for tag in title_tags]
titles

In [158]:
soup.select("[title]")

In [159]:
soup.select("[title*=Human]")

In [160]:
len(soup.select("button.btn-primary[data-loading-text][class*=primary]"))

In [161]:
len(soup.select("button"))

### Just One Tag

In [162]:
soup.find_all("a", limit=1)

In [163]:
soup.find("a")

In [164]:
soup.find_all("a", limit=1)[0] is soup.find("a")

In [165]:
soup.select("a", limit=1)

In [166]:
soup.select_one("a")

In [167]:
soup.select("a", limit=1)[0] is soup.select_one("a")