### Web Scraping 101

In [1]:
import requests

In [2]:
base_url = "http://books.toscrape.com/index.html"
home_page = requests.get(base_url)

# Report the outcome of the request: anything other than HTTP 200 is
# treated as a failure and its status code is shown.
if home_page.status_code != 200:
    print(f"Failed to load. Status code : {home_page.status_code}")
else:
    print("Success")

Success


In [5]:
print(home_page.content[:500])

b'<!DOCTYPE html>\n<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->\n<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->\n<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->\n<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->\n    <head>\n        <title>\n    All products | Books to Scrape - Sandbox\n</title>\n\n        <meta http-equiv="content-type" content="text/html; charset=UTF-8" /'


In [6]:
from bs4 import BeautifulSoup

In [9]:
# Parse the downloaded bytes with Python's built-in HTML parser.
# The constructor keyword that selects the parser is `features`
# (there is no documented `parser` keyword).
soup = BeautifulSoup(
        markup = home_page.content,
        features = "html.parser"
        )

In [10]:
# Each matching <li> element wraps one book entry on the listing page.
books = soup.find_all(name = 'li', class_ = 'col-xs-6 col-sm-4 col-md-3 col-lg-3')

In [13]:
len(books)

20

In [15]:
# Use the first book entry to work out the extraction steps.
book_one = books[0]

In [26]:
# book_one

In [17]:
# First <a> tag inside the book entry. `find` is the current method name;
# `findChild` is a legacy alias that newer bs4 releases deprecate.
book_one_anchor = book_one.find('a')

In [18]:
book_one_anchor

<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>

In [19]:
# The anchor's href attribute is a relative path to the book's detail page.
book_one_url = book_one_anchor.get('href')

In [20]:
book_one_url

'catalogue/a-light-in-the-attic_1000/index.html'

In [21]:
# Anchor lookup and href extraction chained into one expression
# (`find` replaces the legacy `findChild` alias).
book_one.find('a').get('href')

'catalogue/a-light-in-the-attic_1000/index.html'

In [22]:
base_url

'http://books.toscrape.com/index.html'

In [23]:
from urllib.parse import urljoin

In [24]:
# Resolve the relative href against the listing-page URL to get an absolute URL.
book_one_url = urljoin(base_url, book_one_url)

In [25]:
book_one_url

'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'

In [27]:
# Download the book's detail page and parse it. The parser is named
# explicitly so the result does not depend on which optional parsers are
# installed (and bs4 does not have to guess one).
the_book_one_page = requests.get(book_one_url).content
book_one_soup = BeautifulSoup(the_book_one_page, "html.parser")

In [31]:
# The book's title is the text of the first <h1> on its detail page.
title = book_one_soup.find('h1').text

In [33]:
title

'A Light in the Attic'

In [34]:
# All <tr> rows on the detail page; each row pairs a <th> label with a <td> value.
book_one_table = book_one_soup.find_all('tr')

In [37]:
len(book_one_table)

7

In [40]:
book_one_table[2]

<tr>
<th>Price (excl. tax)</th><td>£51.77</td>
</tr>

In [42]:
# Start from the page title, then add one entry per table row:
# the <th> text becomes the key and the <td> text becomes the value.
book_one_data = {"Title": title}
book_one_data.update(
    (row.find('th').text, row.find('td').text)
    for row in book_one_table
)


In [43]:
book_one_data

{'Title': 'A Light in the Attic',
 'UPC': 'a897fe39b1053632',
 'Product Type': 'Books',
 'Price (excl. tax)': '£51.77',
 'Price (incl. tax)': '£51.77',
 'Tax': '£0.00',
 'Availability': 'In stock (22 available)',
 'Number of reviews': '0'}

In [45]:
def scrape_book(book_url):
    """Scrape the title and product-information table of one book page.

    Parameters
    ----------
    book_url : str
        Absolute URL of the book's detail page.

    Returns
    -------
    dict
        ``'title'`` mapped to the text of the first ``<h1>``, followed by one
        entry per table row (``<th>`` text -> ``<td>`` text).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    """
    response = requests.get(book_url)
    # Fail loudly on an error response instead of parsing an error page.
    response.raise_for_status()

    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    book_soup = BeautifulSoup(response.content, "html.parser")

    book_data = {}

    title = book_soup.find('h1').text
    book_data['title'] = title

    # For product information iterate and get all key value pairs
    for row in book_soup.find_all('tr'):
        header_cell = row.find('th')
        value_cell = row.find('td')

        # Skip rows that are not a label/value pair (e.g. header-only rows).
        if header_cell is None or value_cell is None:
            continue

        book_data[header_cell.text] = value_cell.text

    return book_data

In [46]:
scrape_book('http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html')

{'title': 'A Light in the Attic',
 'UPC': 'a897fe39b1053632',
 'Product Type': 'Books',
 'Price (excl. tax)': '£51.77',
 'Price (incl. tax)': '£51.77',
 'Tax': '£0.00',
 'Availability': 'In stock (22 available)',
 'Number of reviews': '0'}

In [48]:
scrape_book('https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html')

{'title': 'Tipping the Velvet',
 'UPC': '90fa61229261140a',
 'Product Type': 'Books',
 'Price (excl. tax)': '£53.74',
 'Price (incl. tax)': '£53.74',
 'Tax': '£0.00',
 'Availability': 'In stock (20 available)',
 'Number of reviews': '0'}

In [49]:
def scrape_page(base_url):
    """Scrape every book linked from one listing page.

    Parameters
    ----------
    base_url : str
        URL of the listing page. Relative book links found on the page are
        resolved against it.

    Returns
    -------
    list of dict
        One ``scrape_book`` result per book entry found on the page, in page
        order.

    Raises
    ------
    requests.HTTPError
        If the listing page (or a book page) answers with a 4xx/5xx status.
    """
    response = requests.get(base_url)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.content, "html.parser")

    # Search the page that was just downloaded. The previous version searched
    # the notebook-level `soup`, so the `base_url` argument had no effect on
    # which books were found.
    books = page_soup.find_all(name = 'li', class_ = 'col-xs-6 col-sm-4 col-md-3 col-lg-3')

    book_list = []

    for book in books:
        anchor = book.find('a')
        relative_path = anchor.get('href') if anchor is not None else None

        # An entry without a link cannot be followed; skip it.
        if not relative_path:
            continue

        book_url = urljoin(base_url, relative_path)
        book_list.append(scrape_book(book_url))

    return book_list

In [51]:
books = scrape_page('http://books.toscrape.com/index.html')

In [52]:
len(books)

20