In [None]:
ai_html = """
<html>
  <head>
   <title>
     Web Scraping 101 - by aiadventures
   </title>
  </head>
  <body>
    <div id="course">
      <h3> Courses at
        <a href="www.aiadventures.in">aiadventures</a>
      </h3>
      <ul>
        <li>Python</li>
        <li>Data Science</li>
        <li>Machine Learning</li>
        <li>Deep Learning</li>
        <li>Computer Vision</li>
      </ul>
    </div>
    <div class="follow_us">
      <h3> Follow Us </h3>
      <ul>
        <li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
        <li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
        <li><a href="https://medium.com/aiadventures">Medium</a></li>
        <li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
      </ul>
    </div>
  </body>
</html>
"""

In [None]:
# Render the sample HTML string inline so the page can be compared with its markup.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML(ai_html))

In [None]:
# import the BeautifulSoup Library
from bs4 import BeautifulSoup as bs

In [None]:
# create a soup object
# NOTE(review): no parser is named here, so BeautifulSoup picks one from what is
# installed; the whitespace in the printed tree below may differ between
# environments -- confirm before relying on the exact output.
soup = bs(ai_html)
soup

<html>
<head>
<title>
     Web Scraping 101 - by aiadventures
   </title>
</head>
<body>
<div id="course">
<h3> Courses at
        <a href="www.aiadventures.in">aiadventures</a>
</h3>
<ul>
<li>Python</li>
<li>Data Science</li>
<li>Machine Learning</li>
<li>Deep Learning</li>
<li>Computer Vision</li>
</ul>
</div>
<div class="follow_us">
<h3> Follow Us </h3>
<ul>
<li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
<li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
<li><a href="https://medium.com/aiadventures">Medium</a></li>
<li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
</ul>
</div>
</body>
</html>

The output looks exactly the same. But under the hood, the complete string has been parsed and organised in the form of a tree, for easy access. For example,

In [None]:
soup.title

<title>
     Web Scraping 101 - by aiadventures
   </title>

In [None]:
soup.head

<head>
<title>
     Web Scraping 101 - by aiadventures
   </title>
</head>

## Selecting Tags

Searching by Tag names

In [None]:
soup.find("title")

<title>
     Web Scraping 101 - by aiadventures
   </title>

find() returns only the first matching tag (element).

In [None]:
soup.find_all("div")

[<div id="course">
 <h3> Courses at
         <a href="www.aiadventures.in">aiadventures</a>
 </h3>
 <ul>
 <li>Python</li>
 <li>Data Science</li>
 <li>Machine Learning</li>
 <li>Deep Learning</li>
 <li>Computer Vision</li>
 </ul>
 </div>,
 <div class="follow_us">
 <h3> Follow Us </h3>
 <ul>
 <li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
 <li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
 <li><a href="https://medium.com/aiadventures">Medium</a></li>
 <li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
 </ul>
 </div>]

find_all() returns a list of all the tags or elements.

In [None]:
soup.find_all(["h3","ul"])

[<h3> Courses at
         <a href="www.aiadventures.in">aiadventures</a>
 </h3>,
 <ul>
 <li>Python</li>
 <li>Data Science</li>
 <li>Machine Learning</li>
 <li>Deep Learning</li>
 <li>Computer Vision</li>
 </ul>,
 <h3> Follow Us </h3>,
 <ul>
 <li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
 <li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
 <li><a href="https://medium.com/aiadventures">Medium</a></li>
 <li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
 </ul>]

Select multiple tags by passing a list of tags to the find_all function

## Searching by Tag attributes

In [None]:
soup.find("div",id="course")

<div id="course">
<h3> Courses at
        <a href="www.aiadventures.in">aiadventures</a>
</h3>
<ul>
<li>Python</li>
<li>Data Science</li>
<li>Machine Learning</li>
<li>Deep Learning</li>
<li>Computer Vision</li>
</ul>
</div>

In [None]:
soup.find("div", class_="follow_us")

<div class="follow_us">
<h3> Follow Us </h3>
<ul>
<li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
<li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
<li><a href="https://medium.com/aiadventures">Medium</a></li>
<li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
</ul>
</div>

In [None]:
soup.find('div',id=True)

<div id="course">
<h3> Courses at
        <a href="www.aiadventures.in">aiadventures</a>
</h3>
<ul>
<li>Python</li>
<li>Data Science</li>
<li>Machine Learning</li>
<li>Deep Learning</li>
<li>Computer Vision</li>
</ul>
</div>

The above code selects the first "div" that has an "id" attribute (use find_all() to get every such "div").

## Regular Expressions

In [None]:
import re
soup.find(re.compile("div"), class_=re.compile('follow_us'))

<div class="follow_us">
<h3> Follow Us </h3>
<ul>
<li><a href="https://www.instagram.com/aiadventures.pune">Instagram</a></li>
<li><a href="https://www.linkedin.com/company/aiadventures">LinkedIn</a></li>
<li><a href="https://medium.com/aiadventures">Medium</a></li>
<li><a href="https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw">Youtube</a></li>
</ul>
</div>

## Accessing Information

Every tag has 3 major components:

1) Tag name

2) Text between the open & close tags, called Inner text.

3) Tag attributes and its values


In [None]:
title_tag = soup.find("title")
title_tag

<title>
     Web Scraping 101 - by aiadventures
   </title>

### Tag name

In [None]:
title_tag.name

'title'

### Inner text

In [None]:
title_tag.text

'\n     Web Scraping 101 - by aiadventures\n   '

### Attribute values

In [None]:
a_tag = soup.find('a')
a_tag

<a href="www.aiadventures.in">aiadventures</a>

Once you have the tag, just think of it as a dictionary. You can easily access any attribute by passing it as a key.

In [None]:
a_tag['href']

'www.aiadventures.in'

Once you know how to extract attribute values, you can easily extract all the links by running the following code

In [None]:
[a_tag['href'] for a_tag in soup.find_all('a')]

['www.aiadventures.in',
 'https://www.instagram.com/aiadventures.pune',
 'https://www.linkedin.com/company/aiadventures',
 'https://medium.com/aiadventures',
 'https://www.youtube.com/channel/UCPZqWUIXZAs926TBRclhUGw']

## Scraping a Real web page

The Requests library is used to fetch the HTML code of a web page.

In [None]:
import requests

In [None]:
url = 'http://books.toscrape.com/'
response = requests.get(url)
response

<Response [200]>

Now we can successfully access the HTML code for this web page. Let's look at the first 1000 characters.

In [None]:
print(response.text[:1000])

<!DOCTYPE html>
<!--[if lt IE 7]>      <html lang="en-us" class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]>         <html lang="en-us" class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html lang="en-us" class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html lang="en-us" class="no-js"> <!--<![endif]-->
    <head>
        <title>
    All products | Books to Scrape - Sandbox
</title>

        <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
        <meta name="created" content="24th Jun 2016 09:29" />
        <meta name="description" content="" />
        <meta name="viewport" content="width=device-width" />
        <meta name="robots" content="NOARCHIVE,NOCACHE" />

        <!-- Le HTML5 shim, for IE6-8 support of HTML elements -->
        <!--[if lt IE 9]>
        <script src="//html5shim.googlecode.com/svn/trunk/html5.js"></script>
        <![endif]-->

        
            <link rel="shortcut icon" href="static/oscar/favicon.

In [None]:
# type of response.text
type(response.text)

str

Since the text is simply a Python string, we can pass it directly to BeautifulSoup to get the soup object.

In [None]:
soup = bs(response.text)
type(soup)

bs4.BeautifulSoup

### Site Title

In [None]:
soup.find('title').text.strip()

'All products | Books to Scrape - Sandbox'

### Extract Information about the book
It's time to inspect the HTML tags and identify the tag that holds each book, so that we can extract information about the books.

tag name: article
class name: product_pod

In [None]:
books_tag = soup.find_all("article",class_ = "product_pod")

In [None]:
# Let's look at the number of books in the page
len(books_tag)

20

There are 20 books in this page

Let's try to find out information about a single book

In [None]:
book_tag = books_tag[0]
book_tag

<article class="product_pod">
<div class="image_container">
<a href="catalogue/a-light-in-the-attic_1000/index.html"><img alt="A Light in the Attic" class="thumbnail" src="media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"/></a>
</div>
<p class="star-rating Three">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a></h3>
<div class="product_price">
<p class="price_color">Â£51.77</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In stock
    
</p>
<form>
<button class="btn btn-primary btn-block" data-loading-text="Adding..." type="submit">Add to basket</button>
</form>
</div>
</article>

In [None]:
# Book title is under h3, present inside the 'a' tag
title_tag = book_tag.find('a',title=True)
title_tag

<a href="catalogue/a-light-in-the-attic_1000/index.html" title="A Light in the Attic">A Light in the ...</a>

In [None]:
title_tag['title']

'A Light in the Attic'

Similarly extract ratings, price & book_link.

In [None]:
#rating
rating = book_tag.find("p")['class'][1]
rating

'Three'

In [None]:
# Price
price = book_tag.find('p',class_="price_color").text[1:]       # It's all just string manipulation
price

'£51.77'

In [None]:
#Book_link
link = "http://books.toscrape.com/" + book_tag.find('a')['href']
link

'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'

Create a function that returns the above information for any book by just passing its book_tag as an argument

In [None]:
def get_book_details(book_tag, base_url="http://books.toscrape.com/"):
  """Extract the title, rating, price and absolute link of a single book.

  Args:
    book_tag: the ``<article class="product_pod">`` tag of one book.
    base_url: URL of the page the tag was scraped from. The book's relative
      ``href`` is resolved against it; the default is the site's home page.

  Returns:
    A ``(title, rating, price, link)`` tuple of strings.
  """
  # Local import so that this cell does not depend on any other cell's imports.
  from urllib.parse import urljoin

  # The full title lives in the ``title`` attribute; the anchor text is truncated.
  title = book_tag.find('a', title=True)['title']
  # The first <p> has the classes ["star-rating", "<Rating>"]; keep the rating word.
  rating = book_tag.find("p")['class'][1]
  # Drop the first character of the price text (it shows up as a stray 'Â'
  # in front of '£' in the scraped markup).
  price = book_tag.find('p', class_="price_color").text[1:]
  # Resolve the relative href against the page it came from instead of blindly
  # concatenating it to the site root.
  link = urljoin(base_url, book_tag.find('a')['href'])

  return title, rating, price, link


### Write some more functions

In [None]:
# get_soup function will take URL and return a soup object
def get_soup(url, timeout=10):
  """Download ``url`` and parse it into a BeautifulSoup object.

  Args:
    url: address of the page to fetch.
    timeout: seconds to wait for the server before ``requests`` gives up,
      so that an unresponsive server cannot block the notebook forever.

  Returns:
    The parsed soup, or ``None`` when the response status is not 200.
  """
  resp = requests.get(url, timeout=timeout)
  if resp.status_code != 200:
    return None
  # Name the parser explicitly so that the result does not depend on which
  # optional parsers happen to be installed.
  return bs(resp.text, "html.parser")

# get_books function will return information about every book listed on a page
def get_books(url):
  """Scrape one listing page and return the details of every book on it.

  Args:
    url: address of a listing page.

  Returns:
    A list of ``(title, rating, price, link)`` tuples, one per book; the list
    is empty when the page could not be fetched.
  """
  soup = get_soup(url)
  if soup is None:
    # get_soup returns None for a non-200 response; there is nothing to parse.
    return []

  book_tags = soup.find_all('article', class_='product_pod')
  return [get_book_details(book_tag) for book_tag in book_tags]

In [None]:
url= "http://books.toscrape.com/"
books = get_books(url)
print(f'There are {len(books)} books in this page')

There are 20 books in this page


Let's have a look at books

In [None]:
books[:3]

[('A Light in the Attic',
  'Three',
  '£51.77',
  'http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html'),
 ('Tipping the Velvet',
  'One',
  '£53.74',
  'http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'),
 ('Soumission',
  'One',
  '£50.10',
  'http://books.toscrape.com/catalogue/soumission_998/index.html')]

Now we can write a function that scrapes the site page by page and collects the books into a pandas DataFrame; with enough pages it can extract all 1000 books from the website.

In [None]:
import pandas as pd

def get_all_books(page = 3):
    """Scrape the first ``page`` catalogue pages into a DataFrame.

    Args:
        page: number of catalogue pages to scrape, starting from page 1.

    Returns:
        A DataFrame with the columns ``title``, ``rating``, ``price`` and
        ``link``, one row per book. Pages that cannot be fetched are skipped.
    """
    # Local import so that this cell does not depend on any other cell's imports.
    from urllib.parse import urljoin

    books = []
    for i in range(1, page+1):
        ## This is how the url changes with every page
        url = f'http://books.toscrape.com/catalogue/page-{i}.html'
        soup = get_soup(url)
        if not soup:
            continue

        for book_tag in soup.find_all('article', class_='product_pod'):
            title, rating, price, _ = get_book_details(book_tag)
            # On catalogue pages the href is relative to the page itself, so
            # resolve it against the page URL; joining it to the site root
            # would drop the "/catalogue/" part of the path.
            link = urljoin(url, book_tag.find('a')['href'])
            books.append((title, rating, price, link))

    return pd.DataFrame(books, columns=['title', 'rating', 'price', 'link'])

Scrape first 3 pages to test our code

In [None]:
df = get_all_books(3)
df.head()

Unnamed: 0,title,rating,price,link
0,A Light in the Attic,Three,£51.77,http://books.toscrape.com/a-light-in-the-attic...
1,Tipping the Velvet,One,£53.74,http://books.toscrape.com/tipping-the-velvet_9...
2,Soumission,One,£50.10,http://books.toscrape.com/soumission_998/index...
3,Sharp Objects,Four,£47.82,http://books.toscrape.com/sharp-objects_997/in...
4,Sapiens: A Brief History of Humankind,Five,£54.23,http://books.toscrape.com/sapiens-a-brief-hist...


In [None]:
df.shape

(60, 4)

## FINAL CODE

In [None]:
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

def get_soup(url, timeout=10):
    """Download ``url`` and parse it into a BeautifulSoup object.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The parsed soup, or ``None`` when the request fails or the response
        status is not 200.
    """
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network/HTTP-level failures are treated as "no page"; anything
        # else (e.g. a keyboard interrupt) is allowed to propagate.
        return None

    if resp.status_code != 200:
        return None

    # Name the parser explicitly so that the result does not depend on which
    # optional parsers happen to be installed.
    return bs(resp.text, "html.parser")

def _table_value(soup, header):
  """Return the stripped text of the <td> following the <th> labelled ``header``."""
  return soup.find("th", string=header).find_next("td").text.strip()


def get_each_book_info(link):
  """Scrape the product-information table of a single book page.

  Args:
    link: absolute URL of the book's detail page.

  Returns:
    A ``(price_excl, price_incl, tax, availability, no_reviews)`` tuple of
    strings, or ``None`` when the page cannot be fetched or any of the five
    table rows is missing.
  """
  soup = get_soup(link)
  if not soup:
    # Without this guard the function fell through to the final ``return``
    # with none of the values assigned.
    return None

  try:
    # The three money columns drop their first character, as in the listing
    # page price.
    price_exc = _table_value(soup, "Price (excl. tax)")[1:]
    price_incl = _table_value(soup, "Price (incl. tax)")[1:]
    tax = _table_value(soup, "Tax")[1:]
    availability = _table_value(soup, "Availability")
    no_reviews = _table_value(soup, "Number of reviews")
  except AttributeError:
    # find()/find_next() returned None: the expected row is not on the page.
    return None

  return price_exc, price_incl, tax, availability, no_reviews


def get_details(book_tag):
    """Collect every available field for one book.

    Args:
        book_tag: the ``<article class="product_pod">`` tag of one book taken
            from a catalogue page.

    Returns:
        A 9-tuple ``(title, rating, price, link, price_excl, price_incl, tax,
        Availability, No_reviews)``. Any field that cannot be extracted is
        ``None``; the function itself never raises for a malformed tag.
    """
    # What a failed lookup raises: find() returned None (TypeError when
    # subscripted, AttributeError on .text), the attribute is absent
    # (KeyError), or the class list is too short (IndexError).
    lookup_errors = (AttributeError, KeyError, IndexError, TypeError)

    ## title
    try:
        title = book_tag.find('a', title=True)['title']
    except lookup_errors:
        title = None

    ## rating
    try:
        rating = book_tag.find('p')['class'][1]
    except lookup_errors:
        rating = None

    ## Price
    try:
        # Drop the first character of the price text (a stray character in
        # front of the pound sign in the scraped markup).
        price = book_tag.find('p', class_='price_color').text[1:]
    except lookup_errors:
        price = None

    ## Link
    try:
        link = 'http://books.toscrape.com/catalogue/' + book_tag.find('a')['href']
    except lookup_errors:
        link = None

    ## Fields from the book's own page
    # Start from None so that a failure below can no longer leave these names
    # unassigned (and no longer wipes out the price scraped above).
    price_excl = price_incl = tax = Availability = No_reviews = None
    if link is not None:
        try:
            info = get_each_book_info(link)
        except Exception:
            # Best effort: a broken detail page must not lose the listing data.
            info = None
        if info is not None:
            price_excl, price_incl, tax, Availability, No_reviews = info

    return title, rating, price, link, price_excl, price_incl, tax, Availability, No_reviews



def get_all_books(page = 3):
    """Scrape the first ``page`` catalogue pages, including per-book details.

    Args:
        page: number of catalogue pages to scrape, starting from page 1.

    Returns:
        A DataFrame with one row per book and the columns ``title``,
        ``rating``, ``price``, ``link``, ``price_excl``, ``price_incl``,
        ``Tax``, ``Availability`` and ``No_of_Reviews``. Pages that cannot be
        fetched are skipped.
    """
    columns = ['title', 'rating', 'price', 'link','price_excl','price_incl','Tax','Availability','No_of_Reviews']

    books = []
    for i in range(1, page+1):
        url = f'http://books.toscrape.com/catalogue/page-{i}.html'
        soup = get_soup(url)
        if not soup:
            continue

        try:
            for book_tag in soup.find_all('article', class_='product_pod'):
                books.append(get_details(book_tag))
        except Exception:
            # Report and move on to the next page. ``Exception`` (rather than a
            # bare ``except``) keeps Ctrl-C / kernel interrupt working during
            # a long run.
            print(f'Error reading page {i} . . .')

        time.sleep(1) # sleep before making the next request

    return pd.DataFrame(books, columns=columns)

In [None]:
df = get_all_books(50)
df.head(5)

Unnamed: 0,title,rating,price,link,price_excl,price_incl,Tax,Availability,No_of_Reviews
0,A Light in the Attic,Three,£51.77,http://books.toscrape.com/catalogue/a-light-in...,£51.77,£51.77,£0.00,In stock (22 available),0
1,Tipping the Velvet,One,£53.74,http://books.toscrape.com/catalogue/tipping-th...,£53.74,£53.74,£0.00,In stock (20 available),0
2,Soumission,One,£50.10,http://books.toscrape.com/catalogue/soumission...,£50.10,£50.10,£0.00,In stock (20 available),0
3,Sharp Objects,Four,£47.82,http://books.toscrape.com/catalogue/sharp-obje...,£47.82,£47.82,£0.00,In stock (20 available),0
4,Sapiens: A Brief History of Humankind,Five,£54.23,http://books.toscrape.com/catalogue/sapiens-a-...,£54.23,£54.23,£0.00,In stock (20 available),0


Now save the DataFrame in the csv format



In [None]:
df.to_csv('Books.csv',index=False)