In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Set up Splinter

executable_path = {'executable_path': ChromeDriverManager().install()}

browser = Browser('chrome', **executable_path, headless=False)

# Scrape the Title

In [3]:
# Visit the Quotes to Scrape site

# This code tells Splinter which site we want to visit by assigning the link to a URL.
# After executing the cell avove, we will use BeautifulSoup to parse the HTML. In the
# next cell, we'll two more lines of code.

url = 'http://quotes.toscrape.com/'
browser.visit(url)

In [4]:
# Parse the HTML

html = browser.html
html_soup = soup(html, 'html.parser')

# Now we've parsed all of the HTML on the page. This means that BeautifulSoup has taken a look at the 
# different components and can now access them. Specifically, BeautifulSoup parses HTML text and then 
# stores it as an object. 

# In our code, we're using 'html.parser' to parse the information, but there are other options available, as well.

# In our next cell, we will find the title and extract it.

In [5]:
# Scrape the title
title = html_soup.find('h2').text
title

# What we done with the last 2 lines of code is:

#  1. We used our html_soup object we created and chained find() to it to search for the <h2/> tag.
#  2. We've also extracted only the text within the HTML tags by adding .text to the end of the code.

# We've completed our first actual scrape. Let's practice again, this time using Splinter ti scrape
# tags to go with the title we just pulled.

'Top Ten tags'

# Scrape All of the Tags

# ![image.png](attachment:image.png)

# ![image.png](attachment:image.png)


In [6]:
# Scrape the top ten tags
tag_box = html_soup.find('div', class_='tags-box')
# tag_box
tags = tag_box.find_all('a', class_='tag')

for tag in tags:
    word = tag.text
    print(word)

love
inspirational
life
humor
books
reading
friendship
friends
truth
simile


# Scrape Across Pages

# ![image.png](attachment:image.png)


# ![image-2.png](attachment:image-2.png)

In [7]:
url = 'http://quotes.toscrape.com/'
browser.visit(url)

# ![image.png](attachment:image.png)

In [8]:
# We'll use range(1,6) in our for loop to visit the first five pages of the website.

for x in range(1, 6):
   html = browser.html
   quote_soup = soup(html, 'html.parser')
   quotes = quote_soup.find_all('span', class_='text')
   for quote in quotes:
      print('page:', x, '----------')
      print(quote.text)
   browser.links.find_by_partial_text('Next').click()

page: 1 ----------
“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
page: 1 ----------
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
page: 1 ----------
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
page: 1 ----------
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
page: 1 ----------
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
page: 1 ----------
“Try not to become a man of success. Rather become a man of value.”
page: 1 ----------
“It is better to be hated for what you are than to be loved for what you are not.”
page: 1 ----------
“I have not failed. I've just found 10,000 ways that won't work.”
page: 1 ----------
“A woman is like a tea bag; you never know how strong it is u

# ![image.png](attachment:image.png)

In [9]:
# It's important to note that there are many ways that BeautifulSoup can
# seach for text, but the syntax is typically the same:
# we look for the tag first, then an attribute.
# We can search for itmes using only a tag, such as <span /> or <h1 />,
# but a class or id makes the seach much more specific.

# By including an attribute, we have a far better chance of scraping the data we want.


In [10]:
url = 'http://books.toscrape.com/'
browser.visit(url)

In [22]:
for x in range(1, 20):
   html = browser.html
   quote_soup = soup(html, 'html.parser')
 #  quotes = quote_soup.find_all('span', class_='quote')
   quotes = quote_soup.find_all( 'ol', class_='row')
   # for quote in quotes:
   print('page:', x, '----------')
   print(quotes)
   # browser.links.find_by_partial_text('Next').click()

page: 1 ----------
[<ol class="row">
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="../../../dont-be-a-jerk-and-other-practical-advice-from-dogen-japans-greatest-zen-master_890/index.html"><img alt="Don't Be a Jerk: And Other Practical Advice from Dogen, Japan's Greatest Zen Master" class="thumbnail" src="../../../../media/cache/95/30/953013d044aa313cc162dec414f3969a.jpg"/></a>
</div>
<p class="star-rating Two">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="../../../dont-be-a-jerk-and-other-practical-advice-from-dogen-japans-greatest-zen-master_890/index.html" title="Don't Be a Jerk: And Other Practical Advice from Dogen, Japan's Greatest Zen Master">Don't Be a Jerk: ...</a></h3>
<div class="product_price">
<p class="price_color">£37.97</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In sto

page: 12 ----------
[<ol class="row">
<li class="col-xs-6 col-sm-4 col-md-3 col-lg-3">
<article class="product_pod">
<div class="image_container">
<a href="../../../dont-be-a-jerk-and-other-practical-advice-from-dogen-japans-greatest-zen-master_890/index.html"><img alt="Don't Be a Jerk: And Other Practical Advice from Dogen, Japan's Greatest Zen Master" class="thumbnail" src="../../../../media/cache/95/30/953013d044aa313cc162dec414f3969a.jpg"/></a>
</div>
<p class="star-rating Two">
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
<i class="icon-star"></i>
</p>
<h3><a href="../../../dont-be-a-jerk-and-other-practical-advice-from-dogen-japans-greatest-zen-master_890/index.html" title="Don't Be a Jerk: And Other Practical Advice from Dogen, Japan's Greatest Zen Master">Don't Be a Jerk: ...</a></h3>
<div class="product_price">
<p class="price_color">£37.97</p>
<p class="instock availability">
<i class="icon-ok"></i>
    
        In st