# Webscraping on http://quotes.toscrape.com/

**Importing the necessary libraries to scrape websites**

In [2]:
import requests
import bs4

**Use the requests and BeautifulSoup libraries to connect to the website and get the HTML text from the homepage**

In [3]:
# timeout: requests waits indefinitely by default, so a stalled connection would hang the kernel
result = requests.get("http://quotes.toscrape.com/", timeout=10)
# fail loudly on a 4xx/5xx response instead of silently parsing an error page
result.raise_for_status()

In [33]:
# peek at the start of the document (first 500 characters) to get a rough outline of the HTML
html_text = result.text
html_text[:500]

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md'

In [7]:
# parse the raw HTML into a BeautifulSoup tree (using the lxml parser) so it can be queried with CSS selectors
soup = bs4.BeautifulSoup(markup=result.text, features="lxml")

## TASK: Extract names of all unique authors on first page

In [14]:
# collect the text of every element with class "author";
# building a set keeps each name only once, however many quotes the author has
authors = {tag.get_text() for tag in soup.select(".author")}

print("Page 1:")
authors

Page 1:


{'Albert Einstein',
 'André Gide',
 'Eleanor Roosevelt',
 'J.K. Rowling',
 'Jane Austen',
 'Marilyn Monroe',
 'Steve Martin',
 'Thomas A. Edison'}

## TASK: Extract names of all unique authors for each page on website

**All pages on website follow a format of 'http://quotes.toscrape.com/page/1/',
where the number 1 represents the page number that can be replaced with any number**

In [35]:
# URL template for the paginated listing; the {} placeholder is filled with a page number via str.format
base_url = "http://quotes.toscrape.com/page/{}/"

In [36]:
base_url.format(1)  # example: the URL of page 1

'http://quotes.toscrape.com/page/1/'

**Any page after the last page (there are 10 pages in total) will contain text that reads "No quotes found!". The class of the element that contains this text is 'col-md-8'**

In [41]:
# request a page past the end (page 11) to see what the site serves there;
# timeout so a stalled connection raises instead of hanging the kernel
res = requests.get(base_url.format(11), timeout=10)
soup = bs4.BeautifulSoup(res.text, 'lxml')

# both the title block and the content block use this class, so two elements come back
soup.select('.col-md-8')

[<div class="col-md-8">
 <h1>
 <a href="/" style="text-decoration: none">Quotes to Scrape</a>
 </h1>
 </div>,
 <div class="col-md-8">
 
 No quotes found!
 
     <nav>
 <ul class="pager">
 <li class="previous">
 <a href="/page/10/"><span aria-hidden="true">←</span> Previous</a>
 </li>
 </ul>
 </nav>
 </div>]

In [49]:
# page counter; incremented at the top of each iteration, so the first request is page 1
i = 0

while True:
    # Loop through each page and print the unique authors on that page
    i += 1  # go to next page

    # timeout so a stalled connection raises instead of hanging the kernel
    # NOTE(review): no raise_for_status() here -- the status code the site returns
    # for pages past the end is not confirmed, so the page body is inspected instead
    res = requests.get(base_url.format(i), timeout=10)
    soup = bs4.BeautifulSoup(res.text, 'lxml')

    # pages past the last one show "No quotes found!" inside a .col-md-8 element
    no_quotes = any("No quotes found!" in div_class.text
                    for div_class in soup.select('.col-md-8'))

    # End loop if there are no quotes
    if no_quotes:
        print("Last page is", i-1)
        break

    # a set keeps each author once even when they have several quotes on the page
    authors = {item.text for item in soup.select(".author")}

    # Safety net: if the end-of-site marker is missing (error page, changed wording)
    # the loop would otherwise request pages forever, so stop on an empty page
    if not authors:
        print("No authors found on page", i, "- stopping")
        break

    print("Page " + str(i) + ":")
    print(authors,"\n")

Page 1:
{'Thomas A. Edison', 'Eleanor Roosevelt', 'J.K. Rowling', 'André Gide', 'Jane Austen', 'Marilyn Monroe', 'Steve Martin', 'Albert Einstein'} 

Page 2:
{'Friedrich Nietzsche', 'Mark Twain', 'Allen Saunders', 'Douglas Adams', 'Bob Marley', 'J.K. Rowling', 'Dr. Seuss', 'Marilyn Monroe', 'Elie Wiesel', 'Albert Einstein'} 

Page 3:
{'Jim Henson', 'Mother Teresa', 'Bob Marley', 'J.K. Rowling', 'Dr. Seuss', 'Pablo Neruda', 'Garrison Keillor', 'Ralph Waldo Emerson', 'Albert Einstein'} 

Page 4:
{'Mother Teresa', 'Charles M. Schulz', 'Bob Marley', 'J.K. Rowling', 'Dr. Seuss', 'William Nicholson', 'George Eliot', 'Jorge Luis Borges', 'Albert Einstein'} 

Page 5:
{'Martin Luther King Jr.', 'J.K. Rowling', 'James Baldwin', 'George R.R. Martin', 'Marilyn Monroe', 'C.S. Lewis', 'Albert Einstein'} 

Page 6:
{'Haruki Murakami', 'Ernest Hemingway', 'Eleanor Roosevelt', 'Stephenie Meyer', 'Helen Keller', 'Alexandre Dumas fils', 'George Bernard Shaw', 'Jane Austen', 'Marilyn Monroe', 'Albert Einst

## TASK: Create a list of all the quotes on the first page

In [51]:
# re-fetch the homepage: `soup` was overwritten while paging through the site in earlier cells
# timeout so a stalled connection raises instead of hanging the kernel
result = requests.get("http://quotes.toscrape.com/", timeout=10)
# fail loudly on a 4xx/5xx response instead of silently parsing an error page
result.raise_for_status()
soup = bs4.BeautifulSoup(result.text, "lxml")

In [53]:
# gather the text of every element with class "text", in page order;
# a list (not a set) is used so the order is kept
quotes = [tag.text for tag in soup.select(".text")]

print("Page 1:")
quotes

Page 1:


['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

**We could also print out the quotes with their corresponding author**

In [62]:
# one author name per line, in the order the quotes appear on the page (repeats included)
author_names = [tag.text for tag in soup.select(".author")]
for name in author_names:
    print(name)

Albert Einstein
J.K. Rowling
Albert Einstein
Jane Austen
Marilyn Monroe
Albert Einstein
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin


In [79]:
# print each author next to their quote
# assumes every quote on the page has exactly one ".author" and one ".text" element,
# so the n-th author belongs to the n-th quote
author_tags = soup.select(".author")
quote_tags = soup.select(".text")  # queried once, not once per author
for author, quote in zip(author_tags, quote_tags):
    print(author.text + ": " + quote.text + "\n")

Albert Einstein: “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”

J.K. Rowling: “It is our choices, Harry, that show what we truly are, far more than our abilities.”

Albert Einstein: “There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”

Jane Austen: “The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”

Marilyn Monroe: “Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”

Albert Einstein: “Try not to become a man of success. Rather become a man of value.”

André Gide: “It is better to be hated for what you are than to be loved for what you are not.”

Thomas A. Edison: “I have not failed. I've just found 10,000 ways that won't work.”

Eleanor Roosevelt: “A woman is like a tea bag; you never know how strong it is until it's in hot wat

## TASK: Extract top ten tags shown on the top right of page

In [68]:
# the top-ten tags are wrapped in elements with class "tag-item"
tag_items = soup.select(".tag-item")
tag_items

[<span class="tag-item">
 <a class="tag" href="/tag/love/" style="font-size: 28px">love</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/inspirational/" style="font-size: 26px">inspirational</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/life/" style="font-size: 26px">life</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/humor/" style="font-size: 24px">humor</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/books/" style="font-size: 22px">books</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/reading/" style="font-size: 14px">reading</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friendship/" style="font-size: 10px">friendship</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friends/" style="font-size: 8px">friends</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/truth/" style="font-size: 8px">truth</a>
 </span>,
 <span class="tag-item">
 <a class="

**Note that each tag text contains \n at both ends of the string. So we'll use indexing and slicing to remove the newlines**

In [72]:
# raw text of the first tag item, shown as-is to reveal the surrounding newlines
first_tag_item = soup.select(".tag-item")[0]
first_tag_item.text

'\nlove\n'

In [73]:
first_tag_text = soup.select(".tag-item")[0].text
first_tag_text[1:-1]  # drop the first and last character, i.e. the newline on each side

'love'

In [78]:
# number the tags starting from 1; the [1:-1] slice drops the newline on each side of the tag text
for rank, tag_item in enumerate(soup.select(".tag-item"), start=1):
    print(rank, ":", tag_item.text[1:-1])

1 : love
2 : inspirational
3 : life
4 : humor
5 : books
6 : reading
7 : friendship
8 : friends
9 : truth
10 : simile
