## Collect all the quotes and authors from quotes.toscrape.com

In [1]:
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
from tqdm import tqdm

# Landing page of the demo site; it carries the first batch of quotes.
url = 'http://quotes.toscrape.com/'

# Fetch the page. The timeout keeps the cell from hanging forever when the
# server never answers (requests applies no timeout unless one is given).
response = requests.get(url, timeout=10)
print(response)  # shows the HTTP status, e.g. <Response [200]>

# Stop here on a 4xx/5xx answer instead of silently parsing an error page.
response.raise_for_status()

# Parse the raw response bytes with Python's built-in HTML parser and
# display the page title as the cell's output.
soup = BeautifulSoup(response.content, 'html.parser')
soup.title.string

<Response [200]>


'Quotes to Scrape'

In [2]:
# Flatten the page to plain text and show only a short slice of it
# (characters 3 up to, but not including, 60) because the full text is long.
print(soup.text[3:60])


Quotes to Scrape








Quotes to Scrape




Login







In [3]:
# Render the parsed document as indented HTML so that the nesting of the
# tags is easy to read, then print it.
pretty_html = soup.prettify()
print(pretty_html)

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Quotes to Scrape
  </title>
  <link href="/static/bootstrap.min.css" rel="stylesheet"/>
  <link href="/static/main.css" rel="stylesheet"/>
 </head>
 <body>
  <div class="container">
   <div class="row header-box">
    <div class="col-md-8">
     <h1>
      <a href="/" style="text-decoration: none">
       Quotes to Scrape
      </a>
     </h1>
    </div>
    <div class="col-md-4">
     <p>
      <a href="/login">
       Login
      </a>
     </p>
    </div>
   </div>
   <div class="row">
    <div class="col-md-8">
     <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
      <span class="text" itemprop="text">
       “The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
      </span>
      <span>
       by
       <small class="author" itemprop="author">
        Albert Einstein
       </small>
       <a href="/author/Albert

In [4]:
# Print the target of every hyperlink on the page.
# find_all('a') returns each <a> tag; .get('href') gives None rather than
# raising when a tag carries no href attribute.
for link in soup.find_all('a'):
    print(link.get('href'))

/
/login
/author/Albert-Einstein
/tag/change/page/1/
/tag/deep-thoughts/page/1/
/tag/thinking/page/1/
/tag/world/page/1/
/author/J-K-Rowling
/tag/abilities/page/1/
/tag/choices/page/1/
/author/Albert-Einstein
/tag/inspirational/page/1/
/tag/life/page/1/
/tag/live/page/1/
/tag/miracle/page/1/
/tag/miracles/page/1/
/author/Jane-Austen
/tag/aliteracy/page/1/
/tag/books/page/1/
/tag/classic/page/1/
/tag/humor/page/1/
/author/Marilyn-Monroe
/tag/be-yourself/page/1/
/tag/inspirational/page/1/
/author/Albert-Einstein
/tag/adulthood/page/1/
/tag/success/page/1/
/tag/value/page/1/
/author/Andre-Gide
/tag/life/page/1/
/tag/love/page/1/
/author/Thomas-A-Edison
/tag/edison/page/1/
/tag/failure/page/1/
/tag/inspirational/page/1/
/tag/paraphrased/page/1/
/author/Eleanor-Roosevelt
/tag/misattributed-eleanor-roosevelt/page/1/
/author/Steve-Martin
/tag/humor/page/1/
/tag/obvious/page/1/
/tag/simile/page/1/
/page/2/
/tag/love/
/tag/inspirational/
/tag/life/
/tag/humor/
/tag/books/
/tag/reading/
/tag/fri

In [5]:
# Collect every <span class="text"> element (each one wraps a single quote)
# and display the matches as the cell's output.
allquotes = soup.find_all('span', attrs={'class': 'text'})
allquotes

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>,
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>,
 <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>,
 <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>,
 <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>,
 <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>,
 <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.

In [6]:
# Keep only the text of each quote element, dropping the surrounding markup,
# then echo the resulting strings one per line.
quotes = [span.text for span in allquotes]
for quote_text in quotes:
    print(quote_text)

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


In [7]:
# Author names sit in <small class="author"> tags: find them all, keep just
# the text of each tag, and print the names one per line.
allauthors = soup.find_all('small', class_='author')
authors = [tag.text for tag in allauthors]
for name in authors:
    print(name)

Albert Einstein
J.K. Rowling
Albert Einstein
Jane Austen
Marilyn Monroe
Albert Einstein
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin


## Scrape through all the pages

In [8]:
# Preview the URL of every listing page (1 through 10) before scraping.
# range() states the bounds directly, so the loop cannot run away the way a
# hand-rolled `!=` counter does if its start value or step is ever changed.
for page in range(1, 11):
    url = f"http://quotes.toscrape.com/page/{page}/"
    print(url)

http://quotes.toscrape.com/page/1/
http://quotes.toscrape.com/page/2/
http://quotes.toscrape.com/page/3/
http://quotes.toscrape.com/page/4/
http://quotes.toscrape.com/page/5/
http://quotes.toscrape.com/page/6/
http://quotes.toscrape.com/page/7/
http://quotes.toscrape.com/page/8/
http://quotes.toscrape.com/page/9/
http://quotes.toscrape.com/page/10/


In [9]:
# Quote texts and author names gathered from every visited page, in page order.
allQuotes = []
allAuthors = []

# Visit listing pages 1 through 10. range() replaces the manual `!=` counter,
# which would loop forever if the start value or step were ever changed.
for page in range(1, 11):
    url = f"http://quotes.toscrape.com/page/{page}/"

    # The timeout stops a stalled connection from blocking the cell forever;
    # raise_for_status() aborts on a 4xx/5xx answer so that an error page is
    # never scraped as if it were data.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    html = response.content

    # Use the same built-in parser as the first cell, so the notebook does not
    # additionally require the optional lxml package.
    soup = BeautifulSoup(html, "html.parser")

    # Quotes live in <span class="text">, authors in <small class="author">.
    allQuotes.extend(quote.text for quote in soup.find_all("span", class_="text"))
    allAuthors.extend(author.text for author in soup.find_all("small", class_="author"))

In [10]:
# Remove the redundant curly quotation marks (“ and ”) from the scraped quotes.
# str.maketrans with a third argument builds a table that deletes exactly
# those characters wherever they occur in a string.
characters = str.maketrans("", "", "”“")
quotes = [s.translate(characters) for s in allQuotes]

In [11]:
# Assemble the scraped data into a two-column table (one row per quote)
# and display it as the cell's output.
scrapedQuotes = pd.DataFrame(data=dict(Authors=allAuthors, Quotes=quotes))
scrapedQuotes

Unnamed: 0,Authors,Quotes
0,Albert Einstein,The world as we have created it is a process o...
1,J.K. Rowling,"It is our choices, Harry, that show what we tr..."
2,Albert Einstein,There are only two ways to live your life. One...
3,Jane Austen,"The person, be it gentleman or lady, who has n..."
4,Marilyn Monroe,"Imperfection is beauty, madness is genius and ..."
...,...,...
95,Harper Lee,You never really understand a person until you...
96,Madeleine L'Engle,You have to write the book that wants to be wr...
97,Mark Twain,Never tell the truth to people who are not wor...
98,Dr. Seuss,"A person's a person, no matter how small."
