# static HTML webpages scrapping using Beautifulsoup, requests and pandas libraries 

In [1]:
#importing libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests

#using the requests library, we will get the page we want to scrape and extract it’s HTML:

a = requests.get('http://quotes.toscrape.com/')

#pass the site’s HTML text to BeautifulSoup, which will parse this raw data so it can be easily scraped:

soup_object = BeautifulSoup(a.text)

#All of the site’s data is now stored in the soup object

#extract all of the text available on the web page

print(soup_object.get_text())




Quotes to Scrape








Quotes to Scrape




Login






“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
by Albert Einstein
(about)


            Tags:
            
change
deep-thoughts
thinking
world



“It is our choices, Harry, that show what we truly are, far more than our abilities.”
by J.K. Rowling
(about)


            Tags:
            
abilities
choices



“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
by Albert Einstein
(about)


            Tags:
            
inspirational
life
live
miracle
miracles



“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
by Jane Austen
(about)


            Tags:
            
aliteracy
books
classic
humor



“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
by Marilyn Monroe


In [2]:
#BeautifulSoup has methods like find() and findAll() that you can use to extract specific HTML tags from the web page.
#extract all the data in this class
for i in soup_object.findAll("div",{"class":"quote"}):
    print((i.find("span",{"class":"text"})).text)

“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”
“It is our choices, Harry, that show what we truly are, far more than our abilities.”
“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”
“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”
“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”
“Try not to become a man of success. Rather become a man of value.”
“It is better to be hated for what you are than to be loved for what you are not.”
“I have not failed. I've just found 10,000 ways that won't work.”
“A woman is like a tea bag; you never know how strong it is until it's in hot water.”
“A day without sunshine is like, you know, night.”


In [3]:
#scrape author names
#use the find() and findAll() functions to extract all the author names within this tag

for i in soup_object.findAll("div",{"class":"quote"}):
    print((i.find("small",{"class":"author"})).text)

Albert Einstein
J.K. Rowling
Albert Einstein
Jane Austen
Marilyn Monroe
Albert Einstein
André Gide
Thomas A. Edison
Eleanor Roosevelt
Steve Martin


In [4]:
#scrape the tags listed on the site.
for i in soup_object.findAll("div",{"class":"tags"}):
    print((i.find("meta"))['content'])

change,deep-thoughts,thinking,world
abilities,choices
inspirational,life,live,miracle,miracles
aliteracy,books,classic,humor
be-yourself,inspirational
adulthood,success,value
life,love
edison,failure,inspirational,paraphrased
misattributed-eleanor-roosevelt
humor,obvious,simile


In [5]:
#the site has ten pages, and we need to collect the same data from all of them.
#create three empty arrays so we can store the data collected

quotes = []
authors = []
tags = []

#create a loop that ranges from 1–10, and iterate through every page on the site. We will run the exact same lines of code we created earlier. The only difference is that instead of printing the output, we will now append it to an array
for pages in range(1,10):    
    a = requests.get('http://quotes.toscrape.com/page/'+str(pages))
    soup_object = BeautifulSoup(a.text)    
    for i in soup_object.findAll("div",{"class":"quote"}):
        quotes.append((i.find("span",{"class":"text"})).text)  
    for j in soup_object.findAll("div",{"class":"quote"}):
        authors.append((j.find("small",{"class":"author"})).text)    
    for k in soup_object.findAll("div",{"class":"tags"}):
        tags.append((k.find("meta"))['content'])


In [6]:
#consolidate all the data collected into a Pandas dataframe:
scraped_data_df = pd.DataFrame(
    {'Quotes':quotes,
     'Authors':authors,
     'Tags':tags
    })

In [7]:
scraped_data_df

Unnamed: 0,Quotes,Authors,Tags
0,“The world as we have created it is a process ...,Albert Einstein,"change,deep-thoughts,thinking,world"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"abilities,choices"
2,“There are only two ways to live your life. On...,Albert Einstein,"inspirational,life,live,miracle,miracles"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"aliteracy,books,classic,humor"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"be-yourself,inspirational"
...,...,...,...
85,“Some day you will be old enough to start read...,C.S. Lewis,"age,fairytales,growing-up"
86,“We are not necessarily doubting that God will...,C.S. Lewis,god
87,“The fear of death follows from the fear of li...,Mark Twain,"death,life"
88,“A lie can travel half way around the world wh...,Mark Twain,"misattributed-mark-twain,truth"
