## Python Web Scraping Practice
### Website URL: http://quotes.toscrape.com

In [1]:
import requests
import bs4
import lxml

### Get the html text from the homepage

In [2]:
# Download the homepage; the HTML source is available afterwards as req.text
req = requests.get('http://quotes.toscrape.com')

In [3]:
# Parse the downloaded HTML with the lxml parser so it can be queried with CSS selectors
soup = bs4.BeautifulSoup(req.text,'lxml')

In [4]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="t

### Get names of all authors on the first page

In [5]:
soup.select('.author')

[<small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Jane Austen</small>,
 <small class="author" itemprop="author">Marilyn Monroe</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">André Gide</small>,
 <small class="author" itemprop="author">Thomas A. Edison</small>,
 <small class="author" itemprop="author">Eleanor Roosevelt</small>,
 <small class="author" itemprop="author">Steve Martin</small>]

In [6]:
type(soup.select('.author'))

bs4.element.ResultSet

In [7]:
soup.select('.author')

[<small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Jane Austen</small>,
 <small class="author" itemprop="author">Marilyn Monroe</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">André Gide</small>,
 <small class="author" itemprop="author">Thomas A. Edison</small>,
 <small class="author" itemprop="author">Eleanor Roosevelt</small>,
 <small class="author" itemprop="author">Steve Martin</small>]

In [8]:
len(soup.select('.author'))

10

In [9]:
soup.select('.author')[0]

<small class="author" itemprop="author">Albert Einstein</small>

In [10]:
soup.select('.author')[0].text

'Albert Einstein'

In [11]:
# Run the CSS selector once and reuse the result; calling soup.select() inside the
# loop would search the whole document again on every iteration.
author_elements = soup.select('.author')

# Index-based version: collect the text of every matched element, in page order.
# Duplicates are kept here.
authors = []
for i in range(len(author_elements)):
    authors.append(author_elements[i].text)

In [12]:
authors

['Albert Einstein',
 'J.K. Rowling',
 'Albert Einstein',
 'Jane Austen',
 'Marilyn Monroe',
 'Albert Einstein',
 'André Gide',
 'Thomas A. Edison',
 'Eleanor Roosevelt',
 'Steve Martin']

In [13]:
# Eliminating duplicates: converting the list to a set keeps a single copy of each name
# (a set is unordered, so the page order of the names is not preserved)
authors = set(authors)

In [14]:
authors

{'Albert Einstein',
 'André Gide',
 'Eleanor Roosevelt',
 'J.K. Rowling',
 'Jane Austen',
 'Marilyn Monroe',
 'Steve Martin',
 'Thomas A. Edison'}

In [15]:
# Another way: build the set of unique author names directly from the matched elements
authors = {element.text for element in soup.select('.author')}

In [16]:
authors

{'Albert Einstein',
 'André Gide',
 'Eleanor Roosevelt',
 'J.K. Rowling',
 'Jane Austen',
 'Marilyn Monroe',
 'Steve Martin',
 'Thomas A. Edison'}

### Create a list of all quotes on the first page

In [17]:
n = len(soup.select('.text'))
n

10

In [18]:
soup.select('.text')[0]

<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>

In [19]:
# Run the CSS selector once and reuse the result; calling soup.select() inside the
# loop would search the whole document again on every iteration.
quote_elements = soup.select('.text')

# Index-based version. The length is taken from the result itself so this cell does
# not depend on a counter variable defined in another cell.
quotes = []
for i in range(len(quote_elements)):
    quotes.append(quote_elements[i].text)

In [20]:
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

In [21]:
# Another way: iterate over the matched elements directly instead of indexing into them
quotes = [element.text for element in soup.select('.text')]

In [22]:
quotes

['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
 '“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
 '“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
 '“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
 "“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
 '“Try not to become a man of success. Rather become a man of value.”',
 '“It is better to be hated for what you are than to be loved for what you are not.”',
 "“I have not failed. I've just found 10,000 ways that won't work.”",
 "“A woman is like a tea bag; you never know how strong it is until it's in hot water.”",
 '“A day without sunshine is like, you know, night.”']

## Extract top ten tags

In [23]:
soup.select('.tag-item')

[<span class="tag-item">
 <a class="tag" href="/tag/love/" style="font-size: 28px">love</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/inspirational/" style="font-size: 26px">inspirational</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/life/" style="font-size: 26px">life</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/humor/" style="font-size: 24px">humor</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/books/" style="font-size: 22px">books</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/reading/" style="font-size: 14px">reading</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friendship/" style="font-size: 10px">friendship</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/friends/" style="font-size: 8px">friends</a>
 </span>,
 <span class="tag-item">
 <a class="tag" href="/tag/truth/" style="font-size: 8px">truth</a>
 </span>,
 <span class="tag-item">
 <a class="

In [24]:
soup.select('.tag-item')[0].text

'\nlove\n'

In [25]:
print(soup.select('.tag-item')[0].text)


love



In [26]:
n = len(soup.select('.tag-item'))
n

10

In [27]:
# Run the CSS selector once and reuse the result; calling soup.select() inside the
# loop would search the whole document again on every iteration.
tag_elements = soup.select('.tag-item')

# Index-based version. The length is taken from the result itself so this cell does
# not depend on a counter variable defined in another cell.
# The text is stored unmodified, including the newlines around each tag name.
tags = []
for i in range(len(tag_elements)):
    tags.append(tag_elements[i].text)

In [28]:
# Better way: iterate over the matched elements directly instead of indexing into them
tags = [item.text for item in soup.select('.tag-item')]

In [29]:
# Each stored tag text still contains the newlines surrounding the tag name
# (e.g. '\nlove\n'), which is why blank lines appear between the printed tags.
for tag in tags:
    print(tag)


love


inspirational


life


humor


books


reading


friendship


friends


truth


simile



## Get all the unique authors on all the pages 

In [30]:
# URL template for a numbered page; the {} placeholder is filled in with str.format()
base_url = 'http://quotes.toscrape.com/page/{}/'

In [31]:
# page 3
page = base_url.format(3)
page

'http://quotes.toscrape.com/page/3/'

In [32]:
# Download and parse the URL stored in `page`, then display the parsed document
req = requests.get(page)
soup = bs4.BeautifulSoup(req.text,'lxml')
soup

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“I love you without knowing how, or when, or from where. I love you simply, without problems or pride: I love you in this way because I do not know any other way of loving but this, in which there is no I or you, so intimate that your hand upon my chest is my hand, so intimate that when I fall asleep your eyes close.”</span>
<span>by <small class="author" itemprop="author">Pablo Neruda</small>
<a h

In [33]:
soup.select('.author')

[<small class="author" itemprop="author">Pablo Neruda</small>,
 <small class="author" itemprop="author">Ralph Waldo Emerson</small>,
 <small class="author" itemprop="author">Mother Teresa</small>,
 <small class="author" itemprop="author">Garrison Keillor</small>,
 <small class="author" itemprop="author">Jim Henson</small>,
 <small class="author" itemprop="author">Dr. Seuss</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">J.K. Rowling</small>,
 <small class="author" itemprop="author">Albert Einstein</small>,
 <small class="author" itemprop="author">Bob Marley</small>]

In [34]:
soup.select('.author')[0].text

'Pablo Neruda'

In [35]:
base_url = 'http://quotes.toscrape.com/page/{}/'

In [36]:
base_url = 'http://quotes.toscrape.com/page/{}/'
# Use a set (instead of a list) to avoid duplicates
authors = set()

# Pages 1 through 10 are requested (the page count is hard-coded here).
# The loop assigns req and soup itself on its first iteration, so no request
# is needed before it starts.
for i in range(1,11):
    page = base_url.format(i)
    # A timeout keeps the cell from hanging forever if the server stops responding.
    req = requests.get(page, timeout=10)
    # Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text,'lxml')

    for author in soup.select('.author'):
        authors.add(author.text)

In [37]:
authors

{'Albert Einstein',
 'Alexandre Dumas fils',
 'Alfred Tennyson',
 'Allen Saunders',
 'André Gide',
 'Ayn Rand',
 'Bob Marley',
 'C.S. Lewis',
 'Charles Bukowski',
 'Charles M. Schulz',
 'Douglas Adams',
 'Dr. Seuss',
 'E.E. Cummings',
 'Eleanor Roosevelt',
 'Elie Wiesel',
 'Ernest Hemingway',
 'Friedrich Nietzsche',
 'Garrison Keillor',
 'George Bernard Shaw',
 'George Carlin',
 'George Eliot',
 'George R.R. Martin',
 'Harper Lee',
 'Haruki Murakami',
 'Helen Keller',
 'J.D. Salinger',
 'J.K. Rowling',
 'J.M. Barrie',
 'J.R.R. Tolkien',
 'James Baldwin',
 'Jane Austen',
 'Jim Henson',
 'Jimi Hendrix',
 'John Lennon',
 'Jorge Luis Borges',
 'Khaled Hosseini',
 "Madeleine L'Engle",
 'Marilyn Monroe',
 'Mark Twain',
 'Martin Luther King Jr.',
 'Mother Teresa',
 'Pablo Neruda',
 'Ralph Waldo Emerson',
 'Stephenie Meyer',
 'Steve Martin',
 'Suzanne Collins',
 'Terry Pratchett',
 'Thomas A. Edison',
 'W.C. Fields',
 'William Nicholson'}

In [38]:
len(authors)

50

In [39]:
print(authors)

{'Albert Einstein', 'J.R.R. Tolkien', 'Douglas Adams', 'W.C. Fields', 'George Bernard Shaw', 'George Carlin', 'E.E. Cummings', 'Jane Austen', 'Mother Teresa', 'Helen Keller', 'Charles Bukowski', 'Ayn Rand', 'Alfred Tennyson', 'J.D. Salinger', 'Eleanor Roosevelt', 'Charles M. Schulz', 'William Nicholson', 'Martin Luther King Jr.', 'Stephenie Meyer', 'George R.R. Martin', 'Ernest Hemingway', 'Elie Wiesel', 'Ralph Waldo Emerson', 'Jimi Hendrix', 'J.M. Barrie', 'Garrison Keillor', 'Khaled Hosseini', 'Harper Lee', "Madeleine L'Engle", 'Terry Pratchett', 'Allen Saunders', 'Jorge Luis Borges', 'Thomas A. Edison', 'Jim Henson', 'George Eliot', 'Marilyn Monroe', 'J.K. Rowling', 'Dr. Seuss', 'John Lennon', 'Haruki Murakami', 'André Gide', 'Steve Martin', 'James Baldwin', 'Alexandre Dumas fils', 'Mark Twain', 'Suzanne Collins', 'Friedrich Nietzsche', 'C.S. Lewis', 'Bob Marley', 'Pablo Neruda'}


In [40]:
# Let's see what to do if we don't know the total number of pages

In [41]:
# Download and parse the homepage again so that `soup` holds the first page
req = requests.get('http://quotes.toscrape.com')
soup = bs4.BeautifulSoup(req.text,'lxml')

In [42]:
soup.select('.next')

[<li class="next">
 <a href="/page/2/">Next <span aria-hidden="true">→</span></a>
 </li>]

In [43]:
soup.select('.next') == []

False

In [44]:
# True means next button is present on the current page and it is not the last page
soup.select('.next') != [] 

True

In [45]:
base_url = 'http://quotes.toscrape.com/page/{}/'
authors = set()
i = 1

# Fetch a page first, then decide whether to continue based on that same page.
# Testing a `soup` left over from an earlier cell would make this cell depend on
# hidden state: after one full run `soup` is the last page, so running the cell
# again would skip the loop and leave `authors` empty.
while True:
    page = base_url.format(i)
    # A timeout keeps the cell from hanging forever if the server stops responding.
    req = requests.get(page, timeout=10)
    # Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
    req.raise_for_status()
    soup = bs4.BeautifulSoup(req.text,'lxml')

    for author in soup.select('.author'):
        authors.add(author.text)

    i += 1

    # No element with class "next" on the page just fetched means it was the last page.
    if not soup.select('.next'):
        break

In [46]:
len(authors)

50

In [47]:
len(authors)

50

In [48]:
authors

{'Albert Einstein',
 'Alexandre Dumas fils',
 'Alfred Tennyson',
 'Allen Saunders',
 'André Gide',
 'Ayn Rand',
 'Bob Marley',
 'C.S. Lewis',
 'Charles Bukowski',
 'Charles M. Schulz',
 'Douglas Adams',
 'Dr. Seuss',
 'E.E. Cummings',
 'Eleanor Roosevelt',
 'Elie Wiesel',
 'Ernest Hemingway',
 'Friedrich Nietzsche',
 'Garrison Keillor',
 'George Bernard Shaw',
 'George Carlin',
 'George Eliot',
 'George R.R. Martin',
 'Harper Lee',
 'Haruki Murakami',
 'Helen Keller',
 'J.D. Salinger',
 'J.K. Rowling',
 'J.M. Barrie',
 'J.R.R. Tolkien',
 'James Baldwin',
 'Jane Austen',
 'Jim Henson',
 'Jimi Hendrix',
 'John Lennon',
 'Jorge Luis Borges',
 'Khaled Hosseini',
 "Madeleine L'Engle",
 'Marilyn Monroe',
 'Mark Twain',
 'Martin Luther King Jr.',
 'Mother Teresa',
 'Pablo Neruda',
 'Ralph Waldo Emerson',
 'Stephenie Meyer',
 'Steve Martin',
 'Suzanne Collins',
 'Terry Pratchett',
 'Thomas A. Edison',
 'W.C. Fields',
 'William Nicholson'}

## Thank You