# Importing libraries

In [1]:
import csv
import requests
from bs4 import BeautifulSoup

# Providing the url of the page, whose content is to be scrapped. Then accessing it using request.

In [2]:
url='http://quotes.toscrape.com'
req= requests.get(url)

In [3]:
page=req.text

In [4]:
page

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\t<meta charset="UTF-8">\n\t<title>Quotes to Scrape</title>\n    <link rel="stylesheet" href="/static/bootstrap.min.css">\n    <link rel="stylesheet" href="/static/main.css">\n</head>\n<body>\n    <div class="container">\n        <div class="row header-box">\n            <div class="col-md-8">\n                <h1>\n                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>\n                </h1>\n            </div>\n            <div class="col-md-4">\n                <p>\n                \n                    <a href="/login">Login</a>\n                \n                </p>\n            </div>\n        </div>\n    \n\n<div class="row">\n    <div class="col-md-8">\n\n    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">\n        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>\n        <sp

The above html page text looks messy. This can be beautified using beautiful soup.

In [5]:
soup= BeautifulSoup(page,'html.parser')  #Beautification using beautiful soup.

In [6]:
soup

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>Quotes to Scrape</title>
<link href="/static/bootstrap.min.css" rel="stylesheet"/>
<link href="/static/main.css" rel="stylesheet"/>
</head>
<body>
<div class="container">
<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>
<div class="row">
<div class="col-md-8">
<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
<span>by <small class="author" itemprop="author">Albert Einstein</small>
<a href="/author/Albert-Einstein">(about)</a>
</span>
<div class="tags">
            Tags:
            <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
<a class="

In [7]:
quote= soup.findAll('div',{'class':'quote'}) #Extracting only the quotes from the div class in which it is present.

In [8]:
quote

[<div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
 <span>by <small class="author" itemprop="author">Albert Einstein</small>
 <a href="/author/Albert-Einstein">(about)</a>
 </span>
 <div class="tags">
             Tags:
             <meta class="keywords" content="change,deep-thoughts,thinking,world" itemprop="keywords"/>
 <a class="tag" href="/tag/change/page/1/">change</a>
 <a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
 <a class="tag" href="/tag/thinking/page/1/">thinking</a>
 <a class="tag" href="/tag/world/page/1/">world</a>
 </div>
 </div>,
 <div class="quote" itemscope="" itemtype="http://schema.org/CreativeWork">
 <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>
 <span>by <small class="author" itempr

# Scrapping the data and storing it in a list.
First element in the list is the quote
Second element in the list is the name of the author

In [9]:
scrapped=[]

for i in quote:
    text= i.find('span',class_='text').text
    author=i.find('small',class_='author').text
    scrapped.append([text,author])

In [10]:
scrapped

[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”',
  'Albert Einstein'],
 ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”',
  'J.K. Rowling'],
 ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”',
  'Albert Einstein'],
 ['“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”',
  'Jane Austen'],
 ["“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”",
  'Marilyn Monroe'],
 ['“Try not to become a man of success. Rather become a man of value.”',
  'Albert Einstein'],
 ['“It is better to be hated for what you are than to be loved for what you are not.”',
  'André Gide'],
 ["“I have not failed. I've just found 10,000 ways that won't work.”",
  'Thomas A. Edison'],
 ["“A woman is like a tea ba

# Storing the scrapped data in a csv file

In [11]:
with open('Quotes1.csv','w',encoding='utf-8',newline='') as file:
    writer=csv.writer(file)
    writer.writerow(["Quote","Writer"])
    
    for i in scrapped:
        writer.writerow(i)
    

# Reading the csv file using pandas

In [34]:
import pandas as pd
df=pd.read_csv('Quotes1.csv')
df

Unnamed: 0,Quote,Writer
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe
5,“Try not to become a man of success. Rather be...,Albert Einstein
6,“It is better to be hated for what you are tha...,André Gide
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt
9,"“A day without sunshine is like, you know, nig...",Steve Martin


# In the present url, there are 10 pages present. To access all of the urls, fake-user agent is used.

In [14]:
!pip install fake-useragent

Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Building wheels for collected packages: fake-useragent
  Building wheel for fake-useragent (setup.py): started
  Building wheel for fake-useragent (setup.py): finished with status 'done'
  Created wheel for fake-useragent: filename=fake_useragent-0.1.11-py3-none-any.whl size=13489 sha256=02e02136f51f3437de129c52048efbf51d1c737f0334096dd205734e6c63b189
  Stored in directory: c:\users\s540 a2in\appdata\local\pip\cache\wheels\ed\f7\62\50ab6c9a0b5567267ab76a9daa9d06315704209b2c5d032031
Successfully built fake-useragent
Installing collected packages: fake-useragent
Successfully installed fake-useragent-0.1.11


In [38]:
## importing bs4, requests, fake_useragent and csv modules
import bs4
import requests
from fake_useragent import UserAgent
import csv

## create an array with URLs
urls = ['http://quotes.toscrape.com', 'http://quotes.toscrape.com/page/2/','http://quotes.toscrape.com/page/3/',
       'http://quotes.toscrape.com/page/4/','http://quotes.toscrape.com/page/5/','http://quotes.toscrape.com/page/6/',
       'http://quotes.toscrape.com/page/7/','http://quotes.toscrape.com/page/8/','http://quotes.toscrape.com/page/9/',
       'http://quotes.toscrape.com/page/10/']

## initializing the UserAgent object
user_agent = UserAgent()

## starting the loop
for url in urls:
    ## getting the reponse from the page using get method of requests module
    page = requests.get(url, headers={"user-agent": user_agent.chrome})

    ## storing the content of the page in a variable
    html = page.content

    ## creating BeautifulSoup object
    soup = bs4.BeautifulSoup(html, "html.parser")

    ## Then parse the HTML, extract any data
    ## write it to a file
    
    quotes = soup.findAll('div', class_='quote')
    
    scraped = []
    for quote in quotes:
        text = quote.find('span', class_='text').text
        author = quote.find('small', class_='author').text
        scraped.append([text, author])
        
        with open('Quotes2.csv','a',encoding='utf-8',newline='') as file:
            writer=csv.writer(file)
            writer.writerow(["Quote","Writer"])
    
            for i in scraped:
                writer.writerow(i)
    

    

In [39]:
df2=pd.read_csv("Quotes2.csv")
df2

Unnamed: 0,Quote,Writer
0,"“The truth."" Dumbledore sighed. ""It is a beaut...",J.K. Rowling
1,“I'm the one that's got to die when it's time ...,Jimi Hendrix
2,“To die will be an awfully big adventure.”,J.M. Barrie
3,“It takes courage to grow up and become who yo...,E.E. Cummings
4,“But better to get hurt by the truth than comf...,Khaled Hosseini
...,...,...
655,“You never really understand a person until yo...,Harper Lee
656,“You have to write the book that wants to be w...,Madeleine L'Engle
657,“Never tell the truth to people who are not wo...,Mark Twain
658,"“A person's a person, no matter how small.”",Dr. Seuss


In [40]:
df2.shape

(660, 2)