# Web Scraping in Python with BeautifulSoup

Accessing the HTML content of a webpage

In [3]:
import requests
# Page whose HTML is downloaded and parsed by the cells below.
URL = "http://www.values.com/inspirational-quotes"
# requests applies no timeout unless one is given, so without this argument
# the cell could hang indefinitely on an unresponsive server.
r = requests.get(URL, timeout=30)
# Raise on 4xx/5xx responses so later cells never parse an error page
# as if it were the quotes page.
r.raise_for_status()
print(r.content)  # raw response body as bytes

b'<!DOCTYPE html>\n<html class="no-js" dir="ltr" lang="en-US">\n    <head>\n        <title>Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com</title>\n        <meta charset="utf-8">\n        <meta http-equiv="content-type" content="text/html; charset=utf-8" />\n        <meta http-equiv="X-UA-Compatible" content="IE=edge" />\n        <meta name="viewport" content="width=device-width,initial-scale=1.0" />\n        <meta name="description" content="The Foundation for a Better Life | Pass It On.com">\n        <link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">\n        <link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">\n        <link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">\n        <link rel="manifest" href="/site.webmanifest">\n        <link rel="mask-icon" href="/safari-pinned-tab.svg" color="#c8102e">\n        <meta name="msapplication-TileColor" content="#c8102e">\n        <meta name=

Parsing the HTML content

In [2]:
from bs4 import BeautifulSoup
# Build the parse tree with the html5lib parser. If BeautifulSoup reports that
# it cannot find that parser, run 'pip install html5lib' and re-run this cell.
soup = BeautifulSoup(r.content, 'html5lib')
print(soup.prettify())  # print the parsed HTML with one tag per line, indented by nesting depth

<!DOCTYPE html>
<html class="no-js" dir="ltr" lang="en-US">
 <head>
  <title>
   Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com
  </title>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width,initial-scale=1.0" name="viewport"/>
  <meta content="The Foundation for a Better Life | Pass It On.com" name="description"/>
  <link href="/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
  <link href="/favicon-32x32.png" rel="icon" sizes="32x32" type="image/png"/>
  <link href="/favicon-16x16.png" rel="icon" sizes="16x16" type="image/png"/>
  <link href="/site.webmanifest" rel="manifest"/>
  <link color="#c8102e" href="/safari-pinned-tab.svg" rel="mask-icon"/>
  <meta content="#c8102e" name="msapplication-TileColor"/>
  <meta content="#ffffff" name="theme-color"/>
  <link crossorigin="anonymous" href="https://stackp

In [4]:
# Show the page's <title> element, markup included.
print(soup.find("title"))

<title>Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com</title>


In [5]:
# Show only the text inside the page's <title> element.
print(soup.find("title").get_text())

Inspirational Quotes - Motivational Quotes - Leadership Quotes | PassItOn.com


In [7]:
# Walk every <a> tag and report its inner text, its title attribute and its href.
# Tag.get() yields None for a missing attribute, which is printed as "None".
for anchor in soup.find_all("a"):
    print(f"Inner Text is: {anchor.get_text()}")
    print(f"Title is: {anchor.get('title')}")
    print(f"href is: {anchor.get('href')}")

Inner Text is: 
Title is: Pass It On®
href is: /
Inner Text is: Quotes
Title is: None
href is: /inspirational-quotes
Inner Text is: Videos
Title is: None
href is: /inspirational-stories-tv-spots
Inner Text is: Billboards 
Title is: None
href is: /inspirational-sayings-billboards
Inner Text is: Official Billboards
Title is: None
href is: /inspirational-sayings-billboards
Inner Text is: Is In You® Billboards
Title is: None
href is: /is-in-you
Inner Text is: Create Your Own Billboard
Title is: None
href is: /your-billboards
Inner Text is: Newspapers
Title is: None
href is: /positive-good-news-columns
Inner Text is: Radio
Title is: None
href is: /radio
Inner Text is: Blog
Title is: None
href is: /passiton-blog
Inner Text is: Language 
Title is: None
href is: #
Inner Text is: Spanish | Español
Title is: None
href is: https://www.pasala.org/
Inner Text is: Portuguese | Português
Title is: None
href is: https://www.umavidamelhor.org/
Inner Text is: Chinese | 中文
Title is: None
href is: https:/

In [None]:
import csv
quotes = []  # one dict per scraped quote, in page order

# All quote tiles are looked up inside the <div id="all_quotes"> container.
table = soup.find('div', attrs={'id': 'all_quotes'})
if table is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of an AttributeError on the loop below.
    raise RuntimeError("Could not find <div id='all_quotes'>; the page layout may have changed.")

# find_all is the current name of the deprecated findAll alias.
for row in table.find_all('div',
                          attrs={'class': 'col-6 col-lg-3 text-center margin-30px-bottom sm-margin-30px-top'}):
    # The alt text is treated as "<quote text> #<author>": split once and reuse the parts.
    alt_parts = row.img['alt'].split(" #")
    quote = {}
    quote['theme'] = row.h5.text
    quote['url'] = row.a['href']
    quote['img'] = row.img['src']
    quote['lines'] = alt_parts[0]
    # Alt text without a " #" separator has no author part; store an empty
    # string rather than raising IndexError.
    quote['author'] = alt_parts[1] if len(alt_parts) > 1 else ''
    quotes.append(quote)

# NOTE(review): /content is presumably the Colab working directory; change this path when running elsewhere.
filename = '/content/inspirational_quotes.csv'
# Explicit UTF-8 so non-ASCII characters are written the same way on every platform;
# newline='' lets the csv module control line endings itself.
with open(filename, 'w', newline='', encoding='utf-8') as f:
    w = csv.DictWriter(f, ['theme', 'url', 'img', 'lines', 'author'])
    w.writeheader()
    w.writerows(quotes)

In [8]:
# NOTE(review): google.colab is presumably available only inside Google Colab — confirm before running elsewhere.
from google.colab import drive
# Mount Google Drive at /content/drive (the recorded cell output reports this mount point).
drive.mount('/content/drive')

Mounted at /content/drive
