In [2]:
# Beautiful Soup
# Beautiful Soup is a Python library to analyze or parse HTML documents. Beautiful 
# soup structures HTML/XML data as a tree with every tag being a different branch.
# It provides valuable tools to extract info from HTML files which makes it
# useful for web scraping

# This structure considers a hierarchical tag relation. 
# 1 - The HTML tag is considered parentNode, and is considered the parentNode for 
# every other tag.
# 2 - The first nested tag is considered the firstChild and the inmediate nested tag
# from a reference tag is considered the nextSibling. 
# 3 - If there are not nested tags within a tag that tag will be considered the 
# lastChild. Every
# 4 - Every nested tad have a upper level tag considered previousSibling

In [15]:

# Beautiful soup
from bs4 import BeautifulSoup
import requests

# HTML string
html_b = """

<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
<body>
<p class="title">
    <b>The Dormouse's story</b>
</p>

<p class="story">
Once upon a time there were three little sisters; and their names were:

    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.
</p>

<p class="story">The story continues</p>
</body>
</html>
"""

# Beautiful soup pasrese data from an HTML string like the one shown above
soup_a = BeautifulSoup(html_b)
print (soup_a)
print (type(soup_a))

<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>
</html>

<class 'bs4.BeautifulSoup'>


In [4]:

# Beautiful soup has a method called 'prettify()' that places correct indentation
# on an HTML string
print (soup_a.prettify())


<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were:
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;

and they lived at the bottom of a well.
  </p>
  <p class="story">
   The story continues
  </p>
 </body>
</html>



In [5]:
# Tags can be accessed by using tag names a attributes
print (soup_a.title)


<title>The Dormouse's story</title>


In [6]:
# When parent tags are called bring all content within the open and closing tag
print (soup_a.body)


<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were:

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.
</p>
<p class="story">The story continues</p>
</body>


In [7]:
# In case there are several intances of a tag BS will return the 1st one
print (soup_a.a)


<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>


In [8]:
# There can be called tag names
print (soup_a.body.name)


body


In [9]:
# There can be called parent tag
print (soup_a.a.parent)


<p class="story">
Once upon a time there were three little sisters; and their names were:

    <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
    <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;

and they lived at the bottom of a well.
</p>


In [10]:
# BS has several parser can be used over malformed HTML data and make it work
# Note that by default BS parsers is for XML
html_c = """
    <h1><a /><b><th <td>
"""
soup_b = BeautifulSoup(html_c)
print(soup_b.prettify())


<html>
 <body>
  <h1>
   <a>
   </a>
   <b>
   </b>
   <th>
   </th>
  </h1>
 </body>
</html>


In [11]:
# To specify the parser there has to be provided in the form of a string
# XML parser
soup_c = BeautifulSoup(html_c, 'lxml')
print(soup_c.prettify())


<html>
 <body>
  <h1>
   <a>
   </a>
   <b>
   </b>
   <th>
   </th>
  </h1>
 </body>
</html>


In [12]:
# HTML 'default' parser
soup_d = BeautifulSoup(html_c, 'html.parser')
print(soup_d.prettify())

<h1>
 <a>
 </a>
 <b>
  <th <td="">
  </th>
 </b>
</h1>


In [13]:
# HTML5 parser
soup_e = BeautifulSoup(html_c, 'html5lib')
print(soup_e.prettify())

<html>
 <head>
 </head>
 <body>
  <h1>
   <a>
    <b>
    </b>
   </a>
  </h1>
 </body>
</html>


In [19]:
# Data can be requested using an HTTP request from requests
uri_j = 'https://www.google.com/'
resp_data_n = requests.get(uri_j)
resp_data_n

<Response [200]>

In [23]:
soup_f = BeautifulSoup(resp_data_n.text, 'html5lib')
print(soup_f.prettify())

<!DOCTYPE html>
<html itemscope="" itemtype="http://schema.org/WebPage" lang="es-419">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"/>
  <title>
   Google
  </title>
  <script nonce="f4lToEJKqcHOkvt1MjUUBw">
   (function(){window.google={kEI:'6bfsYuzMH7GDwbkP8LylmAs',kEXPI:'0,18167,1284363,56879,6058,207,4804,2316,383,246,5,5367,1123753,1197789,303173,77529,16114,17444,1953,9287,17572,4858,1362,9291,3023,4751,12834,4998,13228,14469,22741,262,4819,1593,1279,2742,149,1103,840,6297,108,3406,606,2023,1777,520,14670,3227,2845,7,25552,8218,1851,6398,9358,3,576,6460,803,13320,4,1528,2304,7039,20309,4764,2658,4164,3193,13658,4437,16786,5809,2548,4094,17,4035,3,3541,1,11942,30212,2,28138,11623,5679,1021,2380,20981,6,2,7752,4568,6259,23417,1247,5841,14968,4332,6089,1395,445,2,2,1,17312,9320,8155,6582,799,3043,11637,2163,5178,9710,4745,7,1922,5706,4073,23,6865,5526,6716

In [31]:
# Extracting data from a HTML file
# From an open HTML there can be parsed the HTML content

with open("resources/GiantPanda.html") as html_d:
    soup_g = BeautifulSoup(html_d, "html5lib")
    print(soup_g.prettify())


<html>
 <head>
  <title>
   "Giant Panda"
  </title>
 </head>
 <body>
  <h1>
   The giant panda also known as panda bear or simply panda
  </h1>
  <h2 style="color:blue;">
   Giant Panda
  </h2>
  <h3 style="background-color:yellow;color:red;">
   The name "giant panda" is sometimes used to distinguish it from the
   <br/>
   red panda.
  </h3>
  <h6>
   The giant panda lives in a few mountain ranges in central China
  </h6>
  <b class="panda">
   pandas were thought to be rare and noble creatures
  </b>
  <div>
   <a href="https://en.wikipedia.org/wiki/Giant_panda">
    Link to Wikipedia page
   </a>
  </div>
  <br/>
  <br/>
  <i>
   <!--Here is a Panda image from Wikipedia -->
  </i>
  <div>
   <img alt="panda not found" src="Panda.jpeg"/>
  </div>
  <div>
   <p class="panda highlight">
    Giant pandas in the wild will occasionally eat other grasses, wild tubers, or even meat in the form of birds
   </p>
  </div>
 </body>
</html>


In [32]:
# HTML tags are a class itelf
tag_a = soup_g.title
print(tag_a)
print(type(tag_a))

<title>"Giant Panda"</title>
<class 'bs4.element.Tag'>


In [33]:
# If an HTML string has to be modified it can be done using BS
print('1st tag name: ',soup_g.h1)
soup_g.h1.name = 'reign'
print('2nd tag name: ',soup_g.reign)

<h1>The giant panda also known as panda bear or simply panda</h1>
<reign>The giant panda also known as panda bear or simply panda</reign>


In [38]:
# Attributes from a tag are displayed as a dict
print(soup_g.a.attrs)
print(type(soup_g.a.attrs))

{'href': 'https://en.wikipedia.org/wiki/Giant_panda'}
<class 'dict'>


In [36]:
# Tag atributes can be selected using the the keywords
# This is useful to get links
print(soup_g.a['href'])

https://en.wikipedia.org/wiki/Giant_panda


In [40]:
# BS can be used to get images
print(soup_g.img)
print(soup_g.img.attrs)
print(soup_g.img['src'])

<img alt="panda not found" src="Panda.jpeg"/>
{'src': 'Panda.jpeg', 'alt': 'panda not found'}
Panda.jpeg


In [42]:
# BS can be used to get classes
print(soup_g.p)
print(soup_g.p.attrs)
print(soup_g.p['class'])

<p class="panda highlight">Giant pandas in the wild will occasionally eat other grasses, wild tubers, or even meat in the form of birds</p>
{'class': ['panda', 'highlight']}
['panda', 'highlight']


In [44]:
# BS can be used to get tag text
print(soup_g.p.string)

Giant pandas in the wild will occasionally eat other grasses, wild tubers, or even meat in the form of birds


In [45]:
# These strings have their own class and set of methods and attributes
print(type(soup_g.p.string))
print(dir(soup_g.p.string))

<class 'bs4.element.NavigableString'>


In [48]:
# Elements can be filtered using Find and Find All
import re

with open("resources/TomJerry_Story.html") as html_e:
    soup_h = BeautifulSoup(html_e, "lxml")

In [49]:
# The 'find' method allows to get the 1st occurance of a tag
find_a = soup_h.find('p')
print(find_a)
print(type(find_a))

<p class="title"><b>Tom and Jerry</b></p>
<class 'bs4.element.Tag'>


In [50]:
# The 'findAll()' method allows to get every occurance of a tag
find_b = soup_h.findAll('p')
print(find_b)
print(type(find_b))
# Either if there is more than one occurance of a tag there will be returned a list

[<p class="title"><b>Tom and Jerry</b></p>, <p class="comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a> and  
        <a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>, a cat, and 
        <a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>, a mouse.</p>, <p class="comedy story">
            The series features comic fights between an iconic pair of adversaries, 
            a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
            numerous attempts to capture Jerry and the mayhem and destruction that follows. 
            Tom rarely suc

In [51]:
# If there are no occurances there will be returned None
find_c = soup_h.find('an ape')
print(find_c)


None


In [52]:
# This methods work as well with attributes
find_d = soup_h.find(src = 'TomAndJerry.jpg')
print (find_d)
find_e = soup_h.findAll(src = 'TomAndJerry.jpg')
print (find_e)

<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>
[<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>]


In [54]:
# These methods can be leveraged by the usage of regular expression
for tag in soup_h.findAll(re.compile('^a')):
    print (tag)

<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a>
<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>
<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>
<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>


In [55]:
# This iterations can be useful 
for tag in soup_h.findAll(re.compile('^a')):
    print (tag['href'])

https://en.wikipedia.org/wiki/William_Hanna
https://en.wikipedia.org/wiki/Joseph_Barbera
https://en.wikipedia.org/wiki/Tom_Cat
https://en.wikipedia.org/wiki/Jerry_Mouse


In [56]:
# The 're.compile()' can be used to match every string used by BS to find an 
# element
for tag in soup_h.findAll('a', attrs = {'class':re.compile('^cre')}):
    print (tag)

<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a>
<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>


In [57]:
# Additional example 
for tag in soup_h.findAll(re.compile('i')):
    print (tag)

<title> The story of Tom and Jerry </title>
<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>


In [58]:
# Additional example 
for tag in soup_h.findAll(id = re.compile('2')):
    print (tag)

<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>


In [59]:
# There can be found multiple elements using lists of values
for tag in soup_h.findAll(['a','img']):
    print (tag)

<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>
<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a>
<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>
<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>
<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>


In [60]:
# The element search can be leveraged by using custom functions and filters
# Note that the methods used to find elements keep tags that return 'True'
# to a matching criteria
for tag in soup_h.findAll(True):
    print (tag.name)


html
head
title
body
p
b
img
p
a
a
a
a
p


In [62]:
# As well as with Python's filter function there can be created a custom function
# that serves a criteria to filter each element

def funct_a (tag):
    return tag.has_attr('src') and not tag.has_attr('href')

for tag in soup_h.findAll(funct_a):
    print (tag)

<img alt="cartoon_image" height="300" src="TomAndJerry.jpg" width="300"/>


In [65]:
def funct_b (href):
    return href and not re.compile('Tom_Cat').search(href)

for tag in soup_h.findAll(href=funct_b):
    print (tag)

<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a>
<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>
<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>


In [66]:
def funct_c (href):
    return href and re.compile('wikipedia').search(href)

for tag in soup_h.findAll(href=funct_c):
    print (tag)

<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">William_Hanna</a>
<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">Joseph Barbera</a>
<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">Tom</a>
<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">Jerry</a>


In [75]:
# Extracting links from a Real-World website
# Data declaration
uri_k = 'https://mashable.com/archive/static-website-generators/'

# Data requested
resp_data_p = requests.get(uri_k)

# BS parsing
soup_i = BeautifulSoup(resp_data_p.text, 'lxml')

In [76]:
# Link list
# This list contains every secure link within this web page
list_a = soup_i.findAll('a', attrs = {'href': re.compile('^https')})

for item in list_a:
    print (item['href'])



https://mashable.com
https://mashable.com/tech
https://mashable.com/science
https://mashable.com/life
https://mashable.com/category/social-good
https://mashable.com/entertainment
https://mashable.com/deals
https://mashable.com/category/apps-and-software
https://mashable.com/category/cybersecurity
https://mashable.com/category/mobile
https://mashable.com/category/smart-home
https://mashable.com/category/social-media
https://mashable.com/category/tech-industry
https://mashable.com/category/transportation
https://mashable.com/tech
https://mashable.com/category/space
https://mashable.com/category/climate-change
https://mashable.com/category/environment
https://mashable.com/science
https://mashable.com/category/digital-culture
https://mashable.com/category/family-parenting
https://mashable.com/category/health-wellness
https://mashable.com/category/sex-dating-relationships
https://mashable.com/life
https://mashable.com/category/activism
https://mashable.com/category/gender
https://mashable.c

In [80]:
# This list contains every link related to this same website
list_b = soup_i.findAll('a', attrs = {'href': re.compile('^https://mashable.com/')})

for item in list_b:
    print (item['href'])

https://mashable.com/tech
https://mashable.com/science
https://mashable.com/life
https://mashable.com/category/social-good
https://mashable.com/entertainment
https://mashable.com/deals
https://mashable.com/category/apps-and-software
https://mashable.com/category/cybersecurity
https://mashable.com/category/mobile
https://mashable.com/category/smart-home
https://mashable.com/category/social-media
https://mashable.com/category/tech-industry
https://mashable.com/category/transportation
https://mashable.com/tech
https://mashable.com/category/space
https://mashable.com/category/climate-change
https://mashable.com/category/environment
https://mashable.com/science
https://mashable.com/category/digital-culture
https://mashable.com/category/family-parenting
https://mashable.com/category/health-wellness
https://mashable.com/category/sex-dating-relationships
https://mashable.com/life
https://mashable.com/category/activism
https://mashable.com/category/gender
https://mashable.com/category/lgbtq
htt

In [79]:
# This list contains every link in this same website 
list_c = soup_i.findAll('a', attrs = {'href': True})

for item in list_c:
    if not item['href'].startswith('http'):
        item = uri_k + item['href'].strip('/')
    else:
        item = item['href']
    print(item)

https://mashable.com
https://mashable.com/archive/static-website-generators/series/self-made
https://mashable.com/archive/static-website-generators/series/best-of-2022
https://mashable.com/tech
https://mashable.com/science
https://mashable.com/life
https://mashable.com/category/social-good
https://mashable.com/entertainment
https://mashable.com/deals
https://mashable.com/archive/static-website-generators/series/self-made
https://mashable.com/archive/static-website-generators/series/best-of-2022
https://mashable.com/category/apps-and-software
https://mashable.com/category/cybersecurity
https://mashable.com/category/mobile
https://mashable.com/category/smart-home
https://mashable.com/category/social-media
https://mashable.com/category/tech-industry
https://mashable.com/category/transportation
https://mashable.com/tech
https://mashable.com/category/space
https://mashable.com/category/climate-change
https://mashable.com/category/environment
https://mashable.com/science
https://mashable.com

In [85]:
# This list contains every image in this website 
list_d = soup_i.findAll('img')

for item in list_d:
    print (item.get('src'))

https://helios-i.mashable.com/imagery/archives/00s0gVKdAC9EPTZERS7TvPb/hero-image.fill.size_1248x702.v1647024413.jpg
https://c.evidon.com/pub/icong1.png


In [87]:
# There's a way to parse a segment of a file using a tool called SoupStrainer
from bs4 import SoupStrainer

html_f = """

<html>
<head>
    <title> The story of Tom and Jerry </title>
</head>
 <body>
    <p class = "title"><b>Tom and Jerry</b></p>
    <img src = "TomAndJerry.jpg" width = "300" height = "300" alt = "cartoon_image"/>
    <p class = "comedy animated series">
        Tom and Jerry is an American animated series of comedy short films created by 
        <a href = "https://en.wikipedia.org/wiki/William_Hanna" class = "creator" id = "link1" >William_Hanna</a> and  
        <a href = "https://en.wikipedia.org/wiki/Joseph_Barbera" class = "creator" id = "link2" >Joseph Barbera</a>. 
        It centers on a rivalry between the title characters
        <a href = "https://en.wikipedia.org/wiki/Tom_Cat" class = "character" id = "link3" >Tom</a>, a cat, and 
        <a href = "https://en.wikipedia.org/wiki/Jerry_Mouse" class = "character" id = "link4" >Jerry</a>, a mouse.</p> 
    
        <div>
                <img src = "https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" 
                     width = "300" height = "300" alt = "creator_image" ></img>
                <img src = "https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" 
                     width = "300" height = "300" alt = "creator_image"></img>
                <img src = "https://upload.wikimedia.org/wikipedia/en/2/2f/Jerry_Mouse.png"></img>
                <img src = "https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png" alt = "Tom_image"></img>
        </div>
        <p class = "comedy story">
            The series features comic fights between an iconic pair of adversaries, 
            a house cat (Tom) and a mouse (Jerry). The plots of each short usually center on Tom's 
            numerous attempts to capture Jerry and the mayhem and destruction that follows. 
            Tom rarely succeeds in catching Jerry, mainly because of Jerry's cleverness, 
            cunning abilities, and luck. 
        </p>
        <i>Tom and Jerry show is a full length comedy show</i>
 </body>
</html>

"""

# The SoupStrainer object allows to take the segment/tag needed and ignore the 
# rest
strained_a = SoupStrainer('div')
print (strained_a)
print (type(strained_a))

div|{}
<class 'bs4.element.SoupStrainer'>


In [89]:
# BS parser
soup_j = BeautifulSoup(html_f, 'lxml', parse_only = strained_a)
soup_j.prettify()

'<div>\n <img alt="creator_image" height="300" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>\n <img alt="creator_image" height="300" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>\n <img src="https://upload.wikimedia.org/wikipedia/en/2/2f/Jerry_Mouse.png"/>\n <img alt="Tom_image" src="https://upload.wikimedia.org/wikipedia/en/f/f6/Tom_Tom_and_Jerry.png"/>\n</div>'

In [91]:
# The SoupStrainer object allows to take the segment/tag needed and ignore the 
# rest
strained_b = SoupStrainer('a')

# BS parser
soup_k = BeautifulSoup(html_f, 'lxml', parse_only = strained_b)
soup_k.prettify()

'<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">\n William_Hanna\n</a>\n<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">\n Joseph Barbera\n</a>\n<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">\n Tom\n</a>\n<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">\n Jerry\n</a>'

In [92]:
# Soup strainer works not only with class but with attributes
strained_c = SoupStrainer(alt = 'creator_image')

# BS parser
soup_l = BeautifulSoup(html_f, 'lxml', parse_only = strained_c)
soup_l.prettify()

'<img alt="creator_image" height="300" src="https://upload.wikimedia.org/wikipedia/commons/d/d2/William_Hanna_1977.jpg" width="300"/>\n<img alt="creator_image" height="300" src="https://upload.wikimedia.org/wikipedia/commons/6/67/JBarbera.jpg" width="300"/>\n'

In [97]:
# Soup strainer works not only with class but with classes and tag IDs
strained_d = SoupStrainer(id = re.compile('link'))

# BS parser
soup_m = BeautifulSoup(html_f, 'lxml', parse_only = strained_d)
soup_m.prettify()

'<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">\n William_Hanna\n</a>\n<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">\n Joseph Barbera\n</a>\n<a class="character" href="https://en.wikipedia.org/wiki/Tom_Cat" id="link3">\n Tom\n</a>\n<a class="character" href="https://en.wikipedia.org/wiki/Jerry_Mouse" id="link4">\n Jerry\n</a>'

In [100]:
# Soup strainer works not only with class but with attributes
strained_c = SoupStrainer(class_ = 'creator')

# BS parser
soup_l = BeautifulSoup(html_f, 'lxml', parse_only = strained_c)
soup_l.prettify()

'<a class="creator" href="https://en.wikipedia.org/wiki/William_Hanna" id="link1">\n William_Hanna\n</a>\n<a class="creator" href="https://en.wikipedia.org/wiki/Joseph_Barbera" id="link2">\n Joseph Barbera\n</a>'