# Search Using Beautiful Soup

## Searching in Beautiful Soup
• find()
• find_all()
• find_parent()
• find_parents()
• find_next_sibling()
• find_next_siblings()
• find_previous_sibling()
• find_previous_siblings()

## Searching with find()

In [2]:
from bs4 import BeautifulSoup

with open("ecologicalpyramid.html","r") as ecological_pyramid:
    soup = BeautifulSoup(ecological_pyramid,"lxml")

producer_entries = soup.find("ul")
print(producer_entries.li.div.string)

plants


### Searching for tags

In [40]:
tag_li = soup.find("li")
print(type(tag_li))

<class 'bs4.element.Tag'>


### Searching for text

In [4]:
search_for_stringonly = soup.find(text="fox")
print(search_for_stringonly)

fox


### Searching based on regular expressions

In [38]:
import re
from bs4 import BeautifulSoup

email_id_example = """<br/>
<div>The below HTML has the information that has email ids.</div>
abc@example.com
<div>xyz@example.com</div>
<span>foo@example.com</span>
"""
soup_reg = BeautifulSoup(email_id_example,"lxml")

emailid_regexp = re.compile("\w+@\w+\.\w+")
first_email_id = soup_reg.find(text=emailid_regexp)
print(first_email_id)


abc@example.com



### Searching based on attribute values of a tag

#### Finding the first primary consumer

In [8]:
primary_consumers = soup.find(id="primaryconsumers")
print(primary_consumers.li.div.string)

deer


#### Searching based on custom attributes

In [24]:
customattr = """<p data-custom="custom">custom attributeexample</p>"""
print(customattr)
customsoup = BeautifulSoup(customattr,'lxml')
customsoup.find(data-custom="custom")

SyntaxError: keyword can't be an expression (<ipython-input-24-5afb9b7b71f2>, line 4)

The error is thrown because Python variables cannot contain a - character and the
data-custom variable that we passed contained a - character.

In [23]:
using_attrs = customsoup.find(attrs={'data-custom':'custom'})
print(using_attrs)

<p data-custom="custom">custom attributeexample</p>


#### Searching based on the CSS class

In [42]:
css_class = soup.find(attrs={'class':'primaryconsumerlist'})
print(css_class)

<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>


In [43]:
css_class = soup.find(class_ = "primaryconsumerlist" )
print(css_class)

<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>


### Searching using functions defined

In [44]:
def is_secondary_consumers(tag):
    return tag.has_attr('id') and tag.get('id') == 'secondaryconsumers'

secondary_consumer = soup.find(is_secondary_consumers)
print(secondary_consumer.li.div.string)

fox


### Applying searching methods in combination

In [49]:
combination_example = """<p class="identical">
Example of p tag with class identical
</p>
<div class="identical">
Example of div tag with class identical
</div>"""

combination_example_soup = BeautifulSoup(combination_example,'lxml')

identical_div= combination_example_soup.find("div", class_='identical')
print(identical_div)

<div class="identical">
Example of div tag with class identical
</div>


## Searching with find_all()

### Finding all tertiary consumers

In [3]:
all_tertiaryconsumers = soup.find_all(class_="tertiaryconsumerlist")

for tertiaryconsumer in all_tertiaryconsumers:
    print(tertiaryconsumer.div.string)

lion
tiger


In [4]:
all_texts = soup.find_all(text=True)
print(all_texts)

['\n', '\n', '\n', '\n', '\n', 'plants', '\n', '100000', '\n', '\n', '\n', 'algae', '\n', '100000', '\n', '\n', '\n', '\n', '\n', 'deer', '\n', '1000', '\n', '\n', '\n', 'rabbit', '\n', '2000', '\n', '\n', '\n', '\n', '\n', 'fox', '\n', '100', '\n', '\n', '\n', 'bear', '\n', '100', '\n', '\n', '\n', '\n', '\n', 'lion', '\n', '80', '\n', '\n', '\n', 'tiger', '\n', '50', '\n', '\n', '\n', '\n']


In [5]:
all_texts_in_list = soup.find_all(text=["plants","algae"])
print(all_texts_in_list)

['plants', 'algae']


In [7]:
div_li_tags = soup.find_all(["div","li"])
print(div_li_tags)

[<div class="ecopyramid">
<ul id="producers">
<li class="producerlist">
<div class="name">plants</div>
<div class="number">100000</div>
</li>
<li class="producerlist">
<div class="name">algae</div>
<div class="number">100000</div>
</li>
</ul>
<ul id="primaryconsumers">
<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>
<li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>
</ul>
<ul id="secondaryconsumers">
<li class="secondaryconsumerlist">
<div class="name">fox</div>
<div class="number">100</div>
</li>
<li class="secondaryconsumerlist">
<div class="name">bear</div>
<div class="number">100</div>
</li>
</ul>
<ul id="tertiaryconsumers">
<li class="tertiaryconsumerlist">
<div class="name">lion</div>
<div class="number">80</div>
</li>
<li class="tertiaryconsumerlist">
<div class="name">tiger</div>
<div class="number">50</div>
</li>
</ul>
</div>, <li class="producerlist">
<div class="name">plants<

In [8]:
all_css_class = soup.find_all(class_=["producerlist","primaryconsumerlist"])
print(all_css_class)

[<li class="producerlist">
<div class="name">plants</div>
<div class="number">100000</div>
</li>, <li class="producerlist">
<div class="name">algae</div>
<div class="number">100000</div>
</li>, <li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>, <li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>]


In [9]:
div_li_tags = soup.find_all( ["div","li"], recursive=False)
print(div_li_tags)

[]


## Searching for Tags in relation

### Searching for the parent tags

In [30]:
primaryconsumers = soup.find_all(class_="primaryconsumerlist")
primaryconsumer = primaryconsumers[0]
parent_ul = primaryconsumer.find_parents('ul')
print(parent_ul)

[<ul id="primaryconsumers">
<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>
<li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>
</ul>]


In [31]:
parent_p = primaryconsumer.find_parent("ul")
print(parent_p)

<ul id="primaryconsumers">
<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>
<li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>
</ul>


In [33]:
immediateprimary_consumer_parent = primaryconsumer.find_parent()
immediateprimary_consumer_parent

<ul id="primaryconsumers">
<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>
<li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>
</ul>

### Searching for siblings

In [34]:
producers= soup.find(id='producers')
next_siblings = producers.find_next_siblings()
print(next_siblings)

[<ul id="primaryconsumers">
<li class="primaryconsumerlist">
<div class="name">deer</div>
<div class="number">1000</div>
</li>
<li class="primaryconsumerlist">
<div class="name">rabbit</div>
<div class="number">2000</div>
</li>
</ul>, <ul id="secondaryconsumers">
<li class="secondaryconsumerlist">
<div class="name">fox</div>
<div class="number">100</div>
</li>
<li class="secondaryconsumerlist">
<div class="name">bear</div>
<div class="number">100</div>
</li>
</ul>, <ul id="tertiaryconsumers">
<li class="tertiaryconsumerlist">
<div class="name">lion</div>
<div class="number">80</div>
</li>
<li class="tertiaryconsumerlist">
<div class="name">tiger</div>
<div class="number">50</div>
</li>
</ul>]


### Searching for next

In [37]:
first_div = soup.div
all_li_tags = first_div.find_all_next("li")
all_li_tags

[<li class="producerlist">
 <div class="name">plants</div>
 <div class="number">100000</div>
 </li>,
 <li class="producerlist">
 <div class="name">algae</div>
 <div class="number">100000</div>
 </li>,
 <li class="primaryconsumerlist">
 <div class="name">deer</div>
 <div class="number">1000</div>
 </li>,
 <li class="primaryconsumerlist">
 <div class="name">rabbit</div>
 <div class="number">2000</div>
 </li>,
 <li class="secondaryconsumerlist">
 <div class="name">fox</div>
 <div class="number">100</div>
 </li>,
 <li class="secondaryconsumerlist">
 <div class="name">bear</div>
 <div class="number">100</div>
 </li>,
 <li class="tertiaryconsumerlist">
 <div class="name">lion</div>
 <div class="number">80</div>
 </li>,
 <li class="tertiaryconsumerlist">
 <div class="name">tiger</div>
 <div class="number">50</div>
 </li>]

### Searching for previous
Searching for previous is the opposite case of next, where we can find the previous
object associated with a particular object. We can use the find_all_previous()
method to find all the previous objects associated with the current object and find_
previous() to find the previous object associated with the current object.

## Using search methods to scrape information from a web page

In [74]:
import urllib.request 
from bs4 import BeautifulSoup
url = "http://www.packtpub.com/books"
page = urllib2.urlopen(url)
soup_packtpage = BeautifulSoup(page)
page.close()

<!DOCTYPE html>
<html lang="en">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# product: http://ogp.me/ns/product#">
<script>
    var BASE_URL = 'https://www.packtpub.com/';
    var require = {
        "baseUrl": "https://www.packtpub.com/static/version1588085190/frontend/Packt/default/en_GB"
    };
</script>
<meta charset="utf-8"/><script type="text/javascript">(window.NREUM||(NREUM={})).loader_config={licenseKey:"NRJS-0f4d86b78cc0c8047b9",applicationID:"475968873"};window.NREUM||(NREUM={}),__nr_require=function(e,n,t){function r(t){if(!n[t]){var i=n[t]={exports:{}};e[t][0].call(i.exports,function(n){var i=e[t][1][n];return r(i||n)},i,i.exports)}return n[t].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<t.length;i++)r(t[i]);return r}({1:[function(e,n,t){function r(){}function i(e,n,t){return function(){return o(e,[u.now()].concat(f(arguments)),n?null:this,t),n?void 0:this}}var o=e("handle"),a=e(4),f=e(5),c=e("ee").get("tracer"),u=e("loader")

In [73]:
all_books = soup.find(class_ = "col-md-12")
print(all_books)

None


In [71]:
help soup

SyntaxError: invalid syntax (<ipython-input-71-ce87c886f351>, line 1)