## Get static site soup

### HTML soup

In [None]:
from bs4 import BeautifulSoup
import requests

def getsoup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` tree built from the response body, or ``None``
        (after printing the status code) when the response is not HTTP 200.

    Raises:
        requests.RequestException: Connection errors and timeouts raised by
            ``requests.get`` are not caught here.
    """
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print('Failed to get page. Status Code:', r.status_code)
        return None
    try:
        return BeautifulSoup(r.content, 'lxml')
    except Exception:
        # Best-effort fallback: if the lxml parser is unavailable or fails,
        # retry with the parser that ships with Python.
        return BeautifulSoup(r.content, 'html.parser')

# Fetch the mock page once; `soup` is the parsed tree used by the examples.
url = "https://jivraj-18.github.io/tds-jan-2025-mock-roe-1/"
soup = getsoup(url)
# getsoup returns None on a non-200 response, so only pretty-print a real tree.
if soup is not None:
    print(soup.prettify())

### XML soup

In [None]:
def getxmlsoup(xml_fname):
    """Parse the XML file at ``xml_fname`` into a ``BeautifulSoup`` tree.

    The file is read as bytes so that the parser decides how to decode it
    (using the encoding named in the XML declaration) instead of relying on
    the platform's default text encoding.

    Args:
        xml_fname: Path of the XML file to read.

    Returns:
        A ``BeautifulSoup`` tree built with the lxml XML parser.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(xml_fname, 'rb') as f:
        xml = f.read()
    # 'lxml-xml' and 'xml' are two names for the same lxml-backed XML parser,
    # so a second attempt under the other name could not succeed where this
    # one fails.
    return BeautifulSoup(xml, 'lxml-xml')

# Parse the sample XML file and print the indented tree for inspection.
xml_fname = "P1File-1.xml"
xml_soup = getxmlsoup(xml_fname)
print(xml_soup.prettify())

## Navigating the tree (attribute-based)

#### Example tag

In [26]:
# Attribute access by tag name returns the first tag with that name in the
# tree; here, the first <div> of the page.
div_tag = soup.div
div_tag  # bare last expression, so the notebook displays the tag

<div class="t">
<div class="r">
<div class="c">YEAR</div>
<div class="c">ST_NAME</div>
</div>
<div class="r">
<div class="c"><a href="0UAqFzWs.html">1955</a></div>
<div class="c"><a href="0UAqFzWs.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="DK4FrUMp.html">1957</a></div>
<div class="c"><a href="DK4FrUMp.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="48Y3tT3Q.html">1962</a></div>
<div class="c"><a href="48Y3tT3Q.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="DgAL47D1.html">1967</a></div>
<div class="c"><a href="DgAL47D1.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="qXIaSyZP.html">1972</a></div>
<div class="c"><a href="qXIaSyZP.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="aE1pu1lJ.html">1978</a></div>
<div class="c"><a href="aE1pu1lJ.html">ANDHRA PRADESH</a></div>
</div>
<div class="r">
<div class="c"><a href="o7XBetF5.html">1983<

#### .contents and .children

In [None]:
# .contents is a list holding every direct child of the tag at once
# (direct children only, not grandchildren).
div_tag.contents

In [None]:
# .children yields the same direct children as .contents, but lazily through
# an iterator instead of a list. Direct children can be text nodes (such as
# the whitespace between tags) as well as tags.
for child in div_tag.children:
    print(child.name)

#### .descendants

In [None]:
# .descendants walks the whole subtree recursively: children, their children,
# and so on, yielding each node through an iterator.
for desc in div_tag.descendants:
    print(desc.name)

#### .parent and .parents

In [None]:
# NOTE: in a notebook cell only the value of the last expression is displayed.
div_tag.parent  # the element that directly contains this tag
div_tag.parents  # iterator over every ancestor, from the direct parent up to the top of the tree

#### Siblings

In [None]:
# Siblings are nodes that share this tag's parent. The adjacent sibling is
# often a whitespace text node rather than another tag.
div_tag.next_sibling  # the node right after this tag under the same parent
div_tag.previous_sibling  # the node right before this tag under the same parent

div_tag.next_siblings  # iterator over all following siblings
div_tag.previous_siblings  # iterator over all preceding siblings

In [None]:
# *_element follows parse order, not the sibling relationship: next_element is
# whatever was parsed immediately after this tag (typically its first child).
div_tag.next_element  # the node parsed immediately after this tag
div_tag.next_elements  # iterator over everything parsed after this tag

div_tag.previous_element  # the node parsed immediately before this tag
div_tag.previous_elements  # iterator over everything parsed before this tag

## Searching the tree (method-based)

In [None]:
import re

# Placeholder filter values. Each filter may be a plain string or a compiled
# regex; `'' or re.compile('')` evaluates to the compiled pattern (the empty
# string is falsy) and is written this way to show both accepted types.
name = '' or re.compile('')
attrs = {

}
recursive = False
string = '' or re.compile('')
limit = None
# Extra keyword arguments are matched against HTML attributes; regex values
# are allowed here too. Defined in this cell so it runs on its own.
kwargs = {}

# search by tag
# Filters are passed by keyword so each value reaches the parameter it is
# meant for, whatever the positional order of a given method's signature.
div_tag.find_all(name=name, attrs=attrs, recursive=recursive, string=string, limit=limit, **kwargs)
div_tag.find(name=name, attrs=attrs, recursive=recursive, string=string, **kwargs)


# finding parents
# These two methods have no `string` parameter, so only name/attrs (and limit)
# are passed.
div_tag.find_parents(name=name, attrs=attrs, limit=limit, **kwargs)
div_tag.find_parent(name=name, attrs=attrs, **kwargs)

# finding next siblings
div_tag.find_next_siblings(name=name, attrs=attrs, string=string, limit=limit, **kwargs)
div_tag.find_next_sibling(name=name, attrs=attrs, string=string, **kwargs)

# finding previous siblings
div_tag.find_previous_siblings(name=name, attrs=attrs, string=string, limit=limit, **kwargs)
div_tag.find_previous_sibling(name=name, attrs=attrs, string=string, **kwargs)


# finding all after current
div_tag.find_all_next(name=name, attrs=attrs, string=string, limit=limit, **kwargs)
div_tag.find_next(name=name, attrs=attrs, string=string, **kwargs)

# finding all before current
div_tag.find_all_previous(name=name, attrs=attrs, string=string, limit=limit, **kwargs)
div_tag.find_previous(name=name, attrs=attrs, string=string, **kwargs)

## Searching using css selector

In [None]:
# finding using css selector
selector = 'div'
limit = None
kwargs = {}
div_tag.select(selector, limit, **kwargs)