In [1]:
from bs4 import BeautifulSoup

In [3]:
# Small, self-contained HTML document used as the parsing sample in this notebook.
html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""


In [None]:
# Parse the HTML string with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')
# Technically, our HTML code has now become an object that we can query.
soup

In [5]:
# Attribute-style navigation: the first <h3> found inside <body>.
print(soup.body.h3)

<h3 data-example="yes">hi</h3>


In [6]:
# find() returns only the first matching tag.
print(soup.find('div'))

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [7]:
# find_all() returns every matching tag, collected in a list.
print(soup.find_all('div'))

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


In [8]:
# find_all() gives us a list, so we can iterate over it tag by tag.
for element in soup.find_all('div'):
    print(element)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<div data-example="yes">bye</div>


In [9]:
# We can filter by CSS class. The keyword is spelled class_ because
# `class` is a reserved word in Python.
soup.find_all(class_="special")

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>]

In [10]:
# Filter by an arbitrary attribute/value pair through the attrs dict.
soup.find_all(attrs={'data-example': 'yes'})

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

In [11]:
# select() takes a CSS selector; a bare tag name matches every such tag.
soup.select('div')

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text.</p>
 </div>,
 <div data-example="yes">bye</div>]

In [12]:
# '#first' is the CSS id selector.
soup.select('#first')

[<div id="first">
 <h3 data-example="yes">hi</h3>
 <p>more text.</p>
 </div>]

In [13]:
# '.special' is the CSS class selector.
soup.select('.special')

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>]

In [14]:
# CSS attribute selectors are written in square brackets; this one matches
# any tag that has a data-example attribute.
soup.select('[data-example]')

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

# Teksto gaudymas

In [15]:
# Select the elements we want (every <li> inside an <ol>) and store them
# in a variable.
ol_li = soup.select('ol li')
ol_li

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>,
 <li>This list item is not special.</li>]

In [16]:
# get_text() returns the tag's text without the surrounding markup.
ol_li[0].get_text()

'This list item is special.'

In [17]:
# Iterate over the selected items and print the text of each one.
for item in ol_li:
    print(item.get_text())

This list item is special.
This list item is also special.
This list item is not special.


In [18]:
# Collect the text of every list item into a plain Python list.
items = [li_tag.get_text() for li_tag in ol_li]
items

['This list item is special.',
 'This list item is also special.',
 'This list item is not special.']

In [19]:
# Collect every tag that carries a data-example attribute and print its text.
hi_buy = soup.select('[data-example]')
for item in hi_buy:
    print(item.get_text())

hi
bye


In [20]:
# Print all attributes of each tag; attrs is a dict of name -> value.
for item in hi_buy:
    print(item.attrs)

{'data-example': 'yes'}
{'data-example': 'yes'}


In [21]:
# Look up a single attribute value by its name.
for item in hi_buy:
    print(item.attrs['data-example'])

yes
yes


In [22]:
# Drill down to the individual pieces: the tag name and one attribute value.
for item in hi_buy:
    print(item.name, item.attrs['data-example'])

h3 yes
div yes


In [23]:
# Read the id attribute of the first <div>.
soup.find('div').attrs["id"]

'first'

# Navigacija tarp elementų

In [24]:
# Get the element's contents. This shows what the element really contains
# (including the whitespace text nodes between tags) so it can be examined;
# useful when refining the Python code that navigates the tree.
soup.div.contents

['\n', <h3 data-example="yes">hi</h3>, '\n', <p>more text.</p>, '\n']

In [25]:
# Keep a reference to the first <li> in the document.
li = soup.find('li')
li

<li class="special">This list item is special.</li>

In [26]:
# contents lists the direct children, including the newline text nodes
# that sit between the <li> tags.
soup.find('ol').contents

['\n',
 <li class="special">This list item is special.</li>,
 '\n',
 <li class="special">This list item is also special.</li>,
 '\n',
 <li>This list item is not special.</li>,
 '\n']

In [27]:
# The immediate next sibling is a newline text node, so two hops are
# needed to reach the next <li>.
li.next_sibling.next_sibling

<li class="special">This list item is also special.</li>

In [28]:
# A single next_sibling hop lands on the newline text node after the <li>.
li.next_sibling

'\n'

In [29]:
# next_element is the next node in parse order: here the <li>'s own text,
# not its sibling.
li.next_element

'This list item is special.'

In [30]:
# Two levels up the tree: <li> -> <ol> -> <body>.
li.parent.parent

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>

# Navigacija per metodus

In [31]:
# find_next_siblings() returns the following sibling tags only; the
# whitespace text nodes between them are skipped.
li.find_next_siblings()

[<li class="special">This list item is also special.</li>,
 <li>This list item is not special.</li>]

In [32]:
# No following sibling has the class 'not-special', so the result is an
# empty list.
li.find_next_siblings(class_='not-special')

[]

In [33]:
# From the <li> go up to its parent <ol>, step back to the preceding
# sibling tag (<div id="first">) and read its id.
li.find_parent().find_previous_sibling().attrs['id']

'first'

In [34]:
# Extracting the "yes" value by walking nodes in parse order from <body>:
# newline text -> <div id="first"> -> newline text -> <h3>, then reading
# the <h3>'s data-example attribute.
soup.body.next_element.next_element.next_element.next_element.attrs['data-example']

'yes'

In [1]:
import requests


In [2]:
# Fetch the Google front page. Always pass a timeout: without one,
# requests may wait indefinitely on an unresponsive server and hang the
# notebook kernel. 10 seconds is an arbitrary but generous bound for a
# single page fetch.
google = requests.get('https://google.com', timeout=10)
# Raw response body as bytes.
google.content

b'<!DOCTYPE html><html lang="lt" dir="ltr"><head><style nonce="VOA8L5GgndreG72HMUjAiQ">\na, a:link, a:visited, a:active, a:hover {\n  color: #1a73e8;\n  text-decoration: none;\n}\nbody {\n  font-family: Roboto,RobotoDraft,Helvetica,Arial,sans-serif;\n  text-align: center;\n  -ms-text-size-adjust: 100%;\n  -moz-text-size-adjust: 100%;\n  -webkit-text-size-adjust: 100%;\n}\n.box {\n  border: 1px solid #dadce0;\n  box-sizing: border-box;\n  border-radius: 8px;\n  margin: 24px auto 5px auto;\n  max-width: 800px;\n  padding: 24px;\n}\nh1 {\n  color: #2c2c2c;\n  font-size: 24px;\n  hyphens: auto;\n  margin: 24px 0;\n}\n.icaCallout {\n  background-color: #f8f9fa;\n  padding: 12px 16px;\n  border-radius: 10px;\n  margin-bottom: 10px;\n}\np, .sub, .contentText, .icaCallout {\n  color: #5f6368;;\n  font-size: 14px;\n  line-height: 20px;\n  letter-spacing: 0.2px;\n  text-align: left;\n}\n.signin {\n  text-align: right;\n}\n.saveButtonContainer,\n.saveButtonContainerNarrowScreen {\n  width: 100%;\

In [75]:
# HTTP status code of the response.
google.status_code

200

In [79]:
# Make sure the status code is 200 before parsing (a 404, for example,
# would not be OK).
# Fail loudly here instead of storing the integer status in google_soup:
# code that later calls BeautifulSoup methods on google_soup would
# otherwise die with a confusing AttributeError on an int.
if google.status_code != 200:
    raise RuntimeError(
        f'Unexpected HTTP status {google.status_code} for {google.url}'
    )
google_soup = BeautifulSoup(google.content, 'html.parser')

In [80]:
# Working step by step, we dig down to the element we want: the first
# .box element inside <body>, its first <img>, and that image's src.
# NOTE(review): this depends on the page served at request time containing
# a .box element with an <img>; the markup may differ by region or over time.
google_soup.body.select('.box')[0].select_one('img').attrs['src']

'//www.gstatic.com/images/branding/googlelogo/1x/googlelogo_color_68x28dp.png'

In [81]:
# Fetch the python.org front page. The timeout keeps the notebook from
# hanging indefinitely if the server does not respond.
python = requests.get('https://python.org', timeout=10)
python.status_code

200

In [None]:
# Parse the python.org response body with the built-in 'html.parser' backend.
pysoup = BeautifulSoup(python.content, 'html.parser')
# NOTE: this displays the page's entire <body>, so the output can be very long.
pysoup.body

In [None]:
# Take the first element with class 'menu' and collect every <li> inside it.
# select_one() returns None when nothing matches, in which case the chained
# .select() call would raise AttributeError.
pymenu = pysoup.select_one('.menu').select('li')
pymenu

In [85]:
# Print the title, link text and target of every menu entry whose link
# has a title attribute.
for item in pymenu:
    link = item.select_one('a')
    # select_one() returns None when the <li> contains no <a>; skip such
    # entries, and entries without a title, instead of raising on them.
    if link is None or 'title' not in link.attrs:
        continue
    print(f"{link.attrs['title']} = {link.get_text()}, goes to {link.attrs['href']}")


The Python Programming Language = Python, goes to /
The Python Software Foundation = PSF, goes to /psf-landing/
Python Documentation = Docs, goes to https://docs.python.org
Python Package Index = PyPI, goes to https://pypi.org/
Python Job Board = Jobs, goes to /jobs/
