# BeautifulSoup

In [51]:
from bs4 import BeautifulSoup

In [52]:
html = """
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>First HTML Page</title>
</head>
<body>
  <div id="first">
    <h3 data-example="yes">hi</h3>
    <p>more text.</p>
  </div>
  <ol>
    <li class="special">This list item is special.</li>
    <li class="special">This list item is also special.</li>
    <li>This list item is not special.</li>
  </ol>
  <div data-example="yes">bye</div>
</body>
</html>
"""

In [53]:
html

'\n<!DOCTYPE html>\n<html lang="en">\n<head>\n  <meta charset="UTF-8">\n  <title>First HTML Page</title>\n</head>\n<body>\n  <div id="first">\n    <h3 data-example="yes">hi</h3>\n    <p>more text.</p>\n  </div>\n  <ol>\n    <li class="special">This list item is special.</li>\n    <li class="special">This list item is also special.</li>\n    <li>This list item is not special.</li>\n  </ol>\n  <div data-example="yes">bye</div>\n</body>\n</html>\n'

In [54]:
soup = BeautifulSoup(html, "html.parser")
print(type(soup))

<class 'bs4.BeautifulSoup'>


In [55]:
soup


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<title>First HTML Page</title>
</head>
<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>
</html>

In [56]:
print(soup.body) # spausdina tik body

# <body>
# <div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>
# <ol>
# <li class="special">This list item is special.</li>
# <li class="special">This list item is also special.</li>
# <li>This list item is not special.</li>
# </ol>
# <div data-example="yes">bye</div>
# </body>

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.</li>
<li class="special">This list item is also special.</li>
<li>This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>


In [57]:
print(soup.body.div)

# <div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


## find(), find_all()

In [58]:
print(soup.find('div'))

# <div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>


In [59]:
print(soup.find_all('div')) # rado pirma ir antra 'div'
    

# [<div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>, <div data-example="yes">bye</div>]

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


In [60]:
for element in soup.find_all('div'):
    print(element)

<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<div data-example="yes">bye</div>


In [61]:
print(soup.find_all(class_='special'))

# [<li class="special">This list item is special.</li>, <li class="special">This list item is also special.</li>]

[<li class="special">This list item is special.</li>, <li class="special">This list item is also special.</li>]


In [62]:
soup.find(attrs={'data-example':'yes'})

<h3 data-example="yes">hi</h3>

In [63]:
soup.find_all(attrs={'data-example':'yes'})

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

# Išrinkimas CSS stiliumi

In [64]:
print(soup.select('#first'))

# [<div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>]

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>]


In [65]:
print(soup.select('.special'))

# [<li class="special">This list item is special.</li>, <li class="special">This list item is also special.</li>]

[<li class="special">This list item is special.</li>, <li class="special">This list item is also special.</li>]


In [66]:
print(soup.select('div'))

# [<div id="first">
# <h3 data-example="yes">hi</h3>
# <p>more text.</p>
# </div>, <div data-example="yes">bye</div>]

[<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>, <div data-example="yes">bye</div>]


In [67]:
print(soup.select('[data-example]'))

# [<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]

[<h3 data-example="yes">hi</h3>, <div data-example="yes">bye</div>]


## get_text()

In [68]:
# Kuomet norime ištraukti patį tekstą iš HTML bloko, darome taip:

element = soup.select('.special')[0]
print(element.get_text())

# This list item is special.

This list item is special.


In [69]:
ol_li = soup.select('ol li')
ol_li

[<li class="special">This list item is special.</li>,
 <li class="special">This list item is also special.</li>,
 <li>This list item is not special.</li>]

In [70]:
# Galime ir praiteruoti:
elements = soup.select('.special')
for element in elements:
  print(element.get_text())

# This list item is special.
# This list item is also special.

This list item is special.
This list item is also special.


In [71]:
ol_li[0].get_text()

'This list item is special.'

In [72]:
for item in ol_li:
    item.append(item.get_text())

In [73]:
item = []
for item in ol_li:
    item.append(item.get_text())
item

<li>This list item is not special.This list item is not special.This list item is not special.This list item is not special.</li>

In [74]:
hi_bye = soup.select('[data-example]')
for item in hi_bye:
    print(item.get_text())

hi
bye


In [75]:
for item in hi_bye:
    print(item.name, item.attrs['data-example'])

h3 yes
div yes


## .name, .attrs

In [76]:
# norėdami gauti blokų pavadinimus pagal (pvz) klasę, darysime taip:
elements = soup.select('.special')
for element in elements:
  print(element.name)

# li
# li

li
li


In [77]:
# norėdami išgauti bloko atributus, naudosime .attrs:
elements = soup.select('meta')
print(elements[0].attrs)

# {'charset': 'UTF-8'}

{'charset': 'UTF-8'}


In [78]:
# Atributų reikšmes galime sužinoti tokiu būdu:
attribute = soup.find('div')['id']
print(attribute)

# first

first


In [79]:
soup.find('div').attrs['id']

'first'

## Navigacija tarp HTML elementų

In [80]:
print(soup.div.contents)

# ['\n', <h3 data-example="yes">hi</h3>, '\n', <p>more text.</p>, '\n']

['\n', <h3 data-example="yes">hi</h3>, '\n', <p>more text.</p>, '\n']


In [81]:
# būdas turi trūkumą, nes į sąrašą įtraukia naujos eilutės simbolius. Panagrinėkime html atkarpą:

# <body>
#   <div id="first">
#     <h3 data-example="yes">hi</h3>
#     <p>more text.</p>
#   </div>
#   <ol>
#     <li class="special">This list item is special.</li>
#     <li class="special">This list item is also special.</li>
#     <li>This list item is not special.</li>
#   </ol>
#   <div data-example="yes">bye</div>
# </body>


In [82]:
li = soup.find('li')
li

<li class="special">This list item is special.This list item is special.This list item is special.This list item is special.</li>

In [83]:
li.next_sibling.next_sibling

<li class="special">This list item is also special.This list item is also special.This list item is also special.This list item is also special.</li>

In [84]:
li.next_element

'This list item is special.'

In [85]:
for element in li.parent.next_elements:
    print(element)



<li class="special">This list item is special.This list item is special.This list item is special.This list item is special.</li>
This list item is special.
This list item is special.
This list item is special.This list item is special.


<li class="special">This list item is also special.This list item is also special.This list item is also special.This list item is also special.</li>
This list item is also special.
This list item is also special.
This list item is also special.This list item is also special.


<li>This list item is not special.This list item is not special.This list item is not special.This list item is not special.</li>
This list item is not special.
This list item is not special.
This list item is not special.This list item is not special.




<div data-example="yes">bye</div>
bye








In [86]:
# body turi tris 'vaikus' - div, ol, div, vienas kitam jie yra 'broliai'(siblings), nes yra viename hierarchijos lygyje.
# li = soup.find('li')
# print(li.next_sibling.next_sibling) #todėl, kad '\n' užskaito už siblingą

# <li class="special">This list item is also special.</li>

In [87]:
# suradome sekantį pirmojo li elemento 'brolį' (analogiškai galima dirbti su previous_sibling)
print(li.parent)

# <ol>
# <li class="special">This list item is special.</li>
# <li class="special">This list item is also special.</li>
# <li>This list item is not special.</li>
# </ol>

<ol>
<li class="special">This list item is special.This list item is special.This list item is special.This list item is special.</li>
<li class="special">This list item is also special.This list item is also special.This list item is also special.This list item is also special.</li>
<li>This list item is not special.This list item is not special.This list item is not special.This list item is not special.</li>
</ol>


In [88]:
li.parent.parent

<body>
<div id="first">
<h3 data-example="yes">hi</h3>
<p>more text.</p>
</div>
<ol>
<li class="special">This list item is special.This list item is special.This list item is special.This list item is special.</li>
<li class="special">This list item is also special.This list item is also special.This list item is also special.This list item is also special.</li>
<li>This list item is not special.This list item is not special.This list item is not special.This list item is not special.</li>
</ol>
<div data-example="yes">bye</div>
</body>

In [89]:
li.find_next_sibling()

<li class="special">This list item is also special.This list item is also special.This list item is also special.This list item is also special.</li>

In [90]:
print(li.find_next_sibling(class_=''))

<li>This list item is not special.This list item is not special.This list item is not special.This list item is not special.</li>


## Kombinacijos

In [93]:
# naviguojant galime kurti tokias ir panašias grandines:

li.find_parent().find_previous_sibling().attrs['id']

'first'

In [101]:
soup.body.next_element.next_element.next_element.next_element.attrs['data-example']

'yes'

In [95]:
res = soup.body.next_element.next_element.next_element.next_element.get_text()
print(res)

hi


## Request

In [115]:
# Requests yra Python biblioteka darbui su HTTP užklausomis.
# Norint pradėti darbą, reikia importuoti requests:
import requests

In [116]:
# Dabar susikurkime objektą, kuris bus atsakas į mūsų užklausą:
google = requests.get('http://google.com')
google


<Response [200]>

In [105]:
google.status_code

200

In [117]:
google.content

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="tr"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="mDkeFUt594woAhEFzwqQkA">(function(){window.google={kEI:\'n3NjY-imFMulptQPwdm1wAM\',kEXPI:\'0,1302536,56873,6059,206,4804,2316,383,246,5,5367,1123753,1197731,380760,16114,28684,22431,1361,12311,2823,14765,4998,13226,3849,10622,22741,5081,887,706,1279,2742,149,1103,840,1983,214,4100,3514,606,2023,1733,43,521,14670,3227,2845,7,33770,1851,6397,9359,3,346,230,6459,150,13974,4,1528,2304,7039,27731,7355,11444,2216,21223,5827,2530,4094,4052,3,3541,1,42154,2,28138,11623,5679,1020,2381,12485,16257,4568,6258,23418,1252,5835,14967,4333,7484,445,2,2,1,1385,15921,9326,8155,6680,701,15970,873,7829,11804,7,1922,5784,3995,20639,1140,9543,4832,11642,5496,700,4,1,2,2,2,2,8652,8793,88,9237,2,2017,14,82,949,2941,751,201,1869,

In [109]:
if google.status_code == 200:
    google_soup = BeautifulSoup(google.content, 'html.parser')
else:
    google_soup = google.status_code

In [114]:
google_soup.body.select('.box')[0].select_one('img').attrs['src']

IndexError: list index out of range

In [119]:
python = requests.get('https://python.org')
python.status_code

200

In [120]:
pysoup = BeautifulSoup(python.content, "html.parser")
pysoup.body

<body class="python home" id="homepage">
<div id="touchnav-wrapper">
<div class="do-not-print" id="nojs">
<p><strong>Notice:</strong> While JavaScript is not essential for this website, your interaction with the content will be limited. Please turn JavaScript on for the full experience. </p>
</div>
<!--[if lte IE 8]>
            <p>
                <strong>Notice:</strong> Your browser is <em>ancient</em>. Please
                <a href="http://browsehappy.com/">upgrade to a different browser</a> to experience a better web.
            </p>
        </div>
        <![endif]-->
<!-- Sister Site Links -->
<div class="top-bar do-not-print" id="top">
<nav class="meta-navigation container" role="navigation">
<div class="skip-link screen-reader-text">
<a href="#content" title="Skip to content">Skip to content</a>
</div>
<a aria-hidden="true" class="jump-link" href="#python-network" id="close-python-network">
<span aria-hidden="true" class="icon-arrow-down"><span>▼</span></span> Close
        

In [121]:
pymenu=pysoup.select_one('.menu').select('li')
pymenu

[<li class="python-meta current_item selectedcurrent_branch selected">
 <a class="current_item selectedcurrent_branch selected" href="/" title="The Python Programming Language">Python</a>
 </li>,
 <li class="psf-meta">
 <a href="/psf-landing/" title="The Python Software Foundation">PSF</a>
 </li>,
 <li class="docs-meta">
 <a href="https://docs.python.org" title="Python Documentation">Docs</a>
 </li>,
 <li class="pypi-meta">
 <a href="https://pypi.org/" title="Python Package Index">PyPI</a>
 </li>,
 <li class="jobs-meta">
 <a href="/jobs/" title="Python Job Board">Jobs</a>
 </li>,
 <li class="shop-meta">
 <a href="/community-landing/">Community</a>
 </li>]

In [123]:
for item in pymenu:
    link=item.select_one('a')
    # print(f"{link.attrs['title']}) = {link.get_text()}")
    if 'title' in link.attrs:
        print(link.attrs['title'], end=" = ")
    print(link.get_text(), end=", goes to ")
    print(link.attrs['href'])

The Python Programming Language = Python, goes to /
The Python Software Foundation = PSF, goes to /psf-landing/
Python Documentation = Docs, goes to https://docs.python.org
Python Package Index = PyPI, goes to https://pypi.org/
Python Job Board = Jobs, goes to /jobs/
Community, goes to /community-landing/


## Praktinis

In [125]:
r_prusija = requests.get('https://lt.wikipedia.org/wiki/Pr%C5%ABsija')
if r_prusija.status_code == 200:
    prusija = BeautifulSoup(r_prusija.content, "html.parser")
else:
    prusija = r_prusija.status_code
prusija

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="lt">
<head>
<meta charset="utf-8"/>
<title>Prūsija – Vikipedija</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","sausio","vasario","kovo","balandžio","gegužės","birželio","liepos","rugpjūčio","rugsėjo","spalio","lapkričio","gruodžio"],"wgRequestId":"890fddc5-a74b-4087-8198-ccdf94724381","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Prūsija","wgTitle":"Prūsija","wgCurRevisionId":6720439,"wgRevisionId":6720439,"wgArticleId":653,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pages with non-numeric formatnum arguments","Straipsniai be šaltinių nuo 2004 m. sausio","Straipsniai be šaltinių pažymėti nuo 2020 m. lapkričio","Straipsniai 

In [132]:
prusija_info = {}
headings = []
for heading in  prusija.select('h2'):
    title = heading.select_one('span').get_text()
    # print(title)
    content = heading.find_next('p').get_text()
    # print(content)
    prusija_info[title] = content
for heading in prusija.select('h3'):
    title=heading.select_one('.mw-headline').get_text()
    # print("--- ", title)
    content = heading.find_next('p').get_text()
    # print(content)
    prusija_info[title] = content

for title, content in prusija_info.items():
    print(f"--- {title} ---")
    print(f"{content}\n")




--- Geografija ---
Prūsijos regionas užėmė teritoriją, šiuo metu padalintą tarp trijų valstybių. Šiaurėje jis buvo ribojamas Baltijos jūros, ir krantas čia yra labai sudėtingas, turi keletą didelių lagūnų (Kuršių marios, Aistmarės), nerijų. Rytuose Prūsija ribojosi su Lietuva, pietuose – su Mazovija, pietryčiuose – su Palenke, o vakaruose – su Pomerelija (nuo kurios skyrė Vysla). Pastarasis, dar vadinamas Vakarų Prūsija, gali būti laikoma išplėstinio Prūsijos regiono dalimi.


--- Istorija ---
Regionas patyrė labai sudėtingą istoriją, čia du kartus visiškai keitėsi etninė sudėtis.


--- Senovės Prūsija ---
Senovės Prūsija – baltiškos prūsų tautos gyventa teritorija pietryčių Pabaltijyje, tarp Vyslos ir Nemuno upių. Čia konsolidavosi ankstyvosios gentinės žemės. XIII a. Kryžiaus žygių į Prūsiją metu šią teritoriją nukariavo Vokiečių ordinas, krašte įvesta krikščionybė.


--- Vokiečių valdymas ---
Nuo XIII a. regione viešpatavo Vokiečių ordinas, kuris padarė Prūsiją savo politiniu ir kul