# BeautifulSoup

When prompted with `Enter - `, type http://www.dr-chuck.com/page1.htm or the URL of another web page.

In [None]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = input('Enter - ')

# Open the response as a context manager so the underlying connection is
# closed as soon as the page body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # get() already returns None when the tag has no href attribute
    print(tag.get('href'))

You can also parse XML with BeautifulSoup:

In [None]:
data = ''' 
<person>
  <name> Chuck </name> 
  <phone type="intl"> +1 734 303 4456 </phone>
  <email hide="yes" />
</person>'''

# The 'xml' feature selects Beautiful Soup's XML parser, which is provided by lxml.
tree = BeautifulSoup(data, 'xml')

# Locate the <phone> element and read its "type" attribute.
phone = tree.find('phone')
phone_type = phone.get('type')

# The element text carries surrounding spaces, hence the strip().
print(f'{phone.string.strip()} is an {phone_type} number')

However, there are better options for working with XML, covered below.

# XML

## xml.etree.ElementTree

'fromstring' converts the string representation of the XML into a “tree” of XML nodes. 

When the XML is in a tree, we have a series of methods we can call to extract portions of data from the XML.

The 'find' function searches through the XML tree and retrieves a node that matches the specified tag. Each node can have some text, some attributes (like hide), and some “child” nodes. Each node can be the top of a tree of nodes.

In [None]:
import xml.etree.ElementTree as ET  

In [None]:
data = ''' 
<person>
  <name> Chuck </name> 
  <phone type="intl"> +1 734 303 4456 </phone>
  <email hide="yes" />
</person>'''

# Parse the XML text; the result is the root <person> element.
tree = ET.fromstring(data)

# Show the element object itself, followed by a blank line.
print(tree, end='\n\n')

# Element text comes from .text, attribute values from .get().
print('Name:', tree.find('name').text)
print('Attr:', tree.find('email').get('hide'), end='\n\n')

# Serialize the tree back into its XML string representation.
print(ET.tostring(tree))

In [None]:
# Raw text of <name>: the spaces around "Chuck" in the XML are preserved.
tree.find('name').text

In [None]:
# strip() removes the whitespace that surrounds the name inside <name>.
tree.find('name').text.strip()

In [None]:
input_xml = ''' 
<stuff>
    <users>
        <user x="2">
            <id>001</id>
            <name>Chuck</name> 
        </user>
        <user x="7"> 
            <id>009</id>
            <name>Brent</name> 
        </user>
    </users> 
</stuff>'''

# Build the element tree; `stuff` is just a label for the root element.
stuff = ET.fromstring(input_xml)
# The path expression selects every <user> nested under <users>.
lst = stuff.findall('users/user')
print('User count:', len(lst))

for item in lst:
    print(item)
    # Child element text is read via find(); the x attribute via get().
    for label, child_tag in (('Name:', 'name'), ('Id:', 'id')):
        print(label, item.find(child_tag).text)
    print('Attribute:', item.get('x'))

In [None]:
bookstore = '''
<Bookstore> 
  <Book ID="101">
     <Author>John Doe</Author>
     <Title>Introduction to XML</Title>
     <Date>12 June 2001</Date>
     <ISBN>121232323</ISBN>
     <Publisher>XYZ</Publisher>
     <Price>45</Price>
  </Book>
  <Book ID="102">
     <Author>Jane Doe</Author> 
     <Title language="en">Introduction to XSL</Title>
     <Date>12 June 2001</Date>
     <ISBN>12323573</ISBN>
     <Publisher>ABC</Publisher>
     <Price>25</Price>
  </Book>
</Bookstore>
'''

# Parse the XML; `bks` is the root <Bookstore> element.
bks = ET.fromstring(bookstore)

# Positional predicate: Book[2] is the second <Book> (positions start at 1).
second_book_author = 'Book[2]/Author'
print(bks.findall(second_book_author))    # list of matching elements
print(bks.find(second_book_author).text)  # text of the first match

In [None]:
# Attribute predicate: author of the Book whose ID attribute equals '102'.
bks.find("Book[@ID='102']/Author").text

In [None]:
# Child-text predicate: author of the Book whose <Publisher> text is 'ABC'.
bks.find("Book[Publisher='ABC']/Author").text

In [None]:
# Same kind of predicate on <Price>: '25' is compared as text, not as a number.
bks.find("Book[Price='25']/Author").text

In [None]:
# Unfortunately, ElementTree does not support arithmetic comparisons.
# The predicate below matches no Book, so find() returns None; chaining .text
# directly onto it would raise AttributeError and stop a full notebook run.
expensive_author = bks.find("Book[Price>30]/Author")
if expensive_author is None:
    print("No match: ElementTree cannot evaluate the predicate [Price>30]")
else:
    print(expensive_author.text)

# lxml
The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt:
https://lxml.de/tutorial.html

Better support for XPath queries: 
https://lxml.de/xpathxslt.html

In [None]:
from lxml import etree

In [None]:
# Same bookstore document as in the ElementTree section, except that the
# second <Author> now has surrounding spaces (' Jane Doe ').
bookstore = '''
<Bookstore> 
  <Book ID="101">
     <Author>John Doe</Author>
     <Title>Introduction to XML</Title>
     <Date>12 June 2001</Date>
     <ISBN>121232323</ISBN>
     <Publisher>XYZ</Publisher>
     <Price>45</Price>
  </Book>
  <Book ID="102">
     <Author> Jane Doe </Author> 
     <Title language="en">Introduction to XSL</Title>
     <Date>12 June 2001</Date>
     <ISBN>12323573</ISBN>
     <Publisher>ABC</Publisher>
     <Price>25</Price>
  </Book>
</Bookstore>
'''

# Parse the string with lxml. Note that this rebinds `tree`, which the
# ElementTree section used for a different document.
tree = etree.XML(bookstore)
# Bare expression: display the parsed root element.
tree

In [None]:
# Returns a list of the matching <Author> elements (not their text).
tree.xpath("Book[Publisher='ABC']/Author")

In [None]:
# Ending the path with text() returns the author names as strings instead of elements.
tree.xpath("Book[Price='25']/Author/text()")

In [None]:
# No match expected: in this document the Author text is ' Jane Doe ' (with
# spaces), and '=' compares the whole string.
tree.xpath("Book[Author='Jane Doe']")

In [None]:
# Matches once the surrounding spaces are included in the compared string.
tree.xpath("Book[Author=' Jane Doe ']")

In [None]:
# Numeric comparison inside a predicate: author names of books priced above 30.
tree.xpath("Book[Price>30]/Author/text()")

In [None]:
# Books priced above 10; without text() the result is a list of <Author> elements.
tree.xpath("Book[Price>10]/Author")

In [None]:
# Select the Book by its ID attribute, then take the author's text.
tree.xpath("Book[@ID='102']/Author/text()")

In [None]:
# Numeric comparison on an attribute: books whose ID is greater than 100.
tree.xpath("Book[@ID > 100]/Author/text()")

In [None]:
# A path can end in an attribute: ID values of the books priced above 10.
tree.xpath("Book[Price>10]/@ID")

In [None]:
# Serialize the tree back to XML; the result is a bytes object.
etree.tostring(tree)

In [None]:
# Decode the serialized bytes to str so print() shows real line breaks.
print(etree.tostring(tree).decode() )

Go back to slides

# JSON

In [None]:
# For comparison, the same record expressed as XML:
# data = ''' 
# <person>
#   <name> Chuck </name> 
#   <phone type="intl"> +1 734 303 4456 </phone>
#   <email hide="yes" />
# </person>'''

import json
data = '''{
  "name" : "Chuck",
  "phone" : {
    "type" : "intl",
    "number" : "+1 734 303 4456"
   },
   "email" : {
     "hide" : "yes"
   } 
}'''

# Parse the JSON text; a JSON object becomes a Python dictionary.
info = json.loads(data)
print(info)

# Values are read with ordinary dictionary indexing; nested objects are
# nested dictionaries.
name = info["name"]
print('Name:', name)
email_settings = info["email"]
print('Hide:', email_settings["hide"])

In [None]:
# Bare expression: display the parsed dictionary.
info

In [None]:
# Two user records, mirroring the <user> elements of the earlier XML example
# (ids 001/009, x values 2/7, names Chuck/Brent).
input_json = '''[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  },
  { "id" : "009",
    "x" : "7",
    "name" : "Brent"
  }
]'''

In [None]:
# A JSON array parses to a plain Python list ...
info = json.loads(input_json)
user_count = len(info)
print('User count:', user_count)
print('Type', type(info))
# ... whose elements are dictionaries.
print('Type of elements', type(info[0]))

In [None]:
# json.loads returns a Python list here, so it can be walked with a plain for loop.
info = json.loads(input_json)
print('User count:', len(info))
for item in info:
    # Each record is a dictionary; print its fields in a fixed order.
    for label, key in (('Name', 'name'), ('Id', 'id'), ('Attribute', 'x')):
        print(label, item[key])