## Reading 'http://httpbin.org/forms/post' using lxml.html

In [24]:
from lxml import html
from urllib.request import urlopen #loading URL
# Address of the sample HTML form page that is fetched and parsed.
url = 'http://httpbin.org/forms/post'

#### Read HTML from URL and parse

In [98]:
# Fetch the page and hand the open HTTP response to lxml for parsing.
# The context manager closes the response as soon as parsing is done,
# instead of leaving the connection open until garbage collection.
with urlopen(url) as response:
    tree = html.parse(response)
type(tree)

lxml.etree._ElementTree

In [97]:
root = tree.getroot() # getroot() hands back the document's root node, i.e. the <html> element
type(root)

lxml.html.HtmlElement

#### Iterating over tree elements

In [95]:
# Walk every node of the parsed document. Nodes that have children are
# labelled as parents; leaf nodes are printed with their tag and text.
# Comment nodes are visited as well, and their .tag is lxml's Comment
# factory function rather than a string.
for element in tree.iter():
    is_leaf = len(element) == 0  # len() of an element is its number of children
    if is_leaf:
        print(f"--{element.tag} : {element.text}")
        continue
    print(f"{element.tag} :parent")

html :parent
--head : 
  
body :parent
--<cyfunction Comment at 0x00000198C3C3B370> :  Example form from HTML5 spec http://www.w3.org/TR/html5/forms.html#writing-a-form's-user-interface 
form :parent
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
fieldset :parent
--legend :  Pizza Size 
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
fieldset :parent
--legend :  Pizza Toppings 
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--input : None
p :parent
label :parent
--textarea : None
p :parent
--button : Submit order


In [96]:
# Visit every node below (and including) the root and show its tag together
# with its own leading text; text is None when the node has no leading text.
node_lines = (f"{node.tag} - {node.text}" for node in root.iter())
for node_line in node_lines:
    print(node_line)

html - 
  
head - 
  
body - 
  
<cyfunction Comment at 0x00000198C3C3B370> -  Example form from HTML5 spec http://www.w3.org/TR/html5/forms.html#writing-a-form's-user-interface 
form - 
   
p - None
label - Customer name: 
input - None
p - None
label - Telephone: 
input - None
p - None
label - E-mail address: 
input - None
fieldset - 
    
legend -  Pizza Size 
p - None
label -  
input - None
p - None
label -  
input - None
p - None
label -  
input - None
fieldset - 
    
legend -  Pizza Toppings 
p - None
label -  
input - None
p - None
label -  
input - None
p - None
label -  
input - None
p - None
label -  
input - None
p - None
label - Preferred delivery time: 
input - None
p - None
label - Delivery instructions: 
textarea - None
p - None
button - Submit order


#### Iterating over selected nodes in the tree

In [11]:
# Giving iter() tag names limits the walk to elements with those tags,
# here <p> and <button>.
wanted_tags = ('p', 'button')
for element in tree.iter(*wanted_tags):
    print(f"{element.tag} - {element.text}")

p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
p - None
button - Submit order


#### Finding HTML Elements using find()

In [104]:
# find() returns only the first element matching the path: the first <p> anywhere under the root.
tagP = root.find('.//p')                               # a single element, not a list
# text_content() gathers the text of the element together with that of its descendants.
tagP.text_content()

'Customer name: '

In [105]:
# ElementPath treats a trailing "/" as an implicit "/*", so the wildcard is
# written out here to make it clear that the match is an element that is a
# child of a <p>, not the <p> itself. findtext() returns the .text of the
# first such match.
tagP1 = root.findtext('.//p/*')
print(tagP1)

Customer name: 


#### Iterating through the child elements of every <<u>p</u>>

In [33]:
# ElementPath treats a trailing "/" as an implicit "/*"; the wildcard is
# spelled out so the path visibly selects the child elements of each <p>
# (the labels and the submit button), not the <p> elements themselves.
for pTag in root.findall('.//p/*'):
    print(pTag.text_content())

Customer name: 
Telephone: 
E-mail address: 
  Small 
  Medium 
  Large 
  Bacon 
  Extra Cheese 
  Onion 
  Mushroom 
Preferred delivery time: 
Delivery instructions: 
Submit order


#### Using xpath and cssselect (translates CSS selectors to XPath)

In [115]:
# Three XPath queries against the document:
#   - the value attribute of every <input> nested as p/label/input,
#   - the text nodes of each <legend>,
#   - the stripped text of the <p> elements that are direct children of <form>.
input_values = root.xpath('//p/label/input/@value')
legend_texts = root.xpath('//legend/text()')
form_paragraph_texts = [
    paragraph.text_content().strip() for paragraph in root.xpath('//form/p')
]

print(input_values)
print(legend_texts)
print(form_paragraph_texts)

['small', 'medium', 'large', 'bacon', 'cheese', 'onion', 'mushroom']
[' Pizza Size ', ' Pizza Toppings ']
['Customer name:', 'Telephone:', 'E-mail address:', 'Preferred delivery time:', 'Delivery instructions:', 'Submit order']


In [116]:
# The same three lookups expressed as CSS selectors:
#   - value attribute of <input> elements that carry one, inside p > label,
#   - text of each <legend>,
#   - stripped text of the <p> elements that are direct children of <form>.
valued_inputs = root.cssselect('p label input[value]')
legends = root.cssselect('legend')
form_paragraphs = root.cssselect('form > p')

print([field.get('value') for field in valued_inputs])
print([legend.text_content() for legend in legends])
print([paragraph.text_content().strip() for paragraph in form_paragraphs])

['small', 'medium', 'large', 'bacon', 'cheese', 'onion', 'mushroom']
[' Pizza Size ', ' Pizza Toppings ']
['Customer name:', 'Telephone:', 'E-mail address:', 'Preferred delivery time:', 'Delivery instructions:', 'Submit order']


#### Exploring <<u>form</u>> Elements

In [107]:
print(root.forms)                  # .forms lists the <form> elements of the document; this page has exactly one

[<Element form at 0x198c4732670>]


In [110]:
# Raw attributes of the page's only form: first as (name, value) pairs,
# then as attribute names alone.
first_form = root.forms[0]
print(first_form.items())
print(first_form.keys())

[('method', 'post'), ('action', '/post')]
['method', 'action']


In [112]:
# .method and .action are form-specific helpers. Unlike the raw attributes,
# the method comes back upper-cased and the action is resolved into an
# absolute URL.
order_form = root.forms[0]
print(order_form.method)
print(order_form.action)

POST
http://httpbin.org/post
