# Lesson 7: Advanced Web Scraping and Data Gathering
## Topic 2: Read data from XML

In [1]:
import xml.etree.ElementTree as ET

### Exercise 11: Create some random data yourself to understand the XML data format better

In [2]:
# Hand-written sample record: a <person> element with <name>, <surname>,
# <phone> and <email> children; <phone> and <email> each carry one attribute.
# The text inside <phone> and <email> is padded with newlines/indentation.
data = '''
<person>
  <name>Dave</name>
  <surname>Piccardo</surname>
  <phone type="intl">
     +1 742 101 4456
   </phone>
   <email hide="yes">
   dave.p@gmail.com</email>
</person>'''

In [3]:
# Show the raw XML text exactly as it was typed, before any parsing
print(data)


<person>
  <name>Dave</name>
  <surname>Piccardo</surname>
  <phone type="intl">
     +1 742 101 4456
   </phone>
   <email hide="yes">
   dave.p@gmail.com</email>
</person>


In [4]:
# At this point `data` is still an ordinary Python string, not a parsed tree
type(data)

str

### Exercise 12: Read the string data as an XML `Element` object 

In [5]:
# Parse the XML text; fromstring() hands back the root Element directly
tree = ET.fromstring(data)

In [6]:
# Check what kind of object the parser returned
type(tree)

xml.etree.ElementTree.Element

### Exercise 13: Find various elements of data within the tree (element)

In [7]:
# Look up the <name> child of the root element and show its text content
name_node = tree.find('name')
print('Name:', name_node.text)

Name: Dave


In [8]:
# Look up the <surname> child of the root element and show its text content
surname_node = tree.find('surname')
print('Surname:', surname_node.text)

Surname: Piccardo


In [9]:
# The <phone> text is surrounded by newlines and indentation in the XML,
# so trim that whitespace before printing the number
phone_node = tree.find('phone')
phone_number = phone_node.text.strip()
print('Phone:', phone_number)

Phone: +1 742 101 4456


In [10]:
# Fetch the <email> element once, then show its `hide` attribute followed by
# its whitespace-trimmed text
email_node = tree.find('email')
print('Email hidden:', email_node.get('hide'))
print('Email:', email_node.text.strip())

Email hidden: yes
Email: dave.p@gmail.com


### Exercise 14: Read from a local XML file (perhaps downloaded) into an `ElementTree` object

In [11]:
# Parse an XML file from disk; the path is relative to the notebook's
# working directory. Unlike fromstring(), parse() returns an ElementTree.
tree2=ET.parse('xml1.xml')

In [12]:
# Check what kind of object parse() returned
type(tree2)

xml.etree.ElementTree.ElementTree

### Exercise 15: How to 'traverse' the tree? Find the `root` and explore all `child` nodes and their `attributes`

In [13]:
# Get the top-level Element of the parsed document; traversal starts here
root=tree2.getroot()

In [14]:
# Walk the direct children of the root, showing each one's tag name and
# its attribute dictionary
for node in root:
    print("Child tag:", node.tag, "| Child attribute:", node.attrib)

Child tag: country | Child attribute: {'name': 'Liechtenstein'}
Child tag: country | Child attribute: {'name': 'Singapore'}
Child tag: country | Child attribute: {'name': 'Panama'}


### Exercise 16: Use the `.text` attribute to extract meaningful data

In [15]:
# Elements index like lists: third sub-element of the root's first child
root[0][2]

<Element 'gdppc' at 0x0000012FB63AEA98>

In [16]:
# Text content of the third sub-element of the root's first child (a string)
root[0][2].text

'141100'

In [17]:
# Tag name of the third sub-element of the root's first child
root[0][2].tag

'gdppc'

In [18]:
# The root's first child element on its own
root[0]

<Element 'country' at 0x0000012FB63AE688>

In [19]:
# Tag name of the root's first child
root[0].tag

'country'

In [20]:
# Attributes of the root's first child, returned as a dict
root[0].attrib

{'name': 'Liechtenstein'}

### Exercise 17: Write a loop to extract and print the GDP per capita for each country

In [21]:
# Print each country's GDP per capita.
# The value is looked up by tag name instead of by child position, so a
# <country> whose children are reordered or extended cannot silently yield
# the wrong field.
for c in root:
    country_name = c.attrib['name']
    gdppc_text = c.findtext('gdppc')
    if gdppc_text is None:
        # This entry has no <gdppc> child: say so instead of crashing.
        print("{}: {}".format(country_name, 'n/a'))
        continue
    gdppc = int(gdppc_text)
    print("{}: {}".format(country_name, gdppc))

Liechtenstein: 141100
Singapore: 59900
Panama: 13600


### Exercise 18: Find all the neighboring countries for each country and print them
Note how to use `findall` and `attrib` together

In [22]:
# For every child of the root, list the `name` attribute of each of its
# <neighbor> sub-elements under a fixed header, followed by blank lines
for country in root:
    print("Neighbors\n" + "-" * 25)
    for neighbor in country.findall('neighbor'):
        print(neighbor.attrib['name'])
    print('\n')

Neighbors
-------------------------
Austria
Switzerland


Neighbors
-------------------------
Malaysia


Neighbors
-------------------------
Costa Rica
Colombia




### Exercise 19: A simple demo of using XML data obtained by web scraping

In [23]:
import urllib.request, urllib.parse, urllib.error

In [24]:
# Base endpoint of the recipe search service; the encoded query string is
# appended directly after the trailing '?'.
# NOTE(review): this is plain http -- confirm whether an https endpoint exists.
serviceurl = 'http://www.recipepuppy.com/api/?'

In [25]:
# Ask for a search term and open the first page of XML results for it.
# NOTE(review): the prompt advertises 'quit', but nothing in this cell treats
# that word specially -- it is sent to the service as an ordinary search term.
item = input('Enter the name of a food item (enter \'quit\' to quit): ')
# Encode every query parameter in one call so they are all escaped the same
# way; the resulting query string is q=<item>&p=1&format=xml.
url = serviceurl + urllib.parse.urlencode({'q': item, 'p': 1, 'format': 'xml'})
# A timeout keeps the cell from hanging indefinitely when the server never
# answers. The response is left open because a later cell reads from `uh`.
uh = urllib.request.urlopen(url, timeout=30)

Enter the name of a food item (enter 'quit' to quit): pizza


In [26]:
# Read the whole response body, then release the HTTP connection: the
# response object is a context manager and is closed on leaving the block.
with uh:
    # decode() with no argument interprets the bytes as UTF-8
    data = uh.read().decode()
print('Retrieved', len(data), 'characters')
# Parse the downloaded text; fromstring() returns the root Element
tree3 = ET.fromstring(data)

Retrieved 2881 characters


In [27]:
# Check what kind of object fromstring() returned for the downloaded XML
type(tree3)

xml.etree.ElementTree.Element

In [28]:
# Dump the text of every element in the tree, starting with the root;
# elements that only wrap other elements contribute whitespace-only text
for node in tree3.iter():
    print(node.text)





BBQ Chicken Pizza
http://www.recipezaar.com/BBQ-Chicken-Pizza-144689
chicken, brown sugar, cayenne, garlic salt, green pepper, honey, italian cheese blend, salad dressing, margarine, molasses, onions, barbecue sauce, black pepper, prepared pizza crust, provolone cheese, ranch dressing, salt


Basic Chicago-style Pizza Recipe
http://www.grouprecipes.com/65487/basic-chicago-style-pizza.html
pizza, vegetable oil, cornmeal, water, flour, sausage, provolone cheese, olive oil, tomato, yeast, pepperoni, salt, salt, sugar, basil, oregano


BBQ'd Cheeseburger Pizza
http://www.recipezaar.com/BBQd-Cheeseburger-Pizza-299376
barbecue sauce, cheddar cheese, onions, tomato, dill pickle, dill relish, parsley, french dressing, garlic powder, ground beef, lettuce, mayonnaise, mozzarella cheese, pizza dough, mustard


Healthy Italian Bread Sticks or Pizza Crust
http://www.recipezaar.com/Healthy-Italian-Bread-Sticks-or-Pizza-Crust-252423
brown sugar, garlic powder, italian seasoning, olive oil, onion 

In [29]:
# `data` now holds the XML text read from the HTTP response (the name was
# reused; it no longer refers to the hand-written <person> record)
print(data)

<?xml version="1.0"?>
<recipes>
<recipe>
<title>BBQ Chicken Pizza</title>
<href>http://www.recipezaar.com/BBQ-Chicken-Pizza-144689</href>
<ingredients>chicken, brown sugar, cayenne, garlic salt, green pepper, honey, italian cheese blend, salad dressing, margarine, molasses, onions, barbecue sauce, black pepper, prepared pizza crust, provolone cheese, ranch dressing, salt</ingredients>
</recipe>
<recipe>
<title>Basic Chicago-style Pizza Recipe</title>
<href>http://www.grouprecipes.com/65487/basic-chicago-style-pizza.html</href>
<ingredients>pizza, vegetable oil, cornmeal, water, flour, sausage, provolone cheese, olive oil, tomato, yeast, pepperoni, salt, salt, sugar, basil, oregano</ingredients>
</recipe>
<recipe>
<title>BBQ'd Cheeseburger Pizza</title>
<href>http://www.recipezaar.com/BBQd-Cheeseburger-Pizza-299376</href>
<ingredients>barbecue sauce, cheddar cheese, onions, tomato, dill pickle, dill relish, parsley, french dressing, garlic powder, ground beef, lettuce, mayonnaise, mozza

In [30]:
# Print the title and link of every recipe in the response.
# Only <recipe> elements are visited, since those are the nodes that hold
# <title> and <href> as direct children.
for recipe in tree3.iter('recipe'):
    link = recipe.find('href')
    title = recipe.find('title')
    # find() returns None when nothing matches. Compare with `is not None`
    # rather than truth-testing: an Element's truth value reflects whether
    # it has children, not whether it was found.
    if link is not None and title is not None:
        print("Recipe Link for:", title.text)
        print(link.text)
        print("-"*100)

Receipe Link for: BBQ Chicken Pizza
http://www.recipezaar.com/BBQ-Chicken-Pizza-144689
----------------------------------------------------------------------------------------------------
Receipe Link for: Basic Chicago-style Pizza Recipe
http://www.grouprecipes.com/65487/basic-chicago-style-pizza.html
----------------------------------------------------------------------------------------------------
Receipe Link for: BBQ'd Cheeseburger Pizza
http://www.recipezaar.com/BBQd-Cheeseburger-Pizza-299376
----------------------------------------------------------------------------------------------------
Receipe Link for: Healthy Italian Bread Sticks or Pizza Crust
http://www.recipezaar.com/Healthy-Italian-Bread-Sticks-or-Pizza-Crust-252423
----------------------------------------------------------------------------------------------------
Receipe Link for: Bacon Cheeseburger Pizza
http://www.recipezaar.com/Bacon-Cheeseburger-Pizza-205270
-----------------------------------------------------