In [1]:
import xml

In [2]:
import xml.etree.ElementTree as ET

In [3]:
# Path of the XML article; handed to get_root() in the call at the end of this cell.
article_file = "exampleResearchArticle.xml"


def get_root(fname):
    """Parse the XML document at ``fname`` and return its root element."""
    return ET.parse(fname).getroot()


def get_authors(root):
    """Collect author details from an article's XML tree.

    Args:
        root: Root element of the article; authors are looked up at
            ``./fm/bibl/aug/au`` relative to it.

    Returns:
        A list with one dict per ``au`` element, in document order. Each dict
        has the keys ``fnm``, ``snm`` and ``email`` (text of the matching
        child element, or ``None`` when that child is absent) and ``insr``
        (list of the ``iid`` attribute of every ``insr`` child).
    """
    def _child_text(parent, path):
        # find() returns None for a missing child; guard explicitly so an
        # incomplete author entry yields None instead of an AttributeError.
        node = parent.find(path)
        return node.text if node is not None else None

    authors = []
    for author in root.findall('./fm/bibl/aug/au'):
        authors.append({
            "fnm": _child_text(author, './fnm'),
            "snm": _child_text(author, './snm'),
            "email": _child_text(author, './email'),
            "insr": [element.get('iid') for element in author.findall('./insr')],
        })

    return authors

get_authors(get_root(article_file))

[{'email': 'omer@extremegate.com',
  'fnm': 'Omer',
  'insr': ['I1'],
  'snm': 'Mei-Dan'},
 {'email': 'mcarmont@hotmail.com',
  'fnm': 'Mike',
  'insr': ['I2'],
  'snm': 'Carmont'},
 {'email': 'laver17@gmail.com',
  'fnm': 'Lior',
  'insr': ['I3', 'I4'],
  'snm': 'Laver'},
 {'email': 'nyska@internet-zahav.net',
  'fnm': 'Meir',
  'insr': ['I3'],
  'snm': 'Nyska'},
 {'email': 'kammarh@gmail.com',
  'fnm': 'Hagay',
  'insr': ['I8'],
  'snm': 'Kammar'},
 {'email': 'gideon.mann.md@gmail.com',
  'fnm': 'Gideon',
  'insr': ['I3', 'I5'],
  'snm': 'Mann'},
 {'email': 'barns.nz@gmail.com',
  'fnm': 'Barnaby',
  'insr': ['I6'],
  'snm': 'Clarck'},
 {'email': 'eukots@gmail.com', 'fnm': 'Eugene', 'insr': ['I7'], 'snm': 'Kots'}]

In [4]:
from bs4 import BeautifulSoup

In [5]:
page_source = 'page_source.html'

# Parse inside a context manager so the file handle is closed as soon as
# BeautifulSoup has read the page.
with open(page_source) as page_file:
    soup = BeautifulSoup(page_file, 'lxml')

In [6]:
# Start with empty placeholders for the two fields read from the parsed page.
data = dict.fromkeys(("eventvalidation", "viewstate"), "")

# Each lookup locates an element by its id and stores that element's
# `value` attribute under the matching key.
event_validation = soup.find(id="__EVENTVALIDATION")
data["eventvalidation"] = event_validation.get("value")

view_state = soup.find(id="__VIEWSTATE")
data["viewstate"] = view_state.get("value")


In [7]:
# Saved HTML page that get_carriers() and get_airports() are called with below.
html_page = 'options.html'

In [50]:
def get_carriers(page):
    """Return the two-character carrier codes listed in an HTML page.

    Args:
        page: Path to an HTML file containing an element with id
            ``CarrierList``.

    Returns:
        The ``value`` attributes of the ``option`` tags under that element,
        in document order, keeping only values exactly two characters long.
    """
    # The context manager closes the file handle once parsing is done.
    with open(page) as page_file:
        soup = BeautifulSoup(page_file, 'lxml')

    carrier_list = soup.find(id='CarrierList')

    # .get() with a default lets an <option> without a value attribute be
    # filtered out by the length check instead of raising KeyError.
    values = (option.get('value', '') for option in carrier_list.find_all('option'))
    return [value for value in values if len(value) == 2]

get_carriers(html_page)

['FL',
 'AS',
 'AA',
 'MQ',
 '5Y',
 'DL',
 'EV',
 'F9',
 'HA',
 'B6',
 'OO',
 'WN',
 'NK',
 'US',
 'UA',
 'VX']

In [56]:
def get_airports(page):
    """Return the three-character airport codes listed in an HTML page.

    Args:
        page: Path to an HTML file containing an element with id
            ``AirportList``.

    Returns:
        The ``value`` attributes of the ``option`` tags under that element,
        in document order, keeping only values exactly three characters long
        and excluding the literal ``'All'`` entry.
    """
    # The context manager closes the file handle once parsing is done.
    with open(page) as page_file:
        soup = BeautifulSoup(page_file, 'lxml')

    airport_list = soup.find(id='AirportList')

    # .get() with a default lets an <option> without a value attribute be
    # filtered out by the length check instead of raising KeyError.
    values = (option.get('value', '') for option in airport_list.find_all('option'))

    # 'All' is also three characters long, so it has to be excluded by name.
    return [value for value in values if len(value) == 3 and value != 'All']

get_airports(html_page)

['ATL',
 'BWI',
 'BOS',
 'CLT',
 'MDW',
 'ORD',
 'DFW',
 'DEN',
 'DTW',
 'FLL',
 'IAH',
 'LAS',
 'LAX',
 'ABR',
 'ABI']

In [104]:
# Sample saved page; the name is assumed to follow "<courier>-<airport>.html".
html_file = 'FL-ATL.html'
# Same fixed-position slices that process_file() applies to its own argument.
courier = html_file[0:2]
airport = html_file[3:6]

# NOTE(review): this empty list is rebound to the process_file(html_file)
# result at the end of this cell, so the initial value is never read here.
data = []
def process_file(file):
    """Extract the monthly flight counts from a saved data page.

    Args:
        file: Path to an HTML file whose base name starts with a
            two-character courier code, one separator character and a
            three-character airport code (e.g. ``FL-ATL.html``).

    Returns:
        A list of dicts, one per ``tr`` row with class ``dataTDRight`` whose
        second cell is not ``TOTAL``. Each dict has the keys ``courier``,
        ``airport``, ``year``, ``month`` and ``flights``, where ``flights``
        is ``{'domestic': int, 'international': int}``. Cell values are
        converted to ``int`` after removing comma separators.

    Raises:
        ValueError: If a cell in a kept row is not an integer once commas
            are removed.
        IndexError: If a row has fewer cells than the columns read from it.
    """
    import os  # function-scope import keeps this definition self-contained

    # Slice the codes from the base name so that a directory prefix in
    # `file` does not shift the fixed character positions.
    base_name = os.path.basename(file)
    courier = base_name[0:2]
    airport = base_name[3:6]

    # The context manager closes the file handle once parsing is done.
    with open(file) as page_file:
        soup = BeautifulSoup(page_file, 'lxml')

    data = []
    for row in soup.find_all('tr', class_='dataTDRight'):
        cells = row.find_all('td')

        # Totals rows are not monthly data; strip() tolerates padding
        # whitespace around the marker text.
        if cells[1].text.strip() == 'TOTAL':
            continue

        # Remove comma separators before converting, e.g. "815,489" -> 815489.
        table_row = [int(cell.text.replace(',', '')) for cell in cells]

        # A fresh dict per row, so no state is shared between rows.
        data.append({
            'courier': courier,
            'airport': airport,
            'year': table_row[0],
            'month': table_row[1],
            'flights': {
                'domestic': table_row[2],
                'international': table_row[3],
            },
        })

    return data

data = process_file(html_file)

In [105]:
data

[{'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 815489, 'international': 92565},
  'month': 10,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 766775, 'international': 91342},
  'month': 11,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 782175, 'international': 96881},
  'month': 12,
  'year': 2002},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 785651, 'international': 98053},
  'month': 1,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 690750, 'international': 85965},
  'month': 2,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 797634, 'international': 97929},
  'month': 3,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 766639, 'international': 89398},
  'month': 4,
  'year': 2003},
 {'airport': 'ATL',
  'courier': 'FL',
  'flights': {'domestic': 789857, 'international': 8767