In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import xml.dom.minidom
import pprint
import codecs
import json

### Enter filename to be used

In [27]:
#use sample file name that is built from get_sample.py
#xml_fname = 'sample.osm'
#use full file
xml_fname = 'portland_oregon.osm'

### RegEx Expressions Used

In [3]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\.\t\r\n]')

### Analysis of File Size for Portland OSM File

In [4]:
ls -l portland_oregon.osm

 Volume in drive C is Windows
 Volume Serial Number is D257-5CE4

 Directory of C:\Users\phooper\Documents\Udacity\P3\Project


 Directory of C:\Users\phooper\Documents\Udacity\P3\Project

04/11/2017  12:59 PM     1,586,145,881 portland_oregon.osm
               1 File(s)  1,586,145,881 bytes
               0 Dir(s)  402,746,929,152 bytes free


Here we can see that the file is approx 1.6 GB.

### Analysis of Contents within Sample File Only

#### Tags

In [5]:
def count_tags(filename):
    tag_dict = defaultdict(int)
    for event, elem in ET.iterparse(filename):
        tag_dict[elem.tag] += 1
    return tag_dict

In [6]:
tags = count_tags(xml_fname)
pprint.pprint(tags)

defaultdict(<type 'int'>, {'node': 3307, 'nd': 3852, 'member': 21, 'tag': 2479, 'relation': 3, 'way': 428, 'osm': 1})


Here we can see we're dealing with 4 main XML tags within the Portland OSM file.  Below is the basic structure of the file.
```<osm>
    <node>
        <tag>
    <way>
        <nd>
        <tag>```

#### Attributes

In [7]:
node_attrs = defaultdict(int)
tag_attrs = defaultdict(int)
way_attrs = defaultdict(int)
nd_attrs = defaultdict(int)

for _, element in ET.iterparse(xml_fname):
    if element.tag == "node":
         for attrName, _ in element.attrib.items():
                node_attrs[attrName] += 1
    elif element.tag == "tag":
         for attrName, _ in element.attrib.items():
                tag_attrs[attrName] += 1
    elif element.tag == "way":
         for attrName, _ in element.attrib.items():
                way_attrs[attrName] += 1
    elif element.tag == "nd":
         for attrName, _ in element.attrib.items():
                nd_attrs[attrName] += 1

attribute_dict = {
                "node": node_attrs,
                "tag": tag_attrs,
                "way": way_attrs,
                "nd": nd_attrs}
pprint.pprint(attribute_dict)

{'nd': defaultdict(<type 'int'>, {'ref': 3852}),
 'node': defaultdict(<type 'int'>, {'changeset': 3307, 'uid': 3307, 'timestamp': 3307, 'lon': 3307, 'version': 3307, 'user': 3307, 'lat': 3307, 'id': 3307}),
 'tag': defaultdict(<type 'int'>, {'k': 2479, 'v': 2479}),
 'way': defaultdict(<type 'int'>, {'changeset': 428, 'uid': 428, 'timestamp': 428, 'version': 428, 'user': 428, 'id': 428})}


### Quick Analysis of the Values Within the Attributes

In [8]:
def first_attribs(node, count):
    attrib_values = defaultdict(set)

    for _, element in ET.iterparse(xml_fname):
        if element.tag == node:
             for attrName, attrValue in element.attrib.items():
                    if len(attrib_values[attrName]) <= count:
                        attrib_values[attrName].add(attrValue)
    return attrib_values

#### nd Tag

In [9]:
nd_attrib = first_attribs('nd', 10)
pprint.pprint(nd_attrib)

defaultdict(<type 'set'>, {'ref': set(['3417676306', '3417676305', '3417676304', '3417676303', '3417676301', '3417676300', '3417676299', '37352633', '37352632', '37352637', '37352635'])})


#### node Tag

In [10]:
nd_attrib = first_attribs('node', 10)
pprint.pprint(nd_attrib)

defaultdict(<type 'set'>, {'changeset': set(['3260298', '36909474', '28405770', '2834389', '20328163', '8800623', '27283836', '20294672', '7632877', '27253706', '27255937']), 'uid': set(['393906', '1961229', '2565557', '362111', '92286', '3125856', '147510', '371688', '331031', '1399823', '74295']), 'timestamp': set(['2009-11-30T22:19:34Z', '2014-01-31T05:33:36Z', '2014-12-05T07:33:41Z', '2016-01-31T03:33:10Z', '2014-12-06T06:49:18Z', '2011-03-21T23:25:58Z', '2011-07-22T19:25:37Z', '2009-10-13T11:19:10Z', '2015-01-25T21:36:50Z', '2014-02-02T04:13:10Z', '2014-12-05T02:45:47Z']), 'lon': set(['-123.469024', '-122.8675556', '-123.3766553', '-122.5820211', '-122.558425', '-123.451727', '-122.3180484', '-123.3817716', '-123.4427173', '-122.6677278', '-123.5111647']), 'version': set(['11', '10', '15', '1', '3', '2', '5', '4', '7', '6', '8']), 'user': set(['lukeb', 'Mele Sax-Barnett', 'Darrell_pdxbuildings', 'Grant Humphries', 'Ryan_Peterson', 'cowdog', 'betsy', 'baradam', 'StanB', 'Paul Johns

#### tag Tag

In [11]:
nd_attrib = first_attribs('tag',30)
pprint.pprint(nd_attrib)

defaultdict(<type 'set'>, {'k': set(['amenity', 'gnis:state_id', 'bicycle', 'gnis:county_id', 'addr:housenumber', 'barrier', 'source', 'man_made', 'addr:state', 'ele', 'note', 'religion', 'gnis:feature_id', 'addr:street', 'ref', 'highway', 'entrance', 'power', 'bus', 'phone', 'foot', 'gnis:created', 'addr:city', 'traffic_calming', 'natural', 'name', 'payment:coins', 'addr:postcode', 'public_transport', 'railway', 'landuse']), 'v': set(['Minnehaha Church of Christ', 'mine', '011', '53', 'yes', 'yahoo_wms', '11/25/2008', 'turning_circle', 'quarry', 'hump', '6011', 'christian', '067', '2474191', 'NAIP', '77', 'level_crossing', 'Record <a href="http://tin.er.usgs.gov/mineplant/show.php?labno=5801">5801</a> of the <a href="http://tin.er.usgs.gov/mineplant/">Active mines and mineral plants in the US</a>', 'tower', 'place_of_worship', '59', 'bing', 'Abundant Life Community Church', 'tree', 'traffic_signals', 'switch', '2508966', 'Estacada Pit', 'crossing', '07/16/2008', 'OR'])})


#### way Tag

In [12]:
nd_attrib = first_attribs('way',10)
pprint.pprint(nd_attrib)

defaultdict(<type 'set'>, {'changeset': set(['30457112', '29732045', '40357430', '40000274', '30825661', '41423749', '40075965', '40408716', '41273033', '276075', '12051109']), 'uid': set(['3735764', '7168', '2935247', '362111', '3125856', '1442206', '4048171', '81676', '4048366', '3656932', '1399823']), 'timestamp': set(['2016-06-13T19:23:26Z', '2015-03-25T16:58:00Z', '2016-06-29T04:32:38Z', '2015-05-05T22:33:43Z', '2012-06-28T21:23:04Z', '2007-08-31T16:43:25Z', '2015-04-24T15:57:33Z', '2016-06-16T19:48:40Z', '2016-08-05T20:34:15Z', '2016-07-01T05:53:32Z', '2016-08-13T06:59:58Z']), 'version': set(['10', '22', '16', '1', '3', '5', '4', '7', '6', '9', '8']), 'user': set(['Peter Dobratz', 'roadgeek99', 'Brett_Ham', 'Ryan_Peterson', 'cowdog', 'Mele Sax-Barnett', 'JeanieG', 'Meredith Rider', 'NeilLoehlein', 'cwilley106', 'DaveHansenTiger']), 'id': set(['5399751', '5279577', '5321422', '5303944', '5402878', '5386950', '5396458', '5288595', '5390264', '5297911', '5393256'])})


In [13]:
attrib_values = set()
for _, element in ET.iterparse(xml_fname):
    if element.tag == "tag":
        attrib_values.add(element.attrib['k'])
l = sorted(list(attrib_values))
print l

['CCGIS:bicycle', 'RLIS:bicycle', 'RLIS:systemname', 'access', 'addr:city', 'addr:housenumber', 'addr:postcode', 'addr:state', 'addr:street', 'addr:unit', 'amenity', 'barrier', 'bicycle', 'bridge', 'building', 'building:height', 'building:levels', 'bus', 'construction', 'covered', 'cuisine', 'cutting', 'cycleway', 'denomination', 'description', 'direction', 'ele', 'email', 'entrance', 'est_width', 'fax', 'fee', 'fixme', 'foot', 'footway', 'gnis:county_id', 'gnis:created', 'gnis:fcode', 'gnis:feature_id', 'gnis:ftype', 'gnis:id', 'gnis:state_id', 'height', 'heritage', 'heritage:operator', 'hgv', 'highway', 'horse', 'junction', 'landuse', 'lanes', 'lanes:forward', 'layer', 'leisure', 'level', 'man_made', 'maxspeed', 'maxspeed:advisory', 'motor_vehicle', 'name', 'name:ja', 'name_1', 'name_base', 'name_direction_prefix', 'name_type', 'natural', 'nhd:com_id', 'nhd:fdate', 'nhd:reach_code', 'note', 'office', 'old_name', 'oneway', 'oneway:bicycle', 'opening_hours', 'operator', 'payment:coins'

### Initial Structure Based On Analysis

```
{
    id: element.attr = "id", (NOT manually entered)
    type: element.tag, (NOT manually entered)
    time_stamp: element_attr = "timestamp", (NOT manually entered)
    user: node.attr = "user", (NOT manually entered)
    pos: [node.attr = "lat', node.attr = "lon"], (maybe manually entered?)
    name: tag.k = "name", (manually entered)
    node_refs: [nd.ref, nd.ref,...], (not manually entered)
    contact: 
        {
        phone: tag.k = "phone", (manually entered)
        website: tag.k = "website" (manually entered)
        }   
    address:
        {
        house_number: tag.k = "addr:housenumber", (manually entered)
        street_name: tag.k = "addr:street", (manually entered)
        city: tag.k = "addr:city", (manually entered)
        state: tag.k = "addr:state", (manually entered)
        postal_code: tag.k = "addr:postcode" (manually entered)
        }
 }

```

### Auditing Attributes for Cleanliness and Uniformity

In [14]:
EXPECTED_STREETS = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "View", "Way", "Broadway", "Circle", "Loop", "Highway", "Terrace", "Summit",
                   "Walk", "Greenway", "Landing", "North", "East", "Northeast"]

In [15]:
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")

In [16]:
def is_house_num(elem):
    return (elem.attrib['k'] == "addr:housenumber")

In [17]:
def is_city(elem):
    return (elem.attrib['k']) == "addr:city"

In [18]:
def is_state(elem):
    return (elem.attrib['k']) == "addr:state"

In [19]:
def is_post_code(elem):
    return (elem.attrib['k']) == "addr:postcode"

In [20]:
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in EXPECTED_STREETS:
            street_types[street_type].add(street_name)

In [21]:
def audit_house_num(house_nums, v):
    if problemchars.search(v):
        house_nums.add(v)

In [22]:
def audit_city(cities, v):
    if problemchars.search(v):
        cities.add(v)

In [23]:
def audit_state(states, v):
    if v != 'OR':
        states.add(v)

In [24]:
def audit_post_code(postal_codes, v):
    if len(v) != 5:
        postal_codes.add(v)

In [25]:
def audit():
    street_types = defaultdict(set)
    house_nums = set()
    states = set()
    cities = set()
    postal_codes = set()
    for event, elem in ET.iterparse(xml_fname, events=("start",)):
        if elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
                if is_house_num(tag):
                    audit_house_num(house_nums, tag.attrib['v'])
                if is_state(tag):
                    audit_state(states, tag.attrib['v'])
                if is_city(tag):
                    audit_city(cities, tag.attrib['v'])
                if is_post_code(tag):
                    audit_post_code(postal_codes, tag.attrib['v'])
                
    #print undefined street types
    if street_types:
        pprint.pprint(dict(street_types))
    else:
        print "No Undefined Street Types"
    #print Special Characters in house nums
    if house_nums:
        pprint.pprint(house_nums)
    else:
        print "No House Nums With Special Characters"
    #print states not equal to two characters in length
    if states:
        pprint.pprint(states)
    else:
        print "No States not equal to \"OR\" "
    #print Special Characters in Cities
    if cities:
        pprint.pprint(cities)
    else:
        print "No Cities With Special Characters"
    #print postal codes not equal to 5 characters in length
    if postal_codes:
        pprint.pprint(postal_codes)
    else:
        print "No Postal Codes not equal to five characters"
        

In [28]:
audit()

{'155th': set(['Southwest 155th']),
 '156th': set(['Southwest 156th']),
 '157th': set(['Southwest 157th']),
 '158th': set(['Southwest 158th']),
 '160th': set(['Southwest 160th']),
 '163rd': set(['Southwest 163rd']),
 '165th': set(['Southwest 165th']),
 '170': set(['South Highway 170']),
 '211': set(['Highway 211', 'South Highway 211', 'Southeast Highway 211']),
 '212': set(['SE Highway 212', 'Southeast Highway 212']),
 '213': set(['Highway 213', 'South Highway 213']),
 '224': set(['Northwest Highway 224',
             'South Highway 224',
             'Southeast Highway 224',
             'Southwest Highway 224']),
 '26': set(['Highway 26', 'Southeast Highway 26']),
 '41st': set(['41st']),
 '4616': set(['4616']),
 '47': set(['Northwest Highway 47',
            'Southwest Highway 47',
            'Southwest Old Highway 47']),
 '91st': set(['SW 91st']),
 '99': set(['Northeast Highway 99']),
 '99E': set(['Highway 99E', 'South Highway 99E']),
 '99W': set(['North Highway 99W',
             

In [None]:
STREET_MAPPING = {"Srive": "Drive",
                  "St": "Street",
                  "St.": "Street",
                  "st.": "Street",
                  "street": "Street",
                  "Rode": "Road",
                  "99e": "99E",
                  "Ave": "Avenue",
                  "Ave.": "Avenue",
                  "Blvd": "Boulevard",
                  "Blvd.": "Boulevard",
                  "Dr": "Drive",
                  "Hwy": "Highway",
                  "Ln": "Lane",
                  "Pkwy": "Parkway",
                  "Pky": "Parkway",
                  "Rd": "Road",
                  
                 }

### Writing to JSON file

In [None]:
TOP_LEVEL_ATTRIBS = ["id", "timestamp", "user"]
CONTACT_ATTRIBS = {
                    "phone": "phone",
                    "website": "website"
}
ADDRESS_ATTRIBS = {
                    "addr:housenumber": "house_number",
                    "addr:street": "street_name",
                    "addr:city": "city",
                    "addr:state": "state",
                    "addr:postcode": "postal_code"
}
ADDRESS_AUDIT = {
    "addr:housenumber": audit_street_num,
    "addr:street": audit_street_name,
    "addr:city": audit_city,
    "addr:state": audit_state,
    "addr:postcode": audit_post_code
}

In [None]:
def shape_element(element):
    node = {}
    pos = []
    address = {}
    node_refs = []
    contact = {}
    #find all top level tags that we have identified
    if element.tag == "node" or element.tag == "way" :
        node['type'] = element.tag     #store node type
        #store all desired attribute values that are not part of an array or in the tag attributes
        for k in TOP_LEVEL_ATTRIBS:
            node[k] = element.attrib[k]
        #store the position attribute as an array if it exists
        if 'lat' in element.attrib:
            lat = float(element.attrib['lat'])
            lon = float(element.attrib['lon'])
            pos = [lat, lon]
            node['pos'] = pos
        #find all "tag" elements under "node" and "way"
        for tag in element.iter("tag"):
            if tag.attrib['k'] in ADDRESS_ATTRIBS.keys():
                #audited_v = ADDRESS_AUDIT[tag.attrib['k']](tag.attrib['v'])
                if tag.attrib['k'] == "addr:street":
                    audit_street_name(tag.attrib['v'])
                address[ADDRESS_ATTRIBS[tag.attrib['k']]] = tag.attrib['v']            
            if tag.attrib['k'] in CONTACT_ATTRIBS.keys():
                contact[CONTACT_ATTRIBS[tag.attrib['k']]] = tag.attrib['v']
        for tag in element.iter("nd"):
            node_refs.append(tag.attrib['ref'])

        if node_refs:
            node['node_refs'] = node_refs
        if address:
            node['address'] = address
        if contact:
            node['contact'] = contact
        
        return node
        #return None
    else:
        return None

In [None]:
def process_map(file_in, pretty=False):
    file_out = "{0}.json".format(file_in)
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                if pretty:
                    fo.write(json.dumps(el, indent=2) + "\n")
                else:
                    #fo.write(json.dumps(el) + "\n")
                    #pprint.pprint(el)
                    None
    #return data
    return None

In [None]:
process_map(xml_fname)

### Extra

In [None]:
#osm_file = open("example.xml", "r")

In [None]:
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_types = defaultdict(int)

In [None]:
less sample.osm

In [None]:
def print_sorted_dict(d):
    keys = d.keys()
    keys = sorted(keys, key=lambda s: s.lower())
    for k in keys():
        v = d[k]
        print "%s: %d" % (k,v)

In [None]:

for event, elem in ET.iterparse(osm_file):
    if is_street_name(elem):
        audit_street_type(street_types, elem.attrib('v'))
print_sorted_dict(street_types)