In [2]:
# import sqlite3 as sq
# import pandas as pd
import xml.etree.cElementTree as ET
import re
import codecs
import csv
from pprint import pprint
from collections import defaultdict
import codecs
# import cerberus

In [3]:
# global constants and regular expressions

# --- input and output OSM files ---
SAMPLE_FILE = "sample.osm"
OSMFILE = "sample_for_submission.osm"
TAGGED_NODES_FILE = "tagged_nodes.osm"
UNTAGGED_NODES_FILE = "untagged_nodes.osm"
CULLED_FILE = "culled_elements.osm"
KEY_REVIEW_FILE = "key_to_review.osm"

# --- tag keys that identify what kind of feature a way / node is ---
KNOWN_WAY_TYPES = [
    "highway", "power", "waterway", "railway", "boundary", "aeroway", "barrier",
    "man_made", "natural", "leisure", "building", "amenity",
    "landuse", "historic", "tourism", "shop", "place", "aerialway",
]
KNOWN_NODE_TYPES = [
    "building", "shop", "amenity", "power", "highway", "place", "natural",
    "landuse", "leisure", "waterway", "barrier", "railway", "man_made",
    "tourism", "aeroway", "historic", "traffic_calming", "FIXME", "fixme",
    "disused", "disused:amenity", "created_by", "note",
]

# --- street-name auditing ---
EXPECTED_STREET_TYPES = [
    "Street", "Avenue", "Road", "Boulevard", "Circle", "Court", "Place",
    "Lane", "Drive", "Trail", "Loop", "Way", "Parkway", "Highway", "Alley",
    "Terrace", "Freeway", "Path", "Close", "Square", "Heights",
    "Crossing", "Hill", "Bridge", "Point", "Ridge", "Causeway", "Ravine",
    "Creek", "Run", "Vista", "View",
]
# KNOWN_STREET_ABBR and STREET_ABBR_EQUIV are parallel lists: the entry at
# index i of the second is the full word for the abbreviation at index i of
# the first, so the two must stay the same length and in the same order.
KNOWN_STREET_ABBR = [
    "St", "St.", "Rd", "Rd.", "rd", "rd.", "Cir", "Cir.", "Ct", "Ct.", "Crt", "Dr", "Dr.",
    "Pkwy", "Pkwy.", "Pl", "Pl.", "L", "Ln.", "Ln", "Ave.", "Ave",
]
STREET_ABBR_EQUIV = [
    "Street", "Street", "Road", "Road", "Road", "Road", "Circle", "Circle",
    "Court", "Court", "Court", "Drive", "Drive", "Parkway", "Parkway", "Place", "Place",
    "Lane", "Lane", "Lane", "Avenue", "Avenue",
]
# highway=* values that are not treated as roads by the audit functions
HIGHWAYS_NOT_ROADS = [
    "access", "path", "trail", "footway",
    "cycleway", "rest_area", "motorway_link", "trunk_link", "primary_link",
    "secondary_link", "tertiary_link", "living_street", "proposed", "pedestrian",
    "construction", "steps", "escape", "raceway", "track", "road",
]
DIRECTION_WORDS = ["North", "South", "East", "West"]

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']
RELATION_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
RELATION_MEMBERS_FIELDS = ['id', 'member_id', 'role', 'type', 'position']
RELATION_TAGS_FIELDS = ['id', 'key', 'value', 'type']
USER_FIELDS = ['uid', 'user']

# Status codes returned by parse_road_name()
ST_NOT_PARSED, ST_GOOD, ST_FIRST, ST_ONE_WORD, ST_UNKNOWN = 0, 1, 2, 3, 4

# Compiled regular expressions shared by the audit and cleaning code
re_addr = re.compile(r'^addr:')
re_probchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')
re_lowercolon = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
re_highway = re.compile(r'^highway')
re_street_type = re.compile(r'\b\S+\.?$', re.IGNORECASE)
re_tiger = re.compile(r'^tiger:')
re_stfirst = re.compile(r'Camino|Calle|Vista|Via|Piazza|Avenue|Rue')
re_last2or1 = re.compile(r'\b([a-z0-9_-]+\.?)\s+([a-z0-9_-]+\.?)$|[a-z0-9_-]+\.?$',re.IGNORECASE)
re_inter = re.compile(r'[@&]')
re_postcode = re.compile(r'\d\d\d\d\d-\d\d\d\d$|\d\d\d\d\d$')

In [4]:
# Tag keys that mark an element as a physical map feature.
PHYSICAL_FEATURE_TAGS = [
    'shop', 'amenity', 'waterway', 'power', 'aerialway', 'man_made',
    'boundary', 'highway', 'barrier', 'leisure', 'historic', 'tourism',
    'building', 'traffic_calming', 'natural', 'aeroway', 'place',
    'railway', 'landuse', 'emergency', 'office', 'manhole', 'city',
]
# Other keys accepted as the primary tag of a node (see count_unknown_nodes).
OTHER_PRIMARY_TAGS = ['created_by', 'note', 'FIXME', 'fixme', 'disused', 'disused:amenity']

# List the physical feature keys alphabetically, one per line.
alphabetical_tags = list(PHYSICAL_FEATURE_TAGS)
alphabetical_tags.sort()
for tag in alphabetical_tags:
    print(tag)

aerialway
aeroway
amenity
barrier
boundary
building
city
emergency
highway
historic
landuse
leisure
man_made
manhole
natural
office
place
power
railway
shop
tourism
traffic_calming
waterway


All of the above appear in the top-level table of contents of the OSM primary map features article, except traffic_calming.  The article on key:traffic_calming indicates this may be added to a node on a highway, so one could check that all nodes with this key are in fact part of a way tagged as highway.

In [5]:
# The schema for the way, nodes, and relation data files.
# It is assembled from small factory helpers; every helper returns a fresh
# dict, so no rule object is shared between two places in SCHEMA.

def _rule(type_name, coerce=None):
    """Return a 'required' field rule of the given type, optionally coerced."""
    rule = {'required': True, 'type': type_name}
    if coerce is not None:
        rule['coerce'] = coerce
    return rule


def _integer():
    """Required integer field, coerced with int()."""
    return _rule('integer', int)


def _float():
    """Required float field, coerced with float()."""
    return _rule('float', float)


def _string():
    """Required string field."""
    return _rule('string')


def _record(fields):
    """Schema for a single dict with the given field rules."""
    return {'type': 'dict', 'schema': fields}


def _record_list(fields):
    """Schema for a list of dicts, each with the given field rules."""
    return {'type': 'list', 'schema': _record(fields)}


def _primitive_fields():
    """Attribute rules shared by the way and relation records."""
    return {
        'id': _integer(),
        'user': _string(),
        'uid': _integer(),
        'version': _string(),
        'changeset': _integer(),
        'timestamp': _string(),
    }


def _node_fields():
    """Attribute rules of a node: the shared ones plus lat / lon."""
    fields = _primitive_fields()
    fields['lat'] = _float()
    fields['lon'] = _float()
    return fields


def _tag_fields():
    """Field rules of one row of a *_tags table."""
    return {
        'id': _integer(),
        'key': _string(),
        'value': _string(),
        'type': _string(),
    }


SCHEMA = {
    'node': _record(_node_fields()),
    'node_tags': _record_list(_tag_fields()),
    'way': _record(_primitive_fields()),
    'way_nodes': _record_list({
        'id': _integer(),
        'node_id': _integer(),
        'position': _integer(),
    }),
    'way_tags': _record_list(_tag_fields()),
    'relation': _record(_primitive_fields()),
    'relation_members': _record_list({
        'id': _integer(),
        'member_id': _integer(),
        'role': _string(),
        'type': _string(),
        'position': _integer(),
    }),
    'relation_tags': _record_list(_tag_fields()),
}


In [6]:
# Helper functions

def get_element(osm_file, tags=('way',)):
    """Lazily yield every fully parsed element whose tag name is in `tags`.

    The file is streamed with ET.iterparse; after the consumer is done with a
    yielded element the document root is cleared, so elements that have
    already been processed are dropped from the tree.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The very first event carries the document root; keep a handle on it.
    root = next(events)[1]
    for action, node in events:
        if action != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()

def get_tags(elem):
    """Return a dict of the "k" -> "v" attributes of every <tag> found by
    elem.iter("tag"); when a key occurs more than once the last value wins."""
    return dict((tag.get("k"), tag.get("v")) for tag in elem.iter("tag"))

def is_circular(el):
    """Return whether a way element is circular, i.e. its first and last
    <nd> references are the same.

    A way with no <nd> elements at all is reported as not circular (the
    previous version raised IndexError on such an element).
    """
    refs = [nd.get("ref") for nd in el.iter("nd")]
    if not refs:
        return False
    return refs[0] == refs[-1]

def has_tag_elem(el):
    """Return True when el.iter("tag") finds at least one <tag> element."""
    # next() with a default stops at the first <tag>, so no dictionary of all
    # tags has to be built just to test for emptiness.
    return next(el.iter("tag"), None) is not None


def has_addr_tag(el):
    """Return True when at least one tag key of `el` matches re_addr
    (i.e. starts with "addr:")."""
    return any(re_addr.match(tag_key) for tag_key in get_tags(el))


def is_known_way(el):
    """Return True when any <tag> of `el` has a key listed in KNOWN_WAY_TYPES."""
    return any(tag.get("k") in KNOWN_WAY_TYPES for tag in el.iter("tag"))



def is_roadway_no_name(elem):
    """Return True when `elem` carries a highway tag whose value is not in
    HIGHWAYS_NOT_ROADS and has neither a "name" nor a "ref" tag."""
    is_highway = False
    is_labelled = False
    for tag in elem.iter("tag"):
        k, v = tag.get("k"), tag.get("v")
        is_highway = is_highway or (k == "highway" and v not in HIGHWAYS_NOT_ROADS)
        is_labelled = is_labelled or k in ("name", "ref")
    return is_highway and not is_labelled

def is_tiger_not_reviewed(elem):
    """Return True when `elem` has the tag tiger:reviewed=no."""
    # A missing key yields None from .get(), which never equals "no".
    return get_tags(elem).get("tiger:reviewed") == "no"

def is_tiger_data(elem):
    """Return True when any tag key of `elem` matches re_tiger
    (i.e. starts with "tiger:")."""
    return any(re_tiger.match(tag_key) for tag_key in get_tags(elem))

def is_named_roadway(elem):
    """Return (name, flag) for `elem`.

    name is the value of the "name" tag, or of the "ref" tag when there is
    no "name", or "" when there is neither. flag is True only when the
    element has such a label and also a highway tag whose value is not in
    HIGHWAYS_NOT_ROADS.
    """
    tags = get_tags(elem)
    is_road_type = "highway" in tags and tags["highway"] not in HIGHWAYS_NOT_ROADS
    # "name" takes precedence over "ref".
    for label_key in ("name", "ref"):
        if label_key in tags:
            return tags[label_key], is_road_type
    return "", False



def parse_road_name(name):
    """Extract the street-type portion of a road name.

    Returns (status, street_type) where status is one of:
      ST_FIRST      - name starts with a type matched by re_stfirst
      ST_ONE_WORD   - only a single trailing word matched; the word is returned
      ST_GOOD       - type is in EXPECTED_STREET_TYPES or KNOWN_STREET_ABBR
      ST_UNKNOWN    - a candidate type was found but is in neither list
      ST_NOT_PARSED - re_last2or1 did not match; street_type is ""
    """
    ## first check to see if the street type comes first
    leading = re_stfirst.match(name)
    if leading:
        return ST_FIRST, leading.group()

    trailing = re_last2or1.search(name)
    if not trailing:
        return ST_NOT_PARSED, ""

    if None in trailing.groups():
        ## only one word matched, probably not a good street type; return the
        ## matched text to show the regular expression did find something
        return ST_ONE_WORD, trailing.group()

    next_to_last, last = trailing.groups()
    ## a trailing direction word means the type is the word before it
    candidate = next_to_last if last in DIRECTION_WORDS else last
    if candidate in EXPECTED_STREET_TYPES or candidate in KNOWN_STREET_ABBR:
        return ST_GOOD, candidate
    return ST_UNKNOWN, candidate
            
                
def is_road(el):
    """Return True when `el` has a highway tag whose value is not listed in
    HIGHWAYS_NOT_ROADS."""
    tags = get_tags(el)
    if "highway" not in tags:
        return False
    return tags["highway"] not in HIGHWAYS_NOT_ROADS

def dup_tag_key(el):
    """Return True if the element has more than one <tag> with the same key."""
    seen = set()  # set membership is O(1); the old list made this quadratic
    for tag in el.iter("tag"):
        key = tag.get("k")
        if key in seen:
            # the answer is known at the first repeat; no need to keep scanning
            return True
        seen.add(key)
    return False

def default_process(el):
    """Accept every element: always returns True regardless of `el`.

    Used as the default `process_function` of OSMSampleFile.
    """
    return True

def OSMSampleFile(output_file=SAMPLE_FILE, input_file=OSMFILE,
                  get_tags=('way', 'node'), process_function=default_process, k=1):
    """Write a sample of the elements of `input_file` to `output_file`.

    Elements whose tag name is in `get_tags` are read with get_element();
    those for which `process_function(element)` is true are counted, and
    every k-th counted element is written inside an <osm> wrapper.

    Note: inside this function the parameter `get_tags` hides the
    module-level get_tags() helper.

    Returns the number of elements written.
    """
    accepted = 0
    written = 0
    with open(output_file, 'wb') as output:
        output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        output.write('<osm>\n  ')
        for element in get_element(input_file, tags=get_tags):
            if not process_function(element):
                continue
            accepted += 1
            if accepted % k:
                # not a multiple of k: skip this one
                continue
            output.write(ET.tostring(element, encoding='utf-8'))
            written += 1
        output.write('</osm>')
    return written

In [7]:
# Start with some basic statistics
# count number of way, node, and relation elements
element_counts = defaultdict(int)
for elem in get_element(OSMFILE, tags=('way', 'node', 'relation')):
    element_counts[elem.tag] += 1

node_count = element_counts["node"]
way_count = element_counts["way"]
relation_count = element_counts["relation"]
print("%s nodes." % node_count)
print("%s ways." % way_count)
print("%s relations." % relation_count)
    

622875 nodes.
42629 ways.
174 relations.


In [7]:
# how many nodes are without additional tags, presumably only used for defining ways
node_only = 0
for element in get_element(OSMFILE, tags=('node',)):
    if has_tag_elem(element):
        continue
    node_only += 1
print("%s nodes have no child tags." % node_only)

610354 nodes have no child tags.


In [14]:
## Write an output file for ways with no associated "tag" tags, that is they are
## just a series of nodes ("nd" tags.)

def count_ways_no_tags(element):
    """Filter for OSMSampleFile: True for elements without any <tag> child."""
    return not has_tag_elem(element)
    
ways_no_tags = OSMSampleFile(output_file=SAMPLE_FILE, input_file=OSMFILE, get_tags=('way',),
                             process_function=count_ways_no_tags, k=1)

print("%s ways have no associated 'tag' tags." % ways_no_tags)
## A number of these were examined using the openstreetmap.org ?way=<way id> functionality.
## They appear to be parts of boundaries of different entities: parks, districts, etc. They should
## be part of a relation, and can be audited once the data is in the SQL database.

731 ways have no associated 'tag' tags.


In [16]:
## This code used iteratively to find the various way types, adding to the KNOWN_WAY_TYPES
## global constant after reviewing the output file.  As the file gets smaller, reduce the 
## last parameter indicating fraction to write, finally writing all to the sample file.

def count_unknown_ways(element):
    """Filter for OSMSampleFile: True for elements that have at least one tag
    but none whose key is in PHYSICAL_FEATURE_TAGS."""
    tag_keys = get_tags(element)
    if not tag_keys:
        return False
    return not any(key in PHYSICAL_FEATURE_TAGS for key in tag_keys)
    
write_ct = OSMSampleFile(output_file="Ways_No_Physical_Feature.osm", input_file=OSMFILE,
                         get_tags=('way',), process_function=count_unknown_ways, k=1)

print("%s ways of unknown type written to sample file." % write_ct)
## Various additional tags need to be added to these, do from SQL side.

72 ways of unknown type written to sample file.


In [17]:
## Similar process for tagged nodes.  
def count_unknown_nodes(element):
    """Filter for OSMSampleFile: True for elements that have at least one tag
    but none whose key is in KNOWN_NODE_TYPES or OTHER_PRIMARY_TAGS."""
    tag_keys = get_tags(element)
    if not tag_keys:
        return False
    return not any(key in KNOWN_NODE_TYPES or key in OTHER_PRIMARY_TAGS
                   for key in tag_keys)

# The output name now carries the ".osm" extension, like every other audit
# file this notebook writes (e.g. "Ways_No_Physical_Feature.osm").
NODES_NO_FEATURE_FILE = "Nodes_No_Physical_Feature.osm"
write_ct = OSMSampleFile(NODES_NO_FEATURE_FILE, OSMFILE, ('node',), count_unknown_nodes, 1)
print("%s nodes of unknown type written to sample file." % write_ct)
## Again - many need additional tag added.  Do from SQL side.

382 nodes of unknown type written to sample file.


In [8]:
# how many ways are open (start and end with different node) and how many closed?
closed_ways = open_ways = 0
for elem in get_element(OSMFILE, tags=('way',)):
    if not is_circular(elem):
        open_ways += 1
        continue
    closed_ways += 1
print("In this dataset there are %s closed ways and %s open ways." % (closed_ways, open_ways))

In this dataset there are 17077 closed ways and 25552 open ways.


In [19]:
## list and count the highway=* values in the dataset
highway_values = defaultdict(int)
for element in get_element(OSMFILE, tags=('way',)):
    tags = get_tags(element)
    if "highway" not in tags:
        continue
    highway_values[tags["highway"]] += 1
# every highway-tagged way was counted exactly once above
total_highways = sum(highway_values.values())

print("highway=* values in the dataset:")
for value in sorted(highway_values):
    print("\t%s : %s" % (value, highway_values[value]))
print("Total highway-tagged ways: %s" % total_highways)

highway=* values in the dataset:
	construction : 2
	cycleway : 252
	escape : 1
	footway : 405
	living_street : 14
	motorway : 210
	motorway_link : 285
	path : 719
	pedestrian : 8
	primary : 128
	primary_link : 22
	proposed : 7
	raceway : 1
	residential : 15374
	rest_area : 2
	road : 32
	secondary : 377
	secondary_link : 28
	service : 7712
	steps : 9
	tertiary : 411
	tertiary_link : 16
	track : 747
	trail : 1
	trunk : 85
	trunk_link : 11
	unclassified : 288
Total highway-tagged ways: 27147


Comparing this list to the key:highway documentation in the OSM wiki, it appears that the following are highway types which could, or should be named roads: motorway, trunk, primary, secondary, tertiary, unclassified, residential, road, and service.  Ways tagged highway=track could also be named roads.  

In [20]:
## Get all of the highway= type tags which could, or should, be roads
## and count them and how many have names.
highway_types = defaultdict(int)
named_hw_types = defaultdict(int)
i = 0
named_ct = 0
for element in get_element(OSMFILE, tags=('way',)):
    tags = get_tags(element)
    if "highway" not in tags or tags["highway"] in HIGHWAYS_NOT_ROADS:
        continue
    road_kind = tags["highway"]
    highway_types[road_kind] += 1
    # a "ref" tag counts as a name when there is no "name" tag
    if "name" in tags or "ref" in tags:
        named_hw_types[road_kind] += 1

for ht in sorted(highway_types):
    print("%s : %s named: %s" % (ht, highway_types[ht], named_hw_types[ht]))
    

motorway : 210 named: 210
primary : 128 named: 126
residential : 15374 named: 12912
secondary : 377 named: 373
service : 7712 named: 428
tertiary : 411 named: 391
trunk : 85 named: 83
unclassified : 288 named: 265


Leaving aside the 2,000+ unnamed residential highways, what's going on with the primary, secondary, tertiary, and unclassified roads with no name?  Pull them and write them out to a file for inspection.

In [21]:
## Write out the way elements, tagged has highway=primary,secondary,tertiary,unclassified and
## have no name or ref tag
def unnamed_big_roads(el):
    """Filter for OSMSampleFile: True for elements tagged
    highway=primary/secondary/tertiary/unclassified that have neither a
    "name" nor a "ref" tag."""
    tags = get_tags(el)
    return ("highway" in tags
            and tags['highway'] in ['primary', 'secondary', 'tertiary', 'unclassified']
            and "name" not in tags
            and "ref" not in tags)

write_ct = OSMSampleFile(output_file="Big_Roads_No_Name.osm", input_file=OSMFILE,
                         get_tags=('way',), process_function=unnamed_big_roads, k=1)
print("%s ways tagged highway=<big road> have no name." % write_ct)

49 ways tagged highway=<big road> have no name.


Looking at a few of these, it looks like they are mistakes (using highway=residential to tag what looks like a driveway), or refer to bits of named roads which might better be tagged as some type of
link.  Or highway=tertiary for roads in shopping centers where highway=service would be more accurate.  It might be possible to organize them according to user, look at them visually on the map, and correct them in the SQL database.

In [22]:
## Look at a sample of the highway=residential elements to see if there is pattern to why these
## don't have names
def unnamed_residential_roads(el):
    """Filter for OSMSampleFile: True for elements tagged highway=residential
    that have no "name" tag (a "ref" tag is not considered here)."""
    tags = get_tags(el)
    return tags.get("highway") == "residential" and "name" not in tags
            
# k=100: keep one out of every hundred matching ways as a sample
write_ct = OSMSampleFile(output_file=SAMPLE_FILE, input_file=OSMFILE, get_tags=('way',),
                         process_function=unnamed_residential_roads, k=100)
print("%s elements written to SAMPLE_FILE" % write_ct)

24 elements written to SAMPLE_FILE


Many of these are from the TIGER upload and have no name info.  Exclude them from auditing and look again.

In [23]:
def unnamed_not_TIGER_roads(el):
    """Filter for OSMSampleFile: True for elements tagged highway=residential
    that have no "name" tag and no tag key matching re_tiger."""
    tags = get_tags(el)
    if tags.get("highway") != "residential" or "name" in tags:
        return False
    return not any(re_tiger.match(key) for key in tags)
            
# The report now names the file actually written; the old message claimed
# SAMPLE_FILE although the output goes to Unnamed_Not_Tiger_Roads.osm.
UNNAMED_NOT_TIGER_FILE = "Unnamed_Not_Tiger_Roads.osm"
write_ct = OSMSampleFile(UNNAMED_NOT_TIGER_FILE, OSMFILE, ('way',),
                         unnamed_not_TIGER_roads, 1)
print("%s elements written to %s" % (write_ct, UNNAMED_NOT_TIGER_FILE))

303 elements written to SAMPLE_FILE


So from visual review of the above, it looks like many of these are, indeed, small residential streets which do have names when viewed on Google Maps.  Processing them further is an opportunity to improve the database by getting the names from an external reference, using the lat-lon data of the way nodes.

In [24]:
## Before actually auditing and cleaning street names, examine keys to see if any have the
## problem characters as defined in the problem set in the lessons for this project.
## Every element with at least one such key is written to probchar_key_elements.osm.
prob_char_keys = set()
with open("probchar_key_elements.osm", 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')
    for elem in get_element(OSMFILE, tags=('way', 'node', 'relation')):
        all_keys = (t.get("k") for t in elem.iter("tag"))
        bad_keys = [k for k in all_keys if re_probchars.search(k) is not None]
        if bad_keys:
            prob_char_keys.update(bad_keys)
            output.write(ET.tostring(elem, encoding='utf-8'))
    output.write('</osm>')
print(prob_char_keys)
## and it turns out there are a few.  need to write them out to a file and examine
## All part of WST!!!

set(['Segment #', 'Elevation, maximum', 'Elevation, minimum', 'Elevation, end', 'Elevation, start'])


In [7]:
## On to auditing the names of the "highway" tagged ways.  Name should come from the "name" or
## "ref" tag.  Look at last word and compare to expected street types.
## Make a dictionary with the unrecognized street types and the full name.
street_types = defaultdict(set)
st_first = defaultdict(int)
st_first_ct = 0
st_known = 0
ways_not_roads = 0
st_oneword = 0
with open("One_Word_Highway_Names.osm", 'wb') as one_word_names, \
        open("Unknown_Street_Types.osm", 'wb') as unknown_st:
    # both review files get the same XML prolog and <osm> wrapper
    for review_file in (one_word_names, unknown_st):
        review_file.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        review_file.write('<osm>\n  ')

    for element in get_element(OSMFILE, tags=('way',)):
        name, roadway = is_named_roadway(element)
        if not roadway:
            ways_not_roads += 1
            continue

        pstat, st = parse_road_name(name)
        if pstat == ST_GOOD:
            ## street type in known types or abbreviations, count it
            st_known += 1
        elif pstat == ST_FIRST:
            ## the street type comes first in the name, count it
            st_known += 1
            st_first[st] += 1
            st_first_ct += 1
        elif pstat == ST_UNKNOWN:
            ## found what should be a street type but not in known types or abbreviations
            unknown_st.write(ET.tostring(element, encoding='utf-8'))
            street_types[st].add(name)
        elif pstat == ST_ONE_WORD:
            ## seems to have only one word in street name, write out the full element
            st_oneword += 1
            one_word_names.write(ET.tostring(element, encoding='utf-8'))
        elif pstat == ST_NOT_PARSED:
            print("OOPS, ST_NOT_PARSED %s" % name)
        else:
            print("OOPS, SOMETHING ELSE %s" % name)

    for review_file in (one_word_names, unknown_st):
        review_file.write('</osm>')

print("Number of entries in unknown street_types dictionary: %s" % len(street_types))
print("Found %s roads with recognized street type or abbreviation." % st_known)
print("Found %s roads with only one word in the name." % st_oneword)
print("Found %s roads where the street type is the first word in the name." % st_first_ct)

for st in sorted(street_types):
    print("%s : %s" % (st, street_types[st]))


OOPS, ST_NOT_PARSED Shadow Ridge (driveway)
Number of entries in unknown street_types dictionary: 105
Found 14293 roads with recognized street type or abbreviation.
Found 81 roads with only one word in the name.
Found 83 roads where the street type is the first word in the name.
1071 : set(['County Road 1071'])
1733 : set(['Road 1733'])
174 : set(['SR 174', 'CA 174'])
20 : set(['CA 20'])
2233 : set(['County Road 2233'])
24 : set(['Forest Route 24'])
364 : set(['County Road 364'])
4431 : set(['Dixon 4431'])
4471 : set(['Bresna 4471'])
49 : set(['CA 49', 'I 80;CA 49'])
50 : set(['US 50'])
5781 : set(['Brown 5781'])
80 : set(['I 80'])
A : set(['Unnamed Road A'])
Acres : set(['Hasti Acres', 'Adams Acres'])
Allie : set(['Ellies Allie'])
Arnold : set(['T Arnold'])
Away : set(['Model Away'])
B : set(['Cul de Sac B'])
Barbaree : set(['High Barbaree'])
Bend : set(['Skaith Bend', 'Dogwood Bend'])
Bosque : set(['El Bosque'])
Bottom : set(['Copper Bottom'])
Bypass : set(['Chiquita Bypass'])
Cables

No additional abbreviations or obvious street types to add.  Some of these may have street type left off of the name in this database.  Will leave street type/abbreviation list as is.

In [15]:
## Now to actually processing the file.  First deal with the way elements.  The schema and
## parsing rules from the practice exercise are used.  Highways which are roads have the name
## field cleaned by substituting the appropriate full-word version of the street type when there
## is an abbreviation. ("Rd." -> "Road", etc.).  If present, the zipcode found in addr:postcode is
## checked and cleaned if needed.
## This dictionary gives the abbreviation:full word pairings to use; the two source lists
## are paired up position by position.
str_abbr_mapping = {
    abbreviation: full_word
    for abbreviation, full_word in zip(KNOWN_STREET_ABBR, STREET_ABBR_EQUIV)
}

def clean_street_name(name):
    """Return `name` with an abbreviated street type replaced by its full word.

    The street type is looked for in the last word of the name, or in the
    next-to-last word when the last word is in DIRECTION_WORDS. The name is
    returned unchanged when it starts with a type matched by re_stfirst, when
    only one word is found, or when the candidate is not a key of
    str_abbr_mapping.
    """
    # names such as "Calle ..." carry the street type first: nothing to expand
    if re_stfirst.match(name):
        return name

    m = re_last2or1.search(name)
    if not m:
        # this shouldn't happen - no match in regex search
        print("Problem in clean_street_name function: %s" % name)
        return name

    if None in m.groups():
        ## only found one word, probably no street type in this name
        return name

    ## with 2 words, the street type is next to last if a direction word is last
    type_group = 1 if m.group(2) in DIRECTION_WORDS else 2
    st = m.group(type_group)
    if st not in str_abbr_mapping:
        ## street type not an abbreviation, so name can be returned unchanged
        return name

    ## splice the full word in place of the abbreviation
    start, end = m.span(type_group)
    return name[:start] + str_abbr_mapping[st] + name[end:]

def clean_zipcode(zipcode):
    """Return a cleaned zip code.

    A value matching re_postcode (5 digits or 5+4 digits) is returned as is.
    Otherwise the first run of five digits found anywhere in the string is
    returned; when there is none the value is returned unchanged. Each
    attempted correction is reported with print.
    """
    if re_postcode.match(zipcode):
        return zipcode

    print("Trying to correct zipcode: %s" % zipcode)
    five_digits = re.search(r'\d\d\d\d\d', zipcode)
    if five_digits is None:
        ## even if it's bad, return unchanged if we can't fix it
        return zipcode
    print("Changing to: %s" % five_digits.group(0))
    return five_digits.group(0)
    
    

def _shape_tags(element, element_id, default_tag_type, clean_addr=False, clean_name=False):
    """Build the list of tag rows for the <tag> elements found in `element`.

    Tags whose key contains a character matched by re_probchars are skipped.
    A key matching re_lowercolon is split at its first colon into
    (type, key); any other key is stored whole with `default_tag_type`.

    clean_addr -- clean addr:street with clean_street_name() and
                  addr:postcode with clean_zipcode()
    clean_name -- also clean the value of the "name" tag as a street name
    """
    rows = []
    for tag in element.iter("tag"):
        key = tag.get("k")
        # search(), not match(): a problem character anywhere in the key
        # disqualifies it, not only one in the first position.
        if re_probchars.search(key):
            continue
        value = tag.get("v")
        if (clean_name and key == "name") or (clean_addr and key == "addr:street"):
            value = clean_street_name(value)
        elif clean_addr and key == "addr:postcode":
            value = clean_zipcode(value)
        if re_lowercolon.match(key):
            tag_type, tag_key = key.split(':', 1)
        else:
            tag_type, tag_key = default_tag_type, key
        rows.append({'id': element_id, 'key': tag_key, 'value': value, 'type': tag_type})
    return rows


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  relation_attr_fields = RELATION_FIELDS, default_tag_type='regular'):
    """Clean and shape a node, way or relation XML element to a Python dict.

    Returns, depending on element.tag:
      node     -> {'node': attribs, 'node_tags': [...]}
      way      -> {'way': attribs, 'way_nodes': [...], 'way_tags': [...]}
      relation -> {'relation': attribs, 'relation_members': [...],
                   'relation_tags': [...]}
    and None for any other element.

    Node and way tags get addr:street / addr:postcode cleaned; the "name" of
    a way is cleaned as a street name only when is_road() says the way is a
    road. Relation tag values are stored as they are. `default_tag_type` is
    the type given to tags whose key has no "type:" prefix.
    """
    element_id = element.get('id')

    if element.tag == 'way':
        way_attribs = dict((field, element.get(field)) for field in way_attr_fields)
        # One row per <nd>, numbered from 0 in document order.
        way_nodes = [
            {'id': element_id, 'node_id': nd.get("ref"), 'position': position}
            for position, nd in enumerate(element.iter("nd"))
        ]
        tags = _shape_tags(element, element_id, default_tag_type,
                           clean_addr=True, clean_name=is_road(element))
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    if element.tag == 'node':
        node_attribs = dict((field, element.get(field)) for field in node_attr_fields)
        tags = _shape_tags(element, element_id, default_tag_type, clean_addr=True)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'relation':
        relation_attribs = dict((field, element.get(field)) for field in relation_attr_fields)
        # One row per <member>, numbered from 0 in document order.
        relation_members = [
            {'id': element_id, 'member_id': member.get("ref"), 'role': member.get("role"),
             'type': member.get("type"), 'position': position}
            for position, member in enumerate(element.iter("member"))
        ]
        tags = _shape_tags(element, element_id, default_tag_type)
        return {'relation': relation_attribs, 'relation_members': relation_members,
                'relation_tags': tags}

    # Not a node, way or relation: nothing to shape.
    return None

In [16]:
# CSV output files, one per table: three tables each for nodes, ways and relations.
NODES_PATH = 'nodes.csv'
NODE_TAGS_PATH = 'node_tags.csv'

WAYS_PATH = 'ways.csv'
WAY_NODES_PATH = 'way_nodes.csv'
WAY_TAGS_PATH = 'way_tags.csv'

RELATIONS_PATH = 'relations.csv'
RELATION_MEMBERS_PATH = 'relation_members.csv'
RELATION_TAGS_PATH = 'relation_tags.csv'


def validate_element(element, validator, schema=SCHEMA):
    """Raise ValidationError if element does not match schema.

    Args:
        element: the shaped element to check (process_map passes the value
            returned by shape_element).
        validator: object exposing ``validate(element, schema)`` and an
            ``errors`` mapping; process_map passes a ``cerberus.Validator``.
        schema: schema handed to ``validator.validate``; defaults to the
            module-level SCHEMA.

    Raises:
        cerberus.ValidationError: when ``validator.validate`` does not return
            True. The message covers only the first entry yielded by
            ``validator.errors``.
    """
    if validator.validate(element, schema) is True:
        return

    # A message may be a byte string or (on Python 2) a unicode string.  Both
    # must be treated as a single message: joining a text value with ", "
    # would split it into individual characters.
    try:
        text_types = (basestring,)
    except NameError:  # Python 3 has no basestring; str is the only text type
        text_types = (str,)

    field, errors = next(iter(validator.errors.items()))
    message_string = "\nElement of type '{0}' has the following errors:\n{1}"
    error_strings = (
        "{0}: {1}".format(k, v if isinstance(v, text_types) else ", ".join(v))
        for k, v in errors.items()
    )
    raise cerberus.ValidationError(
        message_string.format(field, "\n".join(error_strings))
    )

class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    On Python 2 every ``unicode`` value in a row is encoded to UTF-8 bytes
    before it is handed to ``csv.DictWriter``.  On interpreters without a
    ``unicode`` builtin (Python 3) rows are passed through unchanged, so the
    class can be used on either version.
    """

    # ``unicode`` only exists on Python 2; None means "nothing needs encoding".
    try:
        _text_to_encode = unicode
    except NameError:
        _text_to_encode = None

    def writerow(self, row):
        """Write one row (a dict keyed by field name), encoding text values if needed."""
        text_type = self._text_to_encode
        if text_type is None:
            encoded = dict(row)
        else:
            encoded = {
                k: (v.encode('utf-8') if isinstance(v, text_type) else v)
                for k, v in row.items()
            }
        return super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write each row through writerow() so every row gets the same encoding step."""
        for row in rows:
            self.writerow(row)


def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s).

    Opens the eight CSV output paths for writing, writes a header row to each,
    then walks the 'node', 'way' and 'relation' elements produced by
    ``get_element(file_in, ...)``.  Each element is converted with
    ``shape_element``; elements for which it returns a falsy value are skipped.

    Args:
        file_in: passed straight to ``get_element`` as the input to read.
        validate: validation runs only when this is exactly ``True``; each
            shaped element is then checked with ``validate_element``, which
            raises on the first element that does not match.  This mode
            requires ``cerberus`` to be available as a module-level name.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
         codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, \
         codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file, \
         codecs.open(RELATIONS_PATH,'w') as relations_file, \
         codecs.open(RELATION_MEMBERS_PATH,'w') as relation_members_file, \
         codecs.open(RELATION_TAGS_PATH,'w') as relation_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)
        relations_writer = UnicodeDictWriter(relations_file, RELATION_FIELDS)
        relation_members_writer = UnicodeDictWriter(relation_members_file, RELATION_MEMBERS_FIELDS)
        relation_tags_writer = UnicodeDictWriter(relation_tags_file, RELATION_TAGS_FIELDS)

        # Every output file starts with its header row.
        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer, relations_writer,
                       relation_members_writer, relation_tags_writer):
            writer.writeheader()

        # Build the validator only when it will be used, so a run with
        # validation switched off does not need cerberus at all.
        validator = cerberus.Validator() if validate is True else None

        for element in get_element(file_in, tags=('node', 'way','relation')):
            el = shape_element(element)
            if not el:
                continue

            if validate is True:
                validate_element(el, validator)

            if element.tag == 'node':
                nodes_writer.writerow(el['node'])
                node_tags_writer.writerows(el['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(el['way'])
                way_nodes_writer.writerows(el['way_nodes'])
                way_tags_writer.writerows(el['way_tags'])
            elif element.tag == 'relation':
                relations_writer.writerow(el['relation'])
                relation_members_writer.writerows(el['relation_members'])
                relation_tags_writer.writerows(el['relation_tags'])



In [15]:
## Create a sample file for processing to test the code:
k = 20
i = 0
ct = 0
with open("Test_File.osm", 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for element in get_element(OSMFILE,tags=('node','way')):
        i += 1
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))
            ct += 1

    output.write('</osm>')
print ct, "elements written to test file."

33275 elements written to test file.


In [28]:
## Try the process_map function on the test file.
## NOTE(review): the second argument is False, so validation is NOT run by this call,
## although this cell was originally described as running validation with the test --
## confirm which was intended.
process_map("Test_File.osm", False)
print "done"

done


The above code ran without raising exceptions or printing out any error statements.  The small CSV files can be opened in Excel and checked against the elements in the Test_File.osm to verify that the conversion code is running correctly.
The next cell runs the process_map function on the entire gold country OSM file.  Validation is skipped for that run because it takes a very long time on a file of that size.

Reviewed all of the test...CSV files and compared to the Test_File.osm file.  All of the element types seem to be converted correctly.

In [17]:
## Run the process_map function on OSMFILE with validation switched off (second argument False).
## NOTE(review): the constants cell sets OSMFILE to "sample_for_submission.osm"; confirm it
## points at the large OSM file before relying on this run.
process_map(OSMFILE, False)
print "done"

Trying to correct zipcode: CA 95650 ‎
Changing to: 95650
Problem in clean_street_name function: Penryn Rd Int Nw Of I-80 On Frontage Road (Boyington Road)
Problem in clean_street_name function: Shadow Ridge (driveway)
Trying to correct zipcode: CA 95667
Changing to: 95667
Trying to correct zipcode: CA 95602
Changing to: 95602
done


In [30]:
## The schema used in the lessons for this project is denormalized in that the user name is
## repeated in each row of the ways, nodes, and relations tables.  The database could be
## normalized by eliminating the user column from these tables and creating a users table 
## with the uid->user correspondence.  This code creates the necessary CSV file for doing that.

USERS_PATH = "users.csv"
uids = set()
user_table = []
for element in get_element(OSMFILE, tags=('way','node','relation')):
    uid = element.get('uid')
    if uid not in uids:
        user_row = {}
        uids.add(uid)
        user_row['uid'] = uid
        user_row['user'] = element.get('user')
        user_table.append(user_row)

print len(user_table)        

with codecs.open(USERS_PATH, 'w') as users_file:
    user_writer = UnicodeDictWriter(users_file, USER_FIELDS)
    user_writer.writeheader()
    for row in user_table:
        user_writer.writerow(row)
    



458


In [16]:
# Create a sample file to submit with project documentation

write_ct = OSMSampleFile(output_file = "Sample_For_Submission.osm", \
                        get_tags = ('way','node','relation'), k=50)

print write_ct, "elements written to sample file for project submission."

13313 elements written to sample file for project submission.
