In [14]:
import xml.etree.cElementTree as ET
import pprint
import sys
import re
from collections import defaultdict
import codecs
import json

# Path of the OSM XML file that the cells below parse.
OSM = "/Users/wilfriedhoge/Downloads/MUC.osm"

# Tag-key classifiers (used by key_type() and shape_element()):
#   lower        - only lowercase a-z and underscores (also matches the empty string)
#   lower_colon  - two such runs separated by exactly one colon
#   problemchars - the key contains at least one of the listed punctuation/whitespace characters
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Matches the trailing whitespace-free token of a string (the last "word" of a street name).
# The pattern contains no literal letters, so re.IGNORECASE has no visible effect here.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
# Last words that mark a street name as ordinary; audit_street_type() compares the
# final token of a street name against this list (exact, case-sensitive match).
# NOTE(review): u"Chause\xe8" looks like a misspelling - confirm against the data.
expected = [u'Stra\xdfe', "Allee", "Platz", "Weg", "Ring", u"Chause\xe8", u'Br\xfccke', "Berg", "Park",
           "Feld", "Graben", "Holz", "Siedlung", "Steig", "Weiher", "Bach", "Anger"]
# Suffixes that mark a street name as ordinary when its final token ends with one of
# them (case-sensitive). Each suffix is listed once.
expectedEnds = ["weg", "-Weg","-Platz", "platz", u'stra\xdfe', u'-Stra\xdfe', "promenade", "-Bogen", "-Ring",
               "gasse", "-Gasse", "allee", "-Allee", "bogen", "graben", "see", "feld", "leiten", "berg", "ring",
               "rain", "burg", "leite", "bach", "tal", "wiese", "garten", "anger", "acker", "thal", "hof", "wald",
               "-Anger", "ried", "dorf", "-Damm", "damm", "-Passage", u"m\xfchle", "gassl", "steig", u'-Br\xfccke',
               "park", "pfad", "wies", "viertel", "ufer", "haus", "weide", "kanal", "-Hof", "mark", "winkel",
               "zentrum", "hausen", "bauer", "blick", "hofen", "wall", "brunnen", "breite", "hain", "fleck",
               "passage", "schlag", "tor", "holz", "hort", "-Siedlung", "weiden", "stein", "kam", "grube", "grund"]


# City spellings that are accepted as-is by the city audit.
expectedCity = [u'M\xfcnchen']
# Known misspellings / variants of city names and the spelling they are replaced with.
# Every key appears exactly once (a repeated key in a dict literal is silently
# overwritten, which hides typos).
mappingCity = { "MU": u'M\xfcnchen',
               u'M\xfcchen': u'M\xfcnchen',
               "Munchen": u'M\xfcnchen',
               "Munich": u'M\xfcnchen',
               u'M\xfcnschen':u'M\xfcnchen',
               u'M\xdc': u'M\xfcnchen',
               u'M\xfcchnen': u'M\xfcnchen',
               u'M\xfcnchen\u200e': u'M\xfcnchen',
               u'Haar bei M\xfcnchen': 'Haar',
               u'Gr\xe4felfing bei M\xfcnchen':u'Gr\xe4felfing'
            }


In [4]:
def count_tags(filename):
    """Count how often each element tag occurs in an XML file.

    The file is streamed with ``ET.iterparse``; a ``*`` is written to stdout
    after every 300001 parsed elements as a progress marker, followed by a
    newline once parsing has finished.

    Args:
        filename: path (or file object) of the XML document to scan.

    Returns:
        dict mapping each tag name to the number of elements carrying it.
    """
    tags = {}
    i = 1
    for _, elem in ET.iterparse(filename):
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed; drop the element's attributes and
        # children so the parsed tree does not pile up in memory.
        elem.clear()
        if i > 300000:
            sys.stdout.write('*')
            i = 1
        else:
            i += 1

    sys.stdout.write('\n')
    return tags
        
# Run the tag census on the configured OSM file and show the resulting counts.
if True:
    tags = count_tags(OSM)
    print(tags)

******************
{'node': 1486269, 'member': 137545, 'remark': 1, 'nd': 1959807, 'tag': 1634060, 'note': 1, 'meta': 1, 'relation': 5421, 'way': 291987, 'osm': 1}


In [5]:
def key_type(element, keys, problematics, others):
    """Classify the ``k`` attribute of a ``<tag>`` element and update the tallies.

    Elements whose tag is not ``"tag"`` are ignored. For a ``<tag>`` element
    the key is tested against ``lower``, ``lower_colon`` and ``problemchars``
    in that order; the first pattern that matches decides the category.

    Args:
        element: the parsed XML element.
        keys: dict with the counters "lower", "lower_colon", "problemchars"
            and "other"; updated in place.
        problematics: set collecting the keys counted as "problemchars".
        others: set collecting the keys counted as "other".

    Returns:
        The (mutated) ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    for tag_elem in element.iter():
        key = tag_elem.attrib["k"]
        if lower.search(key):
            keys["lower"] += 1
        elif lower_colon.search(key):
            keys["lower_colon"] += 1
        elif problemchars.search(key):
            problematics.add(key)
            keys["problemchars"] += 1
        else:
            others.add(key)
            keys["other"] += 1

    return keys

def process_map(filename):
    """Tally the tag-key categories found in an XML file.

    Every parsed element is passed to ``key_type``. A ``*`` is written to
    stdout after every 300001 parsed elements, and a newline at the end.

    Args:
        filename: path (or file object) of the XML document to scan.

    Returns:
        Tuple ``(keys, problematics, others)``: the counter dict, the set of
        keys counted as "problemchars" and the set of keys counted as "other".
    """
    i = 1
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    problematics = set()
    others = set()
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys, problematics, others)
        # The element has been fully inspected; release its attributes and
        # children so the parsed tree does not accumulate in memory.
        element.clear()
        if i > 300000:
            sys.stdout.write('*')
            i = 1
        else:
            i += 1

    sys.stdout.write('\n')
    return keys, problematics, others

# Run the key census and print the counters followed by both key sets.
if True:
    keys, problematics, others = process_map(OSM)
    print("Key list:")
    pprint.pprint(keys)
    print("\nProblematic keys:")
    pprint.pprint(problematics)
    print("\nOther keys:")
    pprint.pprint(others)


******************
Key list:
{'lower': 802489, 'lower_colon': 821809, 'other': 9424, 'problemchars': 338}

Problematic keys:
set(['addr.source:housenumber',
     'contact.source:website',
     'footway:left.sloped_curb.end',
     'footway:left.sloped_curb.start',
     'footway:left.smoothness',
     'footway:left.surface',
     'footway:left.wheelchair',
     'footway:left.wheelchair.end',
     'footway:left.wheelchair.start',
     'footway:left.width',
     'footway:right.sloped_curb.end',
     'footway:right.sloped_curb.start',
     'footway:right.smoothness',
     'footway:right.surface',
     'footway:right.wheelchair',
     'footway:right.wheelchair.end',
     'footway:right.wheelchair.start',
     'footway:right.width',
     'step.condition',
     'step.height',
     'surface.material'])

Other keys:
set(['3dr:direction',
     '3dr:entrance',
     '3dr:h1',
     '3dr:type',
     'BLfD:criteria',
     'Bayernets',
     'FIXME',
     'Gewerbeeinheiten',
     'ISO3166-2',
     'Lade

In [8]:
def process_map(filename):
    """Collect the distinct ``user`` attribute values of nodes, ways and relations.

    A ``*`` is written to stdout after every 300001 parsed elements, and a
    newline once parsing has finished.

    Args:
        filename: path (or file object) of the XML document to scan.

    Returns:
        set of ``user`` values; contains ``None`` if a node/way/relation has
        no ``user`` attribute.
    """
    users = set()
    i = 1
    for _, element in ET.iterparse(filename):
        if element.tag in ["node", "way", "relation"]:
            users.add(element.get("user"))
        # Nothing else is read from the element; release its attributes and
        # children so the parsed tree does not accumulate in memory.
        element.clear()
        if i > 300000:
            sys.stdout.write('*')
            i = 1
        else:
            i += 1

    sys.stdout.write('\n')

    return users

# Gather the distinct contributor names from the OSM file and display them.
if True:
    users = process_map(OSM)
    pprint.pprint(users)

******************
set(['',
     '(maxmaxmax)',
     '0815andi',
     '1976',
     '2StepForward',
     '2bogen',
     '2weiblum',
     '3dMartin',
     '42429',
     '46&2',
     '4SeasonNL',
     '4rch',
     '708145',
     '715371',
     '7oak',
     '80798',
     '81825-MUC',
     '87takeiteasy',
     '8p1t1h',
     '<don>',
     'ABRob',
     'AFeixelmeier',
     'AadenrsSceehrr',
     'Achterin',
     'AdinaB',
     'Admin UFG',
     'AdrianDiemer',
     'AignerImmobilien',
     'AkiraYoshi',
     'AlbertKa',
     'Aleks-Berlin',
     'Alessio Ceroni',
     'Alexander Roalter',
     'Alexander Rohde',
     'AlexanderF',
     'AlexanderNass',
     'Almhias',
     'Alois74',
     'Alpin100',
     'Amili',
     'Amplico',
     'Anchises',
     'Anderl0815',
     'AndersAndersson',
     'AndiG88',
     'AndiO',
     'AndiV',
     'Andi_Voss',
     'Andradir',
     'Andre68',
     'Andre71139',
     'AndreR',
     'Andreas Binder',
     'AndreasDangel',
     'Andrey Korolyov',
     'A

In [6]:
def audit_street_type(street_types, street_name):
    """Record a street name whose last word is neither expected nor has an expected ending.

    Args:
        street_types: mapping (``defaultdict(set)``) from the unusual last
            word to the set of street names in which it was seen; updated in
            place.
        street_name: the street name to audit.

    Returns:
        None.
    """
    m = street_type_re.search(street_name)
    if not m:
        # No trailing word could be extracted (e.g. empty or whitespace-terminated
        # name). Treat the whole value as unusual instead of failing on an
        # unbound `street_type`.
        street_types[street_name].add(street_name)
        return

    # look for last word in street name
    street_type = m.group()
    if street_type in expected:
        return

    # look for ending in street name (plain, case-sensitive suffix comparison)
    if any(street_type.endswith(ends) for ends in expectedEnds):
        return

    # neither last word nor ending matches -> unusual street name
    street_types[street_type].add(street_name)

def is_street_name(elem):
    """Return True when the element's ``k`` attribute is exactly ``"addr:street"``."""
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Collect unusual street names from the ``addr:street`` tags of nodes and ways.

    The file is streamed on "end" events, so every node/way is inspected only
    once all of its ``<tag>`` children are attached. On "start" events the
    children are not guaranteed to have been parsed yet, so tags could be
    missed. A ``*`` is written to stdout after every 300001 parsed elements.

    Args:
        osmfile: path (or file object) of the OSM XML document.

    Returns:
        ``defaultdict(set)`` as filled by ``audit_street_type``.
    """
    i = 1
    street_types = defaultdict(set)
    for _, elem in ET.iterparse(osmfile, events=("end",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
            # The element and its tags are done; free them to limit memory use.
            elem.clear()

        if i > 300000:
            sys.stdout.write('*')
            i = 1
        else:
            i += 1

    return street_types

def update_name(name, mapping):
    """Replace the last word of a street name using ``mapping``.

    Args:
        name: the street name.
        mapping: dict from an unexpected last word to its replacement.

    Returns:
        The street name with its last word replaced when that word is not in
        ``expected`` and has an entry in ``mapping``; otherwise ``name``
        unchanged.
    """
    m = street_type_re.search(name)
    if m:
        street_type = m.group()
        # Rewrite only when a replacement is known; a last word that is absent
        # from `mapping` leaves the name untouched instead of raising KeyError.
        if street_type not in expected and street_type in mapping:
            name = street_type_re.sub(mapping[street_type], name)

    return name


# Audit the street names of the OSM file and show the findings as a plain dict.
if True:
    st_types = audit(OSM)
    pprint.pprint(dict(st_types.items()))



******************{'Angerlohe': set(['Untere Angerlohe']),
 u'Autobahn-Rastst\xe4tte-Obermenzing': set([u'Autobahn-Rastst\xe4tte-Obermenzing']),
 'Bienenkorb': set(['Am Bienenkorb']),
 'Birkenau': set(['Birkenau']),
 'Birkicht': set(['Am Birkicht']),
 'Blankstadl': set(['Am Blankstadl']),
 u'B\xe4chl': set([u'Am Hartmannshofer B\xe4chl']),
 'Campeon': set(['Am Campeon']),
 'Eck': set(['Altheimer Eck']),
 'Eduard-Stadler-Winkel': set(['Eduard-Stadler-Winkel']),
 u'Eiche': set([u'Zur Gr\xfcnen Eiche']),
 u'Eichgeh\xf6lz': set([u'Im Eichgeh\xf6lz']),
 'Einfang': set(['Am Einfang']),
 'Einheit': set(['Zur Deutschen Einheit']),
 u'Einla\xdf': set([u'Am Einla\xdf']),
 'Emmeram': set(['Sankt Emmeram']),
 'Eschbichl': set(['Am Eschbichl']),
 'Eulenhorst': set(['Am Eulenhorst']),
 'Falkenbeiz': set(['Auf der Falkenbeiz']),
 u'Feuerb\xe4chl': set([u'Am Feuerb\xe4chl']),
 'Forst': set(['Am Perlacher Forst']),
 u'Freiheit': set([u'M\xfcnchner Freiheit']),
 u'F\xe4hrtwegl': set([u'F\xe4hrtwegl']),


In [18]:
def audit_city(city_list, name):
    """Add ``name`` to ``city_list`` unless it is listed in ``expectedCity``.

    Args:
        city_list: set of unexpected city names; updated in place.
        name: the city name to check.

    Returns:
        None.
    """
    if name not in expectedCity:
        # city unexpected -> remember it
        city_list.add(name)
    return

def is_city(elem):
    """Return True when the element's ``k`` attribute is exactly ``"addr:city"``."""
    key = elem.attrib['k']
    return key == "addr:city"

def audit(osmfile):
    """Collect the unexpected ``addr:city`` values of nodes and ways.

    The file is streamed on "end" events, so every node/way is inspected only
    once all of its ``<tag>`` children are attached. On "start" events the
    children are not guaranteed to have been parsed yet, so tags could be
    missed. A ``*`` is written to stdout after every 300001 parsed elements.

    Args:
        osmfile: path (or file object) of the OSM XML document.

    Returns:
        set of city names as filled by ``audit_city``.
    """
    i = 1
    city_list = set()
    for _, elem in ET.iterparse(osmfile, events=("end",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_city(tag):
                    audit_city(city_list, tag.attrib['v'])
            # The element and its tags are done; free them to limit memory use.
            elem.clear()

        if i > 300000:
            sys.stdout.write('*')
            i = 1
        else:
            i += 1

    return city_list

def update_name(name):
    """Return the replacement for a city name that is not expected.

    A name listed in ``expectedCity`` is returned as-is. Any other name is
    looked up in ``mappingCity``; without an entry it is returned unchanged.
    """
    if name in expectedCity:
        return name
    return mappingCity.get(name, name)

# Collect the unexpected city names found in the OSM file and display them.
if True:
    city_list = audit(OSM)
    pprint.pprint(city_list)

# Show, for every audited city name, the name it would be replaced with.
if True:
    for name in city_list:
        better_name = update_name(name)
        print("%s => %s" % (name, better_name))



******************set(['Allach',
     'Aschheim',
     'Aschheim - Dornach',
     'Dornach',
     'Gauting',
     'Grasbrunn',
     u'Gr\xe4felfing',
     u'Gr\xe4felfing bei M\xfcnchen',
     u'Gr\xfcnwald',
     'Haar',
     u'Haar bei M\xfcnchen',
     'Haar-Gronsdorf',
     'Haar-Salmdorf',
     'Hohenbrunn',
     'Ingolstadt',
     'Ismaning',
     'Krailling',
     'Martinsried',
     'Munchen',
     'Munich',
     u'M\xdc',
     u'M\xfcchen',
     u'M\xfcchnen',
     u'M\xfcnchen - Oberf\xf6hring',
     u'M\xfcnchen - Trudering',
     u'M\xfcnchen-Obermenzing',
     u'M\xfcnchen-Riem',
     u'M\xfcnchen-Thalkirchen',
     u'M\xfcnchen-Unterf\xf6hring',
     u'M\xfcnchen\u200e',
     u'M\xfcnschen',
     'Neubiberg',
     'Neuperlach',
     'Neuried',
     'Ottobrunn',
     'Planegg',
     'Pullach',
     'Pullach i. Isartal',
     'Putzbrunn',
     'Putzbrunn/Seemannsiedlung',
     'Putzbrunn/Solalinden',
     'Putzbrunn/Waldkolonie',
     'Riezlerweg',
     'Salmdorf',
     'So

In [19]:
def is_city(elem):
    """Tell whether the element's ``k`` attribute is exactly ``"addr:city"``."""
    tag_key = elem.attrib['k']
    return tag_key == "addr:city"

def update_city_name(name):
    """Return the cleaned spelling of a city name.

    A name listed in ``expectedCity`` is kept. Any other name is replaced by
    its ``mappingCity`` entry when one exists, and kept otherwise.
    """
    if name in expectedCity:
        return name
    return mappingCity.get(name, name)

def shape_element(element):
    """Turn a ``node`` or ``way`` element into a dict ready for JSON export.

    The result holds:
      * "id", "type" (the element's tag) and, when present, "visible";
      * "pos" as ``[lat, lon]`` floats (nodes only);
      * "created" with the version/changeset/timestamp/user/uid attributes;
      * one entry per ``<tag>`` child. A "." in the key is replaced by ":"
        first; keys that still contain a problematic character are skipped,
        and keys of the form ``addr:<field>`` go into the "address" sub-dict
        (the city value is cleaned with ``update_city_name``);
      * "node_refs" with the ``ref`` of every ``<nd>`` child.

    Args:
        element: a parsed XML element.

    Returns:
        The dict described above, or None when the element is neither a
        ``node`` nor a ``way``.
    """
    # just the "node" and "way" nodes
    if element.tag != "node" and element.tag != "way":
        return None

    node = {}
    address = {}
    noderefs = []

    # extract everything the nodes have commonly
    node["id"] = element.attrib["id"]
    node["type"] = element.tag
    if element.get("visible"):
        node["visible"] = element.get("visible")
    if element.tag == "node":
        node["pos"] = [float(element.attrib["lat"]), float(element.attrib["lon"])]

    # get the metadata
    node["created"] = {
        "version": element.get("version"),
        "changeset": element.get("changeset"),
        "timestamp": element.get("timestamp"),
        "user": element.get("user"),
        "uid": element.get("uid"),
    }

    # process all the other info (tags and node references)
    for child in element.iter():
        if child.tag == "tag":
            # some tags have a "." -> replace it with ":" first
            key = child.get("k").replace(".", ":")
            value = child.get("v")

            if problemchars.search(key):
                # still a problematic character, so skip
                pass
            elif lower_colon.search(key) and key[0:5] == "addr:":
                # an address element found -> add to address; the check uses the
                # normalised key so that "addr.city" is cleaned like "addr:city"
                field = key[5:]
                if field == "city":
                    address["city"] = update_city_name(value)
                else:
                    address[field] = value
            else:
                # any other key (with or without a single colon)
                node[key] = value

        elif child.tag == "nd":
            # combine node refs
            noderefs.append(child.get("ref"))

    # Attach the collected sub-structures once, after all children were seen.
    if address != {}:
        node["address"] = address
    if noderefs != []:
        node["node_refs"] = noderefs

    return node


def process_map(file_in, pretty = False):
    """Shape every node/way of an XML file and write them as JSON lines.

    The output file is named ``<file_in>.json``; each shaped element is
    written as one JSON document followed by a newline.

    Args:
        file_in: path of the OSM XML document.
        pretty: when true, each JSON document is indented by 2 spaces.

    Returns:
        list of all shaped elements (the dicts returned by ``shape_element``).
    """
    file_out = "{0}.json".format(file_in)
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _, element in ET.iterparse(file_in):
            el = shape_element(element)
            if el:
                data.append(el)
                fo.write(json.dumps(el, indent=indent) + "\n")
                # The node/way has been converted; release its attributes and
                # children so the parsed tree does not accumulate in memory.
                element.clear()
    return data

# Convert the OSM file to JSON lines (compact output) and keep the shaped elements.
if True:
    data = process_map(OSM, pretty=False)

In [3]:
import xml.etree.cElementTree as ET  # Use cElementTree or lxml if too slow

# Input for the sampling step below; reuses the OSM path defined in the first cell.
OSM_FILE = OSM  # Replace this with your osm file
# Destination path of the reduced sample file written below.
SAMPLE_FILE = "/Users/wilfriedhoge/Downloads/sample.osm"


def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate the completed elements of ``osm_file`` whose tag is in ``tags``.

    After the consumer has finished with a yielded element, the document root
    is cleared before parsing continues.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event delivers the document root; keep it for clearing.
    _, root = next(parse_events)
    for kind, node in parse_events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


# Write a reduced copy of the OSM file: an XML header, an <osm> wrapper and
# a subset of the top-level elements.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Keep every 100th top level element (indices 0, 100, 200, ...)
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % 100 != 0:
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')