In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import re
import codecs
import json

In [2]:
# Path of the OSM XML extract passed to count_tags, process_map and process_id below.
filename='pune_india.osm'

In [3]:
def count_tags(filename):
    """Tally how often each element name occurs in an XML file.

    Args:
        filename: path (or file object) handed to ``ET.iterparse``.

    Returns:
        dict mapping element tag name to its number of occurrences.
    """
    tally = {}
    for _, node in ET.iterparse(filename):
        # Default iterparse events are "end" only, so each element is seen once.
        tally[node.tag] = tally.get(node.tag, 0) + 1
    return tally

In [4]:
# Tally element names across the whole extract; the returned dict is the cell output.
count_tags(filename)

{'bounds': 1,
 'member': 8048,
 'nd': 1702582,
 'node': 1419019,
 'osm': 1,
 'relation': 2182,
 'tag': 306838,
 'way': 270406}

In [4]:
# Key made only of lowercase letters/underscores (e.g. "highway").
lower = re.compile(r'^([a-z]|_)*$')
# Two lowercase/underscore parts separated by exactly one colon (e.g. "addr:street").
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Any single character considered problematic inside a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    """Classify the ``k`` attribute of a ``tag`` element and bump its counter.

    Args:
        element: an ElementTree element; anything other than ``tag`` is ignored.
        keys: dict with the counters "lower", "lower_colon", "problemchars"
            and "other"; it is updated in place.

    Returns:
        The same ``keys`` dict, for convenient reassignment by the caller.
    """
    if element.tag == "tag":
        key = element.get("k")
        if lower.match(key) is not None:
            keys["lower"] += 1
        elif lower_colon.match(key) is not None:
            keys["lower_colon"] += 1
        elif problemchars.search(key) is not None:
            # search(), not match(): a problematic character can sit anywhere
            # in the key, while match() would only inspect the first character.
            keys["problemchars"] += 1
        else:
            keys["other"] += 1
    return keys



def process_map(filename):
    """Count tag keys in an OSM file by category.

    Args:
        filename: path (or file object) of the OSM XML to scan.

    Returns:
        dict with the counters "lower", "lower_colon", "problemchars" and
        "other", as filled in by ``key_type``.
    """
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    # Parse the file named by the parameter rather than an unrelated global.
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)
    return keys

In [6]:
# Run the key-category tally over the full extract; the dict is the cell output.
process_map(filename)

{'lower': 299881, 'lower_colon': 6761, 'other': 196, 'problemchars': 0}

In [5]:
def get_user(element):
    """Return the element's ``uid`` attribute, or None when it has none."""
    uid = element.get("uid")
    return uid


def process_id(filename):
    """Collect the distinct ``uid`` values found on any element of the file.

    Args:
        filename: path (or file object) handed to ``ET.iterparse``.

    Returns:
        set of uid strings; elements without a uid contribute nothing.
    """
    found = (get_user(node) for _, node in ET.iterparse(filename))
    return set(uid for uid in found if uid is not None)

In [6]:
# Number of distinct uid values in the extract.
len(process_id(filename))

710

In [7]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Source extract and destination of the down-sampled copy written by this cell.
OSM_FILE = "pune_india.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every fully parsed element whose tag name is in ``tags``.

    Args:
        osm_file: path (or file object) handed to ``ET.iterparse``.
        tags: tag names to yield; other elements are skipped.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event is the start of the document root; keep a handle on it.
    _, root = next(events)
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        # Once the consumer is done with the element, drop what the root holds.
        root.clear()


# Write a reduced copy of OSM_FILE to SAMPLE_FILE, keeping one top-level
# element out of every k.
with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    for index, element in enumerate(get_element(OSM_FILE)):
        if index % k:
            # Not a multiple of k: skip this element.
            continue
        output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

In [8]:
# Captures the last run of non-whitespace characters of a street name
# (optionally followed by a period); audit_street_type treats it as the street type.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that audit_street_type accepts without reporting them.
expected = ["Court", "Place", "Square", "Lane","Chowk", "Parkway",
            "Circle","Marg","Avenue","Nagar","Park","Path","Road","Street"]


# Word-level replacements applied by update_name: abbreviations, lowercase
# variants and the "raod" misspelling are mapped to their canonical form.
mapping = { "St": "Street",
            "st": "Street",
            "street":"Street",
            "Ave": "Avenue",
            "ave": "Avenue",
            "Rd.": "Road",
            "Rd": "Road",
            "road": "Road",
            "raod": "Road",
            "udyog": "Udyog",
            "chowk": "Chowk"}




def audit_street_type(street_types, street_name):
    """Record ``street_name`` under its street type when that type is unexpected.

    Args:
        street_types: mapping of street type -> set of street names; it is
            updated in place.
        street_name: the street name to inspect.
    """
    found = street_type_re.search(street_name)
    if not found:
        return
    suffix = found.group()
    if suffix in expected:
        return
    street_types[suffix].add(street_name)


def is_street_name(elem):
    """Tell whether the element's ``k`` attribute is exactly "addr:street"."""
    key = elem.attrib['k']
    return key == "addr:street"

def audit(osmfile):
    """Group the street names of an OSM file by their unexpected street type.

    Args:
        osmfile: path of the OSM XML file to audit.

    Returns:
        defaultdict(set) mapping each street type reported by
        ``audit_street_type`` to the street names that use it.
    """
    street_types = defaultdict(set)
    # The with-block closes the file even when parsing raises; binary mode
    # leaves decoding to the XML parser.
    with open(osmfile, "rb") as osm_file:
        # Use "end" events: on "start" the element's children are not
        # guaranteed to be attached yet, so its tags could be missed.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag in ("node", "way"):
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
    return street_types

def update_name(name, mapping):
    """Replace every whitespace-separated word of ``name`` found in ``mapping``.

    Args:
        name: the street name to clean.
        mapping: dict of word -> replacement; words must match a key exactly.

    Returns:
        The words re-joined with single spaces, replacements applied.
    """
    return " ".join(mapping.get(word, word) for word in name.split())


In [9]:
# Audit the sampled extract and show the unexpected street types with their names.
st_types = audit(SAMPLE_FILE)

pprint.pprint(dict(st_types))


{'10': set(['Road 10']),
 '2': set(['Hinjewadi Phase 2']),
 'Bypass': set(['NH4 Bypass']),
 'Gymkhana': set(['Deccan Gymkhana']),
 'J13': set(['Road number 13, sub lane J13']),
 'Magarpatta': set(['Magarpatta']),
 'Nilakh': set(['Pimple Nilakh']),
 'Pashan': set(['Dr. Homi Bhabha Road, Pashan']),
 'Rd': set(['Bhajimandai Rd', 'Gulawani Maharaj Rd', 'MIT College Rd']),
 'Sheri': set(['Mahadev Nagar, Wadgaon Sheri']),
 'Trail': set(['Pashan Hill Trail']),
 'raod': set(['katepuram mayur nagari raod'])}


In [10]:
 # Print each audited street name next to the result of update_name (Python 2 syntax).
 for st_type, ways in st_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name

Road 10 => Road 10
Dr. Homi Bhabha Road, Pashan => Dr. Homi Bhabha Road, Pashan
Deccan Gymkhana => Deccan Gymkhana
Pimple Nilakh => Pimple Nilakh
Bhajimandai Rd => Bhajimandai Road
Gulawani Maharaj Rd => Gulawani Maharaj Road
MIT College Rd => MIT College Road
Pashan Hill Trail => Pashan Hill Trail
Hinjewadi Phase 2 => Hinjewadi Phase 2
Magarpatta => Magarpatta
NH4 Bypass => NH4 Bypass
Mahadev Nagar, Wadgaon Sheri => Mahadev Nagar, Wadgaon Sheri
Road number 13, sub lane J13 => Road number 13, sub lane J13
katepuram mayur nagari raod => katepuram mayur nagari Road


In [None]:


# NOTE(review): `csv`, `cerberus` and `schema` are used further down in this cell
# but are not imported anywhere in the visible notebook -- confirm they are
# imported before this cell is run.
import Improving_Street_Names
# Rebinds `update_name` and `mapping`, which an earlier cell also defines.
from Improving_Street_Names import update_name, mapping,name
# NOTE(review): the next two expression statements discard their results;
# presumably leftovers from interactive use -- confirm they can be removed.
Improving_Street_Names.update_name(name, mapping)
Improving_Street_Names.mapping
# OSM extract converted to CSV by the __main__ block at the end of this cell.
OSM_PATH = "pune_india.osm"

# Output CSV files, one per table.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Key starting with lowercase/underscore characters, a colon, then at least one
# more lowercase/underscore character (no end anchor, so more may follow).
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Any single character considered problematic inside a tag key.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Default schema used by validate_element.
SCHEMA = schema.schema

# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _shape_tags(element, problem_chars, default_tag_type):
    """Return one row dict per usable ``tag`` child of ``element``."""
    rows = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if problem_chars.search(key):
            # Keys containing problematic characters are dropped entirely.
            continue
        value = tag.attrib['v']
        if key == 'addr:street':
            # Street names are normalised before they are stored.
            value = update_name(value, mapping)
        if LOWER_COLON.search(key):
            # Split on the first colon only: the part before it is the type,
            # everything after it (further colons included) is the key.
            tag_type, tag_key = key.split(':', 1)
        else:
            tag_type, tag_key = default_tag_type, key
        rows.append({
            'id': element.attrib['id'],
            'key': tag_key,
            'value': value,
            'type': tag_type,
        })
    return rows


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: an ElementTree element.
        node_attr_fields: attribute names copied from a ``node`` element.
        way_attr_fields: attribute names copied from a ``way`` element.
        problem_chars: compiled regex; tags whose key matches it are skipped.
        default_tag_type: ``type`` given to tags whose key has no type prefix.

    Returns:
        ``{'node': ..., 'node_tags': [...]}`` for a node,
        ``{'way': ..., 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and None for any other element.

    Raises:
        KeyError: if the element lacks one of the requested attributes.
    """
    if element.tag == 'node':
        node_attribs = {field: element.attrib[field] for field in node_attr_fields}
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_id = element.attrib['id']
        way_attribs = {field: element.attrib[field] for field in way_attr_fields}
        # position is the 0-based order of the nd reference inside the way.
        way_nodes = [
            {'id': way_id, 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.iter('nd'))
        ]
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    return None


# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield each completed element whose tag name is listed in ``tags``."""

    parse_events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the start of the document root; keep a handle on it.
    _, root = next(parse_events)
    for kind, node in parse_events:
        wanted = kind == 'end' and node.tag in tags
        if wanted:
            yield node
            # Drop what the root holds once the consumer is done with the element.
            root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise an Exception describing the first error if validation fails.

    Args:
        element: the shaped dict to check.
        validator: object exposing ``validate(element, schema)`` and ``errors``.
        schema: schema passed through to ``validator.validate``.
    """
    if validator.validate(element, schema) is True:
        return

    # Report only the first (field, errors) pair the validator recorded.
    field, errors = next(validator.errors.iteritems())
    template = "\nElement of type '{0}' has the following errors:\n{1}"
    raise Exception(template.format(field, pprint.pformat(errors)))


class UnicodeDictWriter(csv.DictWriter, object):
    """csv.DictWriter variant that UTF-8 encodes ``unicode`` values before writing."""

    def writerow(self, row):
        """Write one row, encoding every ``unicode`` value as UTF-8 bytes."""
        encoded = {}
        for field, value in row.iteritems():
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            encoded[field] = value
        super(UnicodeDictWriter, self).writerow(encoded)

    def writerows(self, rows):
        """Write each row through ``writerow`` so the encoding is applied."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream ``file_in`` and write every shaped node and way to the CSV files.

    Note: an earlier cell defines a different ``process_map(filename)``; running
    this cell rebinds the name.

    Args:
        file_in: path of the OSM XML file to convert.
        validate: when exactly True, each shaped element is passed to
            ``validate_element`` before it is written.
    """

    with codecs.open(NODES_PATH, 'w') as nodes_file, \
            codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
            codecs.open(WAYS_PATH, 'w') as ways_file, \
            codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
            codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # Every CSV starts with its header row.
        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


# Entry point: convert OSM_PATH to the CSV files with validation switched on.
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)



In [11]:
import sqlite3
from pprint import pprint

sqlite_file = 'newmap.db'    # name of the sqlite database file created

# Connecting to the database file
# `conn` and `cur` are reused by the query cells that follow.
conn = sqlite3.connect(sqlite_file)
cur = conn.cursor()


# Count the rows of the nodes table.
QUERY = '''
SELECT COUNT(*) as count
FROM nodes;
'''

cur.execute(QUERY)

result = cur.fetchall()
print 'number of nodes:'
print(result)

number of nodes:
[(1419019,)]


In [12]:
# Count the rows of the ways table.
QUERY = '''
SELECT COUNT(*) as count
FROM ways;
'''

cur.execute(QUERY)

result = cur.fetchall()
print 'number of ways:'
print(result)

number of ways:
[(270406,)]


In [13]:
# Ten users with the most rows across nodes and ways combined (UNION ALL keeps
# every row, so each node or way counts once for its user).
QUERY = '''
 SELECT id.user, COUNT(*) as num
FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) id
GROUP BY id.user
ORDER BY num DESC
LIMIT 10;
'''

cur.execute(QUERY)

result = cur.fetchall()
print 'Top 10 contributing users'
print(result)

Top 10 contributing users
[(u'singleton', 96623), (u'harishvarma', 60143), (u'jasvinderkaur', 57694), (u'sramesh', 57627), (u'praveeng', 56788), (u'shiva05', 51899), (u'anushapyata', 49530), (u'kranthikumar', 47435), (u'harishk', 43180), (u'saikumar', 40332)]


In [14]:

# Fifteen most frequent values of the 'amenity' key among node tags.
QUERY = '''
 SELECT value, COUNT(*) as num
FROM nodes_tags
WHERE key='amenity'
GROUP BY value
ORDER BY num DESC
LIMIT 15;
'''


cur.execute(QUERY)

result = cur.fetchall()
print'Top 15 appearing amenities'
print(result)

Top 15 appearing amenities
[(u'restaurant', 241), (u'bank', 179), (u'atm', 140), (u'place_of_worship', 121), (u'cafe', 75), (u'fast_food', 70), (u'hospital', 53), (u'fuel', 45), (u'school', 40), (u'police', 31), (u'pharmacy', 30), (u'toilets', 23), (u'post_office', 16), (u'bus_station', 14), (u'parking', 14)]


In [15]:
# Count the 'religion' values of nodes that carry a tag valued 'place_of_worship'.
# NOTE(review): the subquery filters on value only, not on key='amenity', so any
# tag with that value selects the node -- confirm this is intended.
QUERY = '''
SELECT nodes_tags.value, COUNT(*) as num
FROM nodes_tags 
    JOIN (SELECT DISTINCT(id) FROM nodes_tags WHERE value='place_of_worship') wp
    ON nodes_tags.id=wp.id
WHERE nodes_tags.key='religion'
GROUP BY nodes_tags.value
ORDER BY num DESC
LIMIT 5;
'''


cur.execute(QUERY)

result_zip2 = cur.fetchall()
print 'Place of Worship:'
print(result_zip2)

Place of Worship:
[(u'hindu', 76), (u'muslim', 10), (u'christian', 4), (u'sikh', 1)]


In [16]:
# Five most frequent 'name' values among nodes that carry a tag valued 'bank'.
# NOTE(review): the subquery filters on value only, not on key='amenity', so any
# tag with that value selects the node -- confirm this is intended.
QUERY = '''
SELECT nodes_tags.value, COUNT(*) as num
FROM nodes_tags 
    JOIN (SELECT DISTINCT(id) FROM nodes_tags WHERE value='bank') bk
    ON nodes_tags.id=bk.id
WHERE nodes_tags.key='name'
GROUP BY nodes_tags.value
ORDER BY num DESC
LIMIT 5;
'''


cur.execute(QUERY)
print '5 Biggest Bank:'
result = cur.fetchall()
print(result)

5 Biggest Bank:
[(u'Bank of Maharashtra', 16), (u'ICICI Bank', 10), (u'Axis Bank', 9), (u'HDFC Bank', 9), (u'State Bank of India', 9)]
