# OpenStreetMap Project
# Data Wrangling with MongoDB
### Prasand Kumar

Map Area: Hyderabad, India

https://mapzen.com/data/metro-extracts/metro/hyderabad_india/

http://www.openstreetmap.org/relation/237385

### Tag Audit

In [8]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict
import re

# Path of the OSM XML extract that is passed to every parsing function in this notebook.
file = 'Hyderabad.osm'

def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: Path (or binary file object) of the XML document to scan.

    Returns:
        A ``defaultdict(int)`` mapping each tag name to its number of
        occurrences.
    """
    tags = defaultdict(int)
    # With no ``events`` argument iterparse reports "end" events only, so
    # every element is complete by the time it is seen here.
    for _, element in ET.iterparse(filename):
        tags[element.tag] += 1
        # iterparse still builds the whole tree behind the scenes; drop the
        # element's content once it is counted to keep memory use down on
        # large extracts.
        element.clear()
    return tags

In [6]:
# Tally every element tag in the extract and pretty-print the resulting counts.
tags = count_tags(file)
pprint.pprint(tags)

defaultdict(<class 'int'>,
            {'bounds': 1,
             'member': 11471,
             'nd': 4094848,
             'node': 3239607,
             'osm': 1,
             'relation': 2470,
             'tag': 869409,
             'way': 772473})


### Exploring Users

In [12]:
def process_map(filename):
    """Collect the distinct ``uid`` attribute values found in an XML file.

    Args:
        filename: Path (or binary file object) of the XML document to scan.

    Returns:
        A set holding the ``uid`` attribute value (a string) of every element
        that carries one.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        uid = element.attrib.get('uid')
        if uid is not None:
            users.add(uid)
        # The uid string has been copied out, so the element's content can be
        # released; otherwise the whole tree accumulates in memory.
        element.clear()
    return users

# Gather the set of unique uid values from the extract and pretty-print all of them.
users = process_map(file)
pprint.pprint(users)

{'1',
 '100487',
 '1008085',
 '1008088',
 '101441',
 '1016290',
 '1031039',
 '1031188',
 '103253',
 '1051550',
 '1069176',
 '109217',
 '110263',
 '110639',
 '1108251',
 '1112761',
 '111849',
 '1127874',
 '112975',
 '1136933',
 '114161',
 '1144645',
 '1164',
 '1164143',
 '116609',
 '1178187',
 '118021',
 '1198074',
 '1198756',
 '1218879',
 '1219875',
 '1225195',
 '1228101',
 '12295',
 '1229564',
 '1231560',
 '1234722',
 '1240103',
 '1240849',
 '1262469',
 '1273824',
 '1281349',
 '1293194',
 '12946',
 '1295796',
 '1297525',
 '1300764',
 '1306',
 '13203',
 '132740',
 '13303',
 '13363',
 '13445',
 '135290',
 '136860',
 '1386982',
 '1397555',
 '1397912',
 '1407360',
 '1408522',
 '1420318',
 '145231',
 '145671',
 '146675',
 '148676',
 '148776',
 '1487858',
 '1517900',
 '152289',
 '1554107',
 '1562577',
 '1569605',
 '158194',
 '1587943',
 '1588571',
 '1588602',
 '1589731',
 '1597155',
 '160949',
 '1611',
 '161619',
 '1617449',
 '1621698',
 '165061',
 '1651798',
 '1652386',
 '165869',
 '166129

In [20]:
# Report the size of the `users` set, i.e. the number of distinct uid values.
print('The numbers of users exist are',len(users),'.')

The numbers of users exist are 1082 .


## Auditing Street Name

In [11]:
def street_name(element):
    """Return True when the element's ``k`` attribute is ``addr:street``.

    Raises ``KeyError`` if the element has no ``k`` attribute.
    """
    tag_key = element.attrib['k']
    return tag_key == 'addr:street'
    
def is_city_name(elem):
    """Return True when the element's ``k`` attribute is ``addr:city``.

    Raises ``KeyError`` if the element has no ``k`` attribute.
    """
    tag_key = elem.attrib['k']
    if tag_key == "addr:city":
        return True
    return False

def is_postal_code(elem):
    """Return True when the element's ``k`` attribute is ``addr:postcode``.

    Raises ``KeyError`` if the element has no ``k`` attribute.
    """
    tag_key = elem.attrib['k']
    if tag_key == "addr:postcode":
        return True
    return False

# Captures the final whitespace-free token of a street name (optionally ending
# in a period), e.g. "12" in "Road No 12" or "St." in "Main St.".
street_type_re = re.compile(
    r'\b\S+\.?$',
    flags=re.IGNORECASE,
)

In [14]:
def audit_street(osm_file):
    """Audit the ``addr:street`` values found in an OSM XML file.

    Every ``<tag>`` nested in a ``node``, ``way`` or ``relation`` element is
    inspected; tags accepted by ``street_name`` that have a non-empty ``v``
    attribute are counted and grouped by the token ``street_type_re`` matches
    in the value.

    Args:
        osm_file: Path (or binary file object) of the OSM XML document.

    Returns:
        A ``(counts, street_types)`` tuple. ``counts`` is a plain dict mapping
        the tag key to the number of non-empty street values seen;
        ``street_types`` is a ``defaultdict(set)`` mapping each matched token
        to the set of full street values that end with it.
    """
    counts = defaultdict(int)
    street_types = defaultdict(set)
    # Listen for "end" events: at a "start" event only the opening tag is
    # guaranteed to be parsed, so child <tag> elements may not be attached
    # yet and would be silently skipped.
    for _, element in ET.iterparse(osm_file, events=("end",)):
        if element.tag not in ("node", "way", "relation"):
            continue
        for tag in element.iter("tag"):
            if not street_name(tag):
                continue
            street = tag.attrib['v']
            if not street:
                continue
            # Count of addr:street keys that carry a value.
            counts[tag.attrib['k']] += 1
            # Group the full street value under its trailing token.
            match = street_type_re.search(street)
            if match:
                street_types[match.group()].add(street)
        # Release the processed element so the tree does not pile up in
        # memory (assumes node/way/relation elements are not nested inside
        # one another, as in standard OSM XML).
        element.clear()

    return dict(counts), street_types

# Unpack the per-key count dict and the street-type -> street-names mapping
# returned by audit_street, then print the addr:street total and the mapping.
counts, street_types = audit_street(file)    

print("The number of 'addr:street': {}".format(counts['addr:street']))
pprint.pprint(street_types)

The number of 'addr:street': 841
defaultdict(<class 'set'>,
            {'1': {'Parwathi Nagar Road No 1',
                   'Quena Square, Banjara Hills Road No. 1',
                   'lane number 1',
                   'road number 1',
                   'street number 1',
                   'ushodaya colony phase 1'},
             '10': {'Road no 10', 'Street No. 10'},
             '10-D': {'Street 10-D'},
             '11': {'Road No 11'},
             '12': {'12', 'Road No 12', 'Road No. 12'},
             '13': {'Road No 13'},
             '14': {'Road No 14'},
             '15': {'Road No 15'},
             '2': {'Road Number 2'},
             '20': {'street no: 20', '20'},
             '22': {'22'},
             '25': {'road no 25'},
             '3': {'Banjarahiils, Rd. No. 3',
                   'KPHB road no 3',
                   'Road No 3',
                   'Siddartha Nagar Road number 3',
                   'Street No : 3',
                   'VANDANAPURI COLONY STRE

## Auditing City Name and Postal Code

In [3]:
def audit_rest(osm_file):
    """Tally the city-name and postal-code values found in an OSM XML file.

    Every ``<tag>`` nested in a ``node``, ``way`` or ``relation`` element is
    checked with ``is_city_name`` and ``is_postal_code``; the ``v`` attribute
    of each matching tag is counted.

    Args:
        osm_file: Path (or binary file object) of the OSM XML document.

    Returns:
        A ``(city_name, postal_codes)`` tuple of ``defaultdict(int)`` objects,
        each mapping a tag value to the number of times it occurs.
    """
    city_name = defaultdict(int)
    postal_codes = defaultdict(int)

    # Listen for "end" events: at a "start" event only the opening tag is
    # guaranteed to be parsed, so child <tag> elements may not be attached
    # yet and would be silently skipped.
    for _, element in ET.iterparse(osm_file, events=("end",)):
        if element.tag not in ("node", "way", "relation"):
            continue
        for tag in element.iter("tag"):
            # is_city_name is the city-key helper defined in this notebook.
            if is_city_name(tag):
                city_name[tag.attrib['v']] += 1
            elif is_postal_code(tag):
                postal_codes[tag.attrib['v']] += 1
        # Release the processed element so the tree does not pile up in
        # memory (assumes node/way/relation elements are not nested inside
        # one another, as in standard OSM XML).
        element.clear()
    return city_name, postal_codes

# Run the city / postal-code audit on the extract and pretty-print both tallies.
city_name, postal_codes = audit_rest(file)

pprint.pprint(city_name)
pprint.pprint(postal_codes)    

defaultdict(<class 'int'>,
            {', Hyderabad': 1,
             'Bandlaguda': 1,
             "Beside Centre for Good Governance, Greenlands colony, Gachibowli 'X'Roads, Sherilingampally, Rangareddy Dt.,": 1,
             'Beside Sai Gopi Chand Batmintion Academy, Greenlands Colony': 1,
             'CHAMPAPET': 1,
             'Greater Hyderabad Municipal Corporation': 1,
             'HITEC City': 1,
             'HYDERABAD': 21,
             'Hyderabad': 375,
             'Hyderabad Telangana': 1,
             'Hyderabad, Telangana': 4,
             'Hyderabad, Telangana.': 3,
             'Hyderabad,Telangana': 1,
             'KARMANGHAT': 1,
             'Kismat pur': 3,
             'Kismatpur': 1,
             'Kukatpally Hyderabad': 1,
             'Kukatpally Hyderabad,Telangana': 1,
             'Madhapur': 1,
             'Madhapur, Hyderabad, India': 1,
             'Masab Tank, Hyderabad': 1,
             'Masoorabad': 2,
             'Nagaram, Keesara Mandal, Rang