In [1]:
# -*-coding:utf-8 -*-
import xml.etree.ElementTree as ET
import pprint
import re
from collections import defaultdict
# Path of the full OSM XML extract that the parsing cells read.
OSM_FILE='./delaware-latest.osm'
# Path of the down-sampled copy that the sampling loop writes.
SAMPLE_FILE ='./sample.osm'
k = 40 # Parameter: take every k-th top level element
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield each element of ``osm_file`` whose tag is listed in ``tags``.

    The file is parsed incrementally; an element is yielded when its closing
    tag has been read, so its attributes and children are complete.

    Args:
        osm_file: path (or file object) of the XML document to parse.
        tags: element tag names to yield.

    Yields:
        ``xml.etree.ElementTree.Element`` objects. The root is cleared as
        soon as the generator is resumed, so a yielded element must be fully
        consumed before the next one is requested.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    # Parse the file that was passed in; the previous code ignored the
    # argument and always read the module-level OSM_FILE.
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event is the start of the root element; keep a handle on it
    # so already-processed children can be released.
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            # Drop everything parsed so far to keep memory use bounded.
            root.clear()


# Build the sample file: an XML prolog, an <osm> wrapper, and every k-th
# top-level element of the full extract.
# The file is opened in binary mode, so every write is given bytes. The b''
# prefix is a no-op on Python 2 and required on Python 3, where
# ET.tostring(..., encoding='utf-8') also returns bytes.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')

In [2]:
def count_tags(filename):
    """Count how many times each element tag occurs in an XML file.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        dict mapping each tag name to its number of occurrences.
    """
    tags = {}
    for _, elem in ET.iterparse(filename):
        # dict.get avoids scanning a key list for every element.
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
    return tags

In [3]:
# Count the occurrences of every element tag in the full extract and show them.
tags = count_tags(OSM_FILE)
pprint.pprint(tags)

{'bounds': 1,
 'member': 62932,
 'nd': 1123208,
 'node': 941707,
 'osm': 1,
 'relation': 1712,
 'tag': 527988,
 'way': 92076}


In [12]:
def judge_id(filename):
    """Collect ``uid`` attribute values that are not made up solely of digits.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of the offending ``uid`` strings, in document order. Elements
        without a ``uid`` attribute are ignored.
    """
    digits_only = re.compile(r'^\d+$')
    bad_id = []
    for _, element in ET.iterparse(filename):
        # A missing uid is expected on many elements; test for it explicitly
        # instead of hiding every possible error behind a bare except.
        uid = element.attrib.get('uid')
        if uid is not None and not digits_only.match(uid):
            bad_id.append(uid)
    return bad_id

In [13]:
# Show the non-numeric uid values found in the sample file.
# The call form prints the same text on Python 2 and is valid on Python 3.
print(judge_id(SAMPLE_FILE))

[]


In [14]:
def get_user(filename):
    """Collect the uid/timestamp/user attributes of every element that has them.

    One record is produced per matching element; records are NOT
    de-duplicated, so the same user appears once per element they edited.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of dicts with the keys ``'uid'``, ``'timestamp'`` and ``'user'``,
        in document order. Elements lacking any of the three attributes are
        skipped.
    """
    required = ('uid', 'timestamp', 'user')
    users = []
    for _, element in ET.iterparse(filename):
        attrib = element.attrib
        # Explicit presence check replaces the former bare except, which hid
        # every error rather than just the expected missing attribute.
        if all(key in attrib for key in required):
            users.append({key: attrib[key] for key in required})
    return users

In [15]:
# Collect the uid/timestamp/user records from the full extract.
user_data=get_user(OSM_FILE)

In [16]:
def get_node_tags(filename):
    """Collect changeset, id and coordinates of every ``node`` element.

    Despite the name, no ``<tag>`` children are read; only the node's own
    attributes are used.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of dicts with the keys ``'changeset'``, ``'node_id'``, ``'lat'``
        and ``'lon'`` (all strings), in document order. Nodes lacking any of
        the source attributes are skipped.
    """
    # output key -> source attribute on the <node> element
    fields = (('changeset', 'changeset'), ('node_id', 'id'),
              ('lat', 'lat'), ('lon', 'lon'))
    node_list = []
    for _, element in ET.iterparse(filename):
        if element.tag != 'node':
            continue
        attrib = element.attrib
        # Explicit presence check replaces the former bare except.
        if all(source in attrib for _, source in fields):
            node_list.append({key: attrib[source] for key, source in fields})
    return node_list

In [17]:
# Collect changeset/id/lat/lon for every node in the full extract.
node_data= get_node_tags(OSM_FILE)

In [18]:
# Preview the first ten node records.
# The call form prints the same text on Python 2 and is valid on Python 3.
print(node_data[0:10])

[{'lat': '39.8023061', 'changeset': '14254291', 'node_id': '75385503', 'lon': '-75.4149476'}, {'lat': '39.7221284', 'changeset': '8027863', 'node_id': '75390099', 'lon': '-75.7887221'}, {'lat': '39.7439567', 'changeset': '4830611', 'node_id': '75390129', 'lon': '-75.762562'}, {'lat': '39.7556043', 'changeset': '4870911', 'node_id': '75390163', 'lon': '-75.7544014'}, {'lat': '39.781798', 'changeset': '4870911', 'node_id': '75390202', 'lon': '-75.7301321'}, {'lat': '39.7848591', 'changeset': '4870911', 'node_id': '75390209', 'lon': '-75.726572'}, {'lat': '39.7792758', 'changeset': '4870911', 'node_id': '75390220', 'lon': '-75.732966'}, {'lat': '39.791653', 'changeset': '4870911', 'node_id': '75390223', 'lon': '-75.7176608'}, {'lat': '39.7939574', 'changeset': '4870911', 'node_id': '75390224', 'lon': '-75.7147491'}, {'lat': '39.7893187', 'changeset': '4870911', 'node_id': '75390230', 'lon': '-75.7207621'}]


In [19]:
def get_ways(filename):
    """Collect changeset and id of every ``way`` element.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of dicts with the keys ``'changeset'`` and ``'way_id'`` (both
        strings), in document order. Ways lacking either source attribute
        are skipped.
    """
    way_list = []
    for _, element in ET.iterparse(filename):
        if element.tag != 'way':
            continue
        attrib = element.attrib
        # Explicit presence check replaces the former bare except.
        if 'changeset' in attrib and 'id' in attrib:
            way_list.append({'changeset': attrib['changeset'],
                             'way_id': attrib['id']})
    return way_list

In [20]:
# Collect changeset/id for every way in the full extract.
way_data= get_ways(OSM_FILE)

In [21]:
def get_infor(element):
    """Return the selected ``<tag>`` key/value pairs found under ``element``.

    Only the keys name, wifi, amenity, cuisine, addr:city and addr:postcode
    are kept; when a key occurs more than once the last value wins.

    Args:
        element: an ElementTree element whose ``<tag>`` descendants are read.

    Returns:
        dict mapping each selected ``k`` attribute to its ``v`` attribute.
    """
    wanted = ['name', 'wifi', 'amenity', 'cuisine', 'addr:city', 'addr:postcode']
    selected = {}
    for child in element.iter('tag'):
        key = child.attrib['k']
        if key not in wanted:
            continue
        selected[key] = child.attrib['v']
    return selected
def get_tags(filename,value):
    """Collect selected tags of every ``way`` whose ``amenity`` equals ``value``.

    Only ``way`` elements are inspected; nodes and relations are ignored.

    Args:
        filename: path (or file object) of the XML document to parse.
        value: the ``amenity`` value to look for, e.g. ``'restaurant'``.

    Returns:
        list with one dict (as built by ``get_infor``) per matching way, in
        document order.
    """
    data = []
    for _, element in ET.iterparse(filename):
        if element.tag != 'way':
            continue
        try:
            matches = any(
                tag.attrib['k'] == 'amenity' and tag.attrib['v'] == value
                for tag in element.iter('tag')
            )
            if matches:
                data.append(get_infor(element))
        except KeyError:
            # A <tag> without k/v makes the whole way unusable; skip it as
            # before, but no longer swallow unrelated errors.
            continue
    return data

In [22]:
# Selected tags of every way tagged amenity=restaurant in the full extract.
restaurant_data=get_tags(OSM_FILE,'restaurant')

In [2]:
def judge_zip(filename):
    """Collect ``tiger:zip_left`` values on ways that are not exactly five digits.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of the offending values, in document order (duplicates kept).
    """
    five_digits = re.compile(r'^\d{5}$')
    bad_code = []
    for _, element in ET.iterparse(filename):
        if element.tag != 'way':
            continue
        try:
            for tag in element.iter('tag'):
                if tag.attrib['k'] == 'tiger:zip_left':
                    value = tag.attrib['v']
                    if not five_digits.match(value):
                        bad_code.append(value)
        except KeyError:
            # A <tag> without k/v ends the scan of this way, as before;
            # other errors are no longer hidden.
            continue
    return bad_code

In [3]:
# tiger:zip_left values in the sample file that are not exactly five digits.
illegal_zip=judge_zip(SAMPLE_FILE)
# The call form prints the same text on Python 2 and is valid on Python 3.
print(illegal_zip)

['19805:19808', '19707:19805', '19805:19808', '19803:19809:19810', '19703:19809']


In [4]:
def Update_zip(zip):
    """Expand a colon-separated zip value into every code from first to last.

    Only the first and last parts are used as inclusive range bounds; any
    parts in between are ignored. Codes are zero-padded to the width of the
    first part, so leading zeros survive the int round trip.

    NOTE(review): if colon-separated values are meant as a list of discrete
    codes rather than range bounds, this expansion over-generates (e.g.
    '19707:19805' yields 99 codes) - confirm against the data source.

    Args:
        zip: the raw value, e.g. ``'19805:19808'``. (The name shadows the
            builtin but is kept for keyword callers.)

    Returns:
        list of zip code strings in ascending order; empty when the last
        part is smaller than the first.

    Raises:
        ValueError: if the first or last part is not an integer.
    """
    parts = [part.strip() for part in zip.split(':')]
    first, last = parts[0], parts[-1]
    width = len(first)
    return [str(code).zfill(width)
            for code in range(int(first), int(last) + 1)]

In [26]:
#Get zip code
from collections import defaultdict
def get_zip(filename):
    """Collect the ``tiger:zip_left`` value and name of every way that has one.

    Only the first ``tiger:zip_left`` tag of a way is used. A value of
    length five is stored as-is (a string); any other value is passed to
    ``Update_zip`` and the resulting list is stored instead.

    Args:
        filename: path (or file object) of the XML document to parse.

    Returns:
        list of dicts, in document order, each with ``'zip_code'`` (str or
        list of str) and, when the way has a ``name`` tag, ``'name'``. Ways
        whose value cannot be expanded, or that contain a ``<tag>`` without
        k/v, are skipped.
    """
    zip_code = []
    for _, element in ET.iterparse(filename):
        if element.tag != 'way':
            continue
        try:
            for tag in element.iter('tag'):
                if tag.attrib['k'] != 'tiger:zip_left':
                    continue
                raw = tag.attrib['v']
                zip_infor = {
                    'zip_code': raw if len(raw) == 5 else Update_zip(raw)
                }
                # Separate loop variable: the old inner loop reused `tag`
                # and clobbered the outer iteration variable.
                for name_tag in element.iter('tag'):
                    if name_tag.attrib['k'] == 'name':
                        zip_infor['name'] = name_tag.attrib['v']
                        break
                zip_code.append(zip_infor)
                break
        except (KeyError, ValueError):
            # KeyError: <tag> without k/v. ValueError: non-numeric value
            # handed to Update_zip. Both skip the way, as the former bare
            # except did, without hiding unrelated errors.
            continue
    return zip_code

In [27]:
# Collect zip code (and name) records for the ways in the full extract;
# get_zip expands values that are not five characters long via Update_zip.
zip_code_data=get_zip(OSM_FILE)

In [28]:
# Write data to a JSON file
import json
def output_json(data,name):
    """Serialize ``data`` as JSON into ``<name>.json`` and report completion.

    Args:
        data: any JSON-serializable object.
        name: output path without the ``.json`` extension.
    """
    path = name + '.json'
    with open(path, 'w') as f:
        json.dump(data, f)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print("completed")

In [29]:
# Write each collected dataset to <name>.json in the working directory.
output_json(user_data,'user')
output_json(node_data,'node')
output_json(way_data,'way')
output_json(restaurant_data,'restaurant')
output_json(zip_code_data,'zip_code')

completed
completed
completed
completed
completed


In [30]:
# Insert the json to mongodb
import json
from pymongo import MongoClient
# Connect to the MongoDB server on localhost, port 27017.
client = MongoClient("mongodb://localhost:27017")
# Database handle used by the insert_* helpers.
db=client.examples

In [31]:
def insert_user(document):
    """Load a JSON file and bulk-insert its records into ``db.user``.

    Args:
        document: path of a JSON file holding a list of documents.
    """
    with open(document) as f:
        data = json.load(f)
    # The file is closed before the database round trip.
    if data:
        # insert_many rejects an empty list, so only call it when there is
        # something to insert.
        db.user.insert_many(data)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print('completed')

In [32]:
# Load user.json into the `user` collection.
insert_user('user.json')

completed


In [33]:
def insert_node(document):
    """Load a JSON file and bulk-insert its records into ``db.node``.

    Args:
        document: path of a JSON file holding a list of documents.
    """
    with open(document) as f:
        data = json.load(f)
    # The file is closed before the database round trip.
    if data:
        # insert_many rejects an empty list, so only call it when there is
        # something to insert.
        db.node.insert_many(data)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print('completed')

In [34]:
# Load node.json into the `node` collection.
insert_node('node.json')

completed


In [35]:
def insert_way(document):
    """Load a JSON file and bulk-insert its records into ``db.way``.

    Args:
        document: path of a JSON file holding a list of documents.
    """
    with open(document) as f:
        data = json.load(f)
    # The file is closed before the database round trip.
    if data:
        # insert_many rejects an empty list, so only call it when there is
        # something to insert.
        db.way.insert_many(data)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print('completed')

In [36]:
# Load way.json into the `way` collection.
insert_way('way.json')

completed


In [37]:
def insert_restaurant(document):
    """Load a JSON file and bulk-insert its records into ``db.restaurant``.

    Args:
        document: path of a JSON file holding a list of documents.
    """
    with open(document) as f:
        data = json.load(f)
    # The file is closed before the database round trip.
    if data:
        # insert_many rejects an empty list, so only call it when there is
        # something to insert.
        db.restaurant.insert_many(data)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print('completed')

In [38]:
# Load restaurant.json into the `restaurant` collection.
insert_restaurant('restaurant.json')

completed


In [39]:
def insert_zip_code(document):
    """Load a JSON file and bulk-insert its records into ``db.zip_code``.

    Args:
        document: path of a JSON file holding a list of documents.
    """
    with open(document) as f:
        data = json.load(f)
    # The file is closed before the database round trip.
    if data:
        # insert_many rejects an empty list, so only call it when there is
        # something to insert.
        db.zip_code.insert_many(data)
    # The call form prints the same text on Python 2 and is valid on Python 3.
    print('completed')

In [40]:
# Load zip_code.json into the `zip_code` collection.
insert_zip_code('zip_code.json')

completed
