# P3: OpenStreetMap Data Case Study

### 0.1. Code Library and Data

In [34]:
# Folder that holds the downloaded extract; every data file below lives in it.
filedir = '/Users/olgabelitskaya/projects/nd002/Data_Analyst_ND_Project3/'

# file1 is the raw OSM XML; file2-file4 are the GeoJSON exports
# (lines, points, polygons) of the same extract.
file1, file2, file3, file4 = (
    filedir + data_file
    for data_file in (
        'dubai_abu-dhabi.osm',
        'dubai_abu-dhabi_osm_line.geojson',
        'dubai_abu-dhabi_osm_point.geojson',
        'dubai_abu-dhabi_osm_polygon.geojson',
    )
)

In [2]:
from ipykernel import kernelapp as app

In [3]:
import folium

In [4]:
import geopandas as gpd

In [22]:
import sqlite3

In [5]:
import geopandas.io

In [6]:
from sqlalchemy import create_engine

In [7]:
import xml.etree.cElementTree as ET

In [8]:
import pprint

In [9]:
import pandas as pd

In [10]:
import re

In [11]:
import json

In [12]:
import io

In [13]:
import os

In [14]:
import urllib

In [15]:
import codecs
import csv

In [16]:
import cerberus

In [17]:
from IPython.display import Image

In [18]:
from IPython.core.display import display, HTML

In [19]:
from string import Template

In [20]:
from collections import defaultdict as dfdict

### 0.2. Researching the Imported Files and Creating the Data.

In [35]:
# Read the line-geometry GeoJSON export (file2) with geopandas.
df_line = gpd.read_file(file2)

In [23]:
df_line.shape

(143973, 59)

In [24]:
# Read the point-geometry GeoJSON export (file3) with geopandas.
df_point = gpd.read_file(file3)

In [25]:
df_point.shape

(42256, 60)

In [31]:
# Read the polygon-geometry GeoJSON export (file4) with geopandas.
# NOTE(review): the output recorded for this cell is
# "IndexError: list index out of range", so df_polygon may not have been
# assigned by this run -- confirm the load succeeds before relying on it.
df_polygon = gpd.read_file(file4)

IndexError: list index out of range

In [27]:
df_polygon.shape

(42256, 60)

In [24]:
# Function for counting tags.
def count_tags(filename):
    """Count how many times each element tag occurs in an XML document.

    Args:
        filename: path or file object accepted by ET.iterparse.

    Returns:
        A defaultdict(int) mapping tag name -> number of elements with that tag.
    """
    count = dfdict(int)
    # With no `events` argument iterparse reports only "end" events, i.e. one
    # (event, element) pair per completely parsed element.
    for _, element in ET.iterparse(filename):
        count[element.tag] += 1
        # Only the tag name is needed; dropping the element's attributes, text
        # and children keeps memory bounded on large extracts.
        element.clear()
    return count

In [79]:
# Function for counting users.
def get_user(element):
    """Return the element's 'user' attribute, or None when it has none."""
    return element.attrib.get('user')


def process_map_users(filename):
    """Collect the distinct 'user' values found on nodes, ways and relations.

    Args:
        filename: path or file object accepted by ET.iterparse.

    Returns:
        A set with one entry per distinct user name.
    """
    users = set()
    for _, element in ET.iterparse(filename):
        if element.tag in ('node', 'way', 'relation'):
            user = get_user(element)
            # Elements without a 'user' attribute are skipped instead of
            # aborting the whole scan with a KeyError.
            if user is not None:
                users.add(user)
        # Nothing else is read from a finished element, so release its content.
        element.clear()

    return users

In [58]:
# Keys made up only of lower-case letters and underscores (the empty key matches too).
lower = re.compile(r'^([a-z]|_)*$')
# Two such lower-case parts joined by exactly one ':'.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Keys containing at least one character that is troublesome in a key.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Function for sorting by key type.
def key_type(element, keys):
    """Classify the 'k' attribute of a <tag> element and bump the matching counter.

    Elements whose tag is not "tag" leave `keys` untouched. The patterns are
    tried in the order lower, lower_colon, problemchars; a key matching none
    of them is counted as 'other'.

    Args:
        element: an XML element.
        keys: dict with the counters 'lower', 'lower_colon', 'problemchars'
            and 'other'; it is updated in place.

    Returns:
        The same `keys` dict.
    """
    if element.tag != "tag":
        return keys

    key = element.attrib['k']
    ordered_patterns = (
        ('lower', lower),
        ('lower_colon', lower_colon),
        ('problemchars', problemchars),
    )
    for category, pattern in ordered_patterns:
        if pattern.search(key) is not None:
            keys[category] += 1
            break
    else:
        # No pattern matched.
        keys['other'] += 1

    return keys
# Function for collecting lower, lower colon, problemchars, others.
def process_map(filename):
    """Tally the tag keys of an OSM file by key category.

    Args:
        filename: path or file object accepted by ET.iterparse.

    Returns:
        dict with the counters 'lower', 'lower_colon', 'problemchars', 'other'.
    """
    tally = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, parsed in ET.iterparse(filename):
        tally = key_type(parsed, tally)

    return tally

In [84]:
# Matches the trailing run of non-whitespace characters of a street name
# (starting at a word boundary), optionally followed by '.'.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Trailing tokens that are accepted as-is.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", "Trail", "Parkway", "Commons", "Curve", "Circle"]

# Abbreviated trailing token -> full word used by update_name.
mapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road",
            "N.": "North",
            "Ave.": "Avenue",
            "Blvd.": "Boulevard",
            "Blvd": "Boulevard",
            "Ln": "Lane",
            "N": "North"
            }


def audit_street_type(street_types, street_name):
    """Record street_name under its trailing token unless that token is expected.

    Args:
        street_types: mapping trailing token -> set of street names (updated in place).
        street_name: the street name to inspect.
    """
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        if street_type not in expected:
            street_types[street_type].add(street_name)


def is_street_name(elem):
    """Return True when the element's 'k' attribute is "addr:street"."""
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    """Group the unexpected street names of an OSM file by their trailing token.

    Args:
        osmfile: path of the OSM XML file.

    Returns:
        defaultdict(set) mapping trailing token -> set of street names ending with it.
    """
    street_types = dfdict(set)
    # Binary mode lets the XML parser handle the encoding itself; the `with`
    # block closes the file once the scan is done.
    with open(osmfile, "rb") as osm_file:
        # "end" events are required here: on a "start" event the children of
        # the element (its <tag> entries) are not guaranteed to be parsed yet.
        for event, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # The element has been fully inspected; release its content.
                elem.clear()
    return street_types


def update_name(name, mapping):
    """Return name with an abbreviated trailing token replaced by its full word.

    Args:
        name: street name to normalise.
        mapping: dict abbreviated token -> replacement.

    Returns:
        The updated name, or `name` unchanged when there is no trailing token,
        the token is already expected, or the token is not in `mapping`.
    """
    m = street_type_re.search(name)
    if m is None:
        # Nothing that looks like a trailing token (e.g. an empty name).
        return name
    street_type = m.group()
    if street_type not in expected and street_type in mapping:
        # Rewrite only the matched trailing token; str.replace would also
        # change every earlier occurrence of the same characters.
        name = name[:m.start()] + mapping[street_type] + name[m.end():]
    return name

In [86]:
def street_number(file_name):
    """Count the "addr:street" tags attached to nodes and ways.

    Args:
        file_name: path or file object accepted by ET.iterparse.

    Returns:
        The number of matching <tag> children found.
    """
    count = 0

    # "end" events guarantee that the <tag> children of a node/way have been
    # parsed; on "start" events they may still be missing and go uncounted.
    for event, elem in ET.iterparse(file_name, events=("end",)):
        if elem.tag == 'node' or elem.tag == 'way':
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:street":
                    count += 1
            # Done with this element; release its content.
            elem.clear()
    return count

In [92]:
def zip_codes(f):
    """Count the "addr:postcode" tags on nodes and ways and collect their values.

    Args:
        f: path or file object accepted by ET.iterparse.

    Returns:
        A tuple (count, data): the number of postcode tags found and the set
        of distinct postcode values.
    """
    count = 0
    data = set()

    # "end" events guarantee that the <tag> children of a node/way have been
    # parsed; on "start" events they may still be missing and get skipped.
    for event, elem in ET.iterparse(f, events=("end",)):
        if elem.tag == 'node' or elem.tag == 'way':
            for tag in elem.iter('tag'):
                if tag.attrib['k'] == "addr:postcode":
                    count += 1
                    data.add(tag.attrib['v'])
            # Done with this element; release its content.
            elem.clear()

    return count, data


In [33]:
# OSM XML file handed to process_map by the __main__ guard of this cell.
OSM_PATH = file1

# Names of the CSV files written by process_map.
NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

# Keys that start with "<prefix>:<rest>", both made of lower-case letters/underscores.
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# Characters that make a tag key unusable; tags whose key contains one are dropped.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


# Make sure the fields order in the csvs matches the column order in the sql table schema
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def _shape_tags(element, problem_chars, default_tag_type):
    """Build one {'id', 'key', 'value', 'type'} dict per usable <tag> child of element."""
    shaped = []
    for tag in element.iter('tag'):
        key = tag.attrib['k']
        if problem_chars.search(key):
            # Drop the tag completely; nothing is appended for it.
            continue
        if LOWER_COLON.search(key):
            # Split on the FIRST colon only: "addr:street:name" becomes
            # type "addr" and key "street:name".
            tag_type, _, tag_key = key.partition(':')
        else:
            tag_type, tag_key = default_tag_type, key
        shaped.append({
            'id': element.attrib['id'],
            'key': tag_key,
            'value': tag.attrib['v'],
            'type': tag_type,
        })
    return shaped


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Clean and shape node or way XML element to Python dict

    Args:
        element: a parsed XML element.
        node_attr_fields: attribute names copied from a <node>.
        way_attr_fields: attribute names copied from a <way>.
        problem_chars: compiled pattern; tags whose key matches it are skipped.
        default_tag_type: 'type' value given to tags whose key has no
            lower-case "prefix:" part.

    Returns:
        For a node: {'node': {...}, 'node_tags': [...]}.
        For a way: {'way': {...}, 'way_nodes': [...], 'way_tags': [...]}.
        None for any other element.

    Raises:
        KeyError: if a required attribute is missing from the element.
    """
    if element.tag == 'node':
        node_attributes = dict((field, element.attrib[field]) for field in node_attr_fields)
        tags = _shape_tags(element, problem_chars, default_tag_type)
        return {'node': node_attributes, 'node_tags': tags}

    if element.tag == 'way':
        way_attributes = dict((field, element.attrib[field]) for field in way_attr_fields)
        tags = _shape_tags(element, problem_chars, default_tag_type)
        # 'position' is the 0-based index of the <nd> reference within the way.
        way_nodes = [
            {'id': element.attrib['id'], 'node_id': nd.attrib['ref'], 'position': position}
            for position, nd in enumerate(element.iter('nd'))
        ]
        return {'way': way_attributes, 'way_nodes': way_nodes, 'way_tags': tags}

    return None
# ================================================== #
#               Helper Functions                     #
# ================================================== #
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate every completely parsed element whose tag is listed in `tags`.

    Args:
        osm_file: path or file object accepted by ET.iterparse.
        tags: tag names to yield.

    Yields:
        Matching elements, each at its "end" event. After the consumer
        resumes the generator the root element is cleared.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event belongs to the document root; keep it for clearing.
    _, document_root = next(events)
    for kind, current in events:
        if kind != 'end' or current.tag not in tags:
            continue
        yield current
        document_root.clear()


def validate_element(element, validator, schema=None):
    """Raise ValidationError if element does not match schema

    Args:
        element: the shaped element to validate.
        validator: object with a validate(element, schema) method and an
            `errors` mapping.
        schema: schema handed to validator.validate. When omitted, the
            notebook-level name `schema1` is looked up at call time.

    Raises:
        NameError: if no schema is passed and `schema1` is not defined.
        cerberus.ValidationError: if validation does not return True.
    """
    if schema is None:
        # Resolved lazily: a `schema=schema1` default is evaluated when the
        # `def` statement runs and aborts the whole cell with a NameError if
        # schema1 does not exist yet.
        try:
            schema = schema1
        except NameError:
            raise NameError("name 'schema1' is not defined; "
                            "define it or pass schema= explicitly")

    if validator.validate(element, schema) is not True:
        # Report the first field that has errors.
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_strings = (
            "{0}: {1}".format(k, v if isinstance(v, str) else ", ".join(v))
            for k, v in errors.items()
        )
        raise cerberus.ValidationError(
            message_string.format(field, "\n".join(error_strings))
        )


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    @staticmethod
    def _encode(value):
        """Return value in the form the csv module expects on this interpreter."""
        try:
            text_type = unicode  # Python 2: encode unicode objects to UTF-8 bytes
        except NameError:
            # Python 3 has no `unicode` builtin; values are written unchanged.
            return value
        return value.encode('utf-8') if isinstance(value, text_type) else value

    def writerow(self, row):
        """Write one row dict, encoding its values with _encode first."""
        super(UnicodeDictWriter, self).writerow({
            k: self._encode(v) for k, v in row.items()
        })

    def writerows(self, rows):
        """Write each row dict of `rows` through writerow."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Iteratively process each XML element and write to csv(s)

    Args:
        file_in: OSM XML file passed on to get_element.
        validate: when exactly True, every shaped element is checked with
            validate_element before it is written.
    """
    # NOTE(review): this notebook defines process_map more than once; the
    # definition executed last is the one a later call will use.
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
        codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
        codecs.open(WAYS_PATH, 'w') as ways_file, \
        codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
        codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        # One writer per output file, paired with its column list.
        targets = (
            (nodes_file, NODE_FIELDS),
            (nodes_tags_file, NODE_TAGS_FIELDS),
            (ways_file, WAY_FIELDS),
            (way_nodes_file, WAY_NODES_FIELDS),
            (way_tags_file, WAY_TAGS_FIELDS),
        )
        writers = [UnicodeDictWriter(handle, fields) for handle, fields in targets]
        for writer in writers:
            writer.writeheader()
        (nodes_writer, node_tags_writer, ways_writer,
         way_nodes_writer, way_tags_writer) = writers

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            if element.tag == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif element.tag == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])


# Run the CSV export for OSM_PATH when this code is executed as the main module.
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=True)

NameError: name 'schema1' is not defined

In [None]:
# Keys made up only of lower-case letters and underscores.
lower = re.compile(r'^([a-z]|_)*$')
# Two such lower-case parts joined by exactly one ':'.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# Keys containing a character that is troublesome in a key; shape_element
# in this cell skips tags whose key matches.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Attribute names that shape_element in this cell stores under the "created" key.
# NOTE(review): shape_element spells these names out literally and does not
# read this list.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]


def is_address(elem):
    """Return True when the element's 'k' attribute starts with "addr:", else False."""
    # Always return a real boolean (never an implicit None).
    return elem.attrib['k'].startswith("addr:")

def shape_element(element):
    """Turn a node or way element into a nested dict; return None for anything else.

    The result holds "type", "id", optionally "visible" and "pos" ([lat, lon]
    as floats), a "created" sub-dict, one entry per plain tag, an "address"
    sub-dict for "addr:<name>" tags and "node_refs" for <nd> children.
    Tags whose key matches problemchars, and address keys with a further ':'
    after "addr:", are skipped.
    """
    if element.tag not in ("node", "way"):
        return None

    attrs = element.attrib
    shaped = {}
    shaped["type"] = element.tag
    shaped["id"] = attrs["id"]
    if "visible" in attrs:
        shaped["visible"] = attrs["visible"]
    if "lat" in attrs:
        shaped["pos"] = [float(attrs['lat']), float(attrs['lon'])]
    shaped["created"] = {"version": attrs['version'],
                         "changeset": attrs['changeset'],
                         "timestamp": attrs['timestamp'],
                         "uid": attrs['uid'],
                         "user": attrs['user']}

    address = {}
    for tag in element.iter("tag"):
        key = tag.attrib['k']
        if problemchars.search(key):
            continue
        if not is_address(tag):
            # Plain tag: stored directly on the result.
            shaped[key] = tag.attrib['v']
            continue
        sub_key = key[5:]  # part after "addr:"
        if ":" not in sub_key:
            address[sub_key] = tag.attrib['v']
    if address:
        shaped['address'] = address

    refs = [nd.attrib['ref'] for nd in element.iter("nd")]
    if refs:
        shaped['node_refs'] = refs
    return shaped

def process_map(file_in, pretty=False):
    """Shape every node/way of an OSM file and dump them to "<file_in>.json".

    Each shaped element is written as one JSON document followed by a
    newline (indented by 2 when `pretty` is truthy).

    Args:
        file_in: path of the OSM XML file.
        pretty: whether to indent the JSON output.

    Returns:
        The list of shaped elements, in document order.
    """
    out_path = "{0}.json".format(file_in)
    shaped_elements = []
    with codecs.open(out_path, "w") as sink:
        for _event, xml_element in ET.iterparse(file_in):
            shaped = shape_element(xml_element)
            if not shaped:
                continue
            shaped_elements.append(shaped)
            if pretty:
                serialized = json.dumps(shaped, indent=2)
            else:
                serialized = json.dumps(shaped)
            sink.write(serialized + "\n")
    return shaped_elements


### 1. Map Area

##### I chose a map sector of a dynamically developing region.
##### From the website https://mapzen.com/data/metro-extracts/metro/dubai_abu-dhabi/ the files dubai_abu-dhabi.osm, dubai_abu-dhabi_osm_line.geojson, dubai_abu-dhabi_osm_point.geojson, and dubai_abu-dhabi_osm_polygon.geojson were downloaded.
##### This is not a very large dataset to process (394.1 MB), and it is a very interesting subject for research because this territory is constantly changing.
##### For displaying the area I have used the folium package.

In [23]:
# Set up the coordinates of the map center and the zoom option.
map_osm = folium.Map(location=[25.1548, 55.2708], zoom_start=8)
# Add a popup that shows the coordinates of a clicked location.
folium.LatLngPopup().add_to(map_osm)
# Set up the vertices of the map-area border (closed by repeating the first point).
# NOTE(review): the third vertex uses 56.8776 while the fourth uses 56.8766 as
# the second coordinate -- one of them looks like a typo; confirm the intended bound.
points=[[26.5381, 53.5766], [23.7091, 53.5766], [23.7091, 56.8776], [26.5381, 56.8766], [26.5381, 53.5766]]
# Draw the border line with the given colour, weight and opacity.
folium.PolyLine(points, color="red", weight=5, opacity=0.3).add_to(map_osm)
# Display the map.
map_osm

In [None]:
df_polygon.plot()

##### Let's count the tags and the users of the data.

In [16]:
count_tags(file1)

defaultdict(int,
            {'bounds': 1,
             'member': 9783,
             'nd': 2270396,
             'node': 1889204,
             'osm': 1,
             'relation': 2820,
             'tag': 501632,
             'way': 234132})

In [80]:
# Collect the set of user names found in the OSM file and print how many there are.
users1 = process_map_users(file1)
# pprint.pprint(users1)
print len(users1)

1860


### 2. Problems and errors

In [60]:
# Count the tag keys of the OSM file by key category.
# NOTE(review): process_map is defined three times in this notebook (key
# counting, CSV export, JSON export). This call needs the key-counting
# version that takes only a filename, so the result depends on which
# definition cell was executed last.
keys1 = process_map(file1)
pprint.pprint(keys1)

{'lower': 478301, 'lower_colon': 20311, 'other': 3000, 'problemchars': 20}


### 3. Street Names

In [85]:
# Group the unexpected street names of the OSM file by their trailing token
# and show the groups.
st_types1 = audit(file1)
pprint.pprint(dict(st_types1))

# For every audited name, print it next to the name update_name produces
# with the abbreviation mapping.
for st_type, ways in st_types1.iteritems():
        for name in ways:
            better_name = update_name(name, mapping)
            print name, "=>", better_name

{'07': set(['07']),
 '1': set(['20B Street, Safa 1',
           'City Walk, Jumeirah 1',
           'E 1',
           'Hattan Street 1',
           'aljurf ind 1']),
 '10': set(['Street 10', 'ind area 10']),
 '111': set(['P.O.Box 111']),
 '12': set(['District 12', 'Street 12']),
 '12A': set(['12A']),
 '12K': set(['District 12K']),
 '13': set(['Street 13', 'industrial 13', 'street 13\n']),
 '14': set(['11th street, Musaffah M 14',
            'Musaffah Industrial Area Street 14']),
 '147': set(['147']),
 '15': set(['sweet 15']),
 '153': set(['Community 153']),
 '166': set(['166']),
 '17': set(['17']),
 '18': set(['Street 18', 'street 18']),
 '19': set(['19']),
 '19th)': set(["Sa'ada Street (19th)"]),
 '1D': set(['1D']),
 '2': set(['Al Barsha south 2',
           'Al Jaddaf 2',
           'Al Nahda 2',
           'Dubai Investment Park 2',
           'Hattan Street 2',
           'Icad 2',
           'Street 2',
           'dubai investment park 2']),
 '2-A': set(['2-A']),
 '282825': set

In [87]:
print "Number of Street Addresses: %d" % street_number(file1)

Number of Street Addresses: 1764


### 4. Postal Codes

In [93]:
print zip_codes(file1)

(114, set(['00962', '34121', '7819', '108100', 'P.O. Box 5618, Abu Dhabi, U.A.E', '8988', '0', '23117', 'P O BOX 3766', '103711', '549', '38495', 'P.O. Box 4605', 'Muhaisnah 4', '20767', '81730', '2504', '6656', 'PO Box 6770', '25494', 'PO Box 43377', '97717', '24857', '232574', 'P.O. Box 9770', '60884', '44263', '277', '16095', 'P. O. Box 31166', '502227', '2666', '41318', 'P. O. Box 123234', '00971', '128358', '79506', '115443', '500368', '473828', '114692', '232144', '2574', '121641', '1243', '125939', 'PO Box 118737', '57566', '6834', '28818', 'PO Box 114822', '42524', '52799', '2157', '392189', '9978', '4758', '22436', '231992', '46477', '5280 dubai', '811', '28676', '12345', '38126', '113431', '47612', '24976', 'P.O. Box 6446', '111695', '44548', '119417', '1111', '7770', '77947', '4599', '8845', '47602', '1234', '11999', '34238', '20661', '53577', '20268', '9292', '3541', '000000', '000001', '38575', '444786', '263076', '71444', '32923', '26268']))


### 5. Sort cities

### 6. Data Overview

### 7. Conclusion

### 8. Addition

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

# Input OSM file the sample is taken from.
OSM_FILE = "some_osm.osm"  # Replace this with your osm file
# Output file that receives the sampled elements.
SAMPLE_FILE = "sample.osm"

# Sampling step: the writer loop keeps elements whose running index is a multiple of k.
k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Generate every completely parsed element whose tag is listed in `tags`.

    Each match is yielded at its "end" event; once the consumer resumes the
    generator the root element is cleared.

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    events = iter(ET.iterparse(osm_file, events=('start', 'end')))
    # The first event belongs to the document root; keep it for clearing.
    _, document_root = next(events)
    for kind, current in events:
        if kind != 'end' or current.tag not in tags:
            continue
        yield current
        document_root.clear()


# Write a reduced copy of OSM_FILE that keeps every k-th top level element.
# The file is opened in binary mode, so everything written must be bytes:
# byte literals behave the same on Python 2 and also work on Python 3, and
# ET.tostring(..., encoding='utf-8') already returns bytes.
with open(SAMPLE_FILE, 'wb') as output:
    output.write(b'<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write(b'<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write(b'</osm>')