In [1]:
import sqlite3 as sq3
import re
from collections import defaultdict

In [2]:
DBNAME = "Gold_Country_OSM2.db"

# Selector values for the `fetch` argument of run_sql.
FETCH_ONE = 0
FETCH_ALL = 1

# Tag keys treated as "primary physical feature" keys throughout this audit.
PHYSICAL_FEATURE_TAGS = [
    'shop', 'amenity', 'waterway', 'power', 'aerialway', 'man_made',
    'boundary', 'highway', 'barrier', 'leisure', 'historic', 'tourism',
    'building', 'traffic_calming', 'natural', 'aeroway', 'place',
    'railway', 'landuse', 'city', 'emergency', 'manhole', 'office',
]

## Render PHYSICAL_FEATURE_TAGS as a comma-separated run of single-quoted
## literals ('shop','amenity',...) for splicing into SQL "key IN (...)" clauses.
pfs = ",".join("'" + tag + "'" for tag in PHYSICAL_FEATURE_TAGS)

In [3]:
## function to run one SQLite query and return either one row, or all of the result set
def run_sql(SQL, dbname=DBNAME, fetch=FETCH_ALL, params = None):
    """Execute a single SQL statement against an SQLite database file.

    Parameters:
        SQL    -- the statement text; may contain "?" placeholders.
        dbname -- path of the SQLite database file (defaults to DBNAME).
        fetch  -- FETCH_ALL returns cur.fetchall() (a list of row tuples);
                  FETCH_ONE returns cur.fetchone() (one row tuple, or None
                  when the result set is empty).  Any other value returns None.
        params -- optional values bound to the placeholders in SQL.

    The statement is committed on success and rolled back if it raises; the
    connection is always closed before returning.
    """
    conn = sq3.connect(dbname)
    try:
        # "with conn" only manages the transaction (commit / rollback); it does
        # not close the connection, hence the surrounding try/finally.
        with conn:
            cur = conn.cursor()
            if params is None:
                cur.execute(SQL)
            else:
                cur.execute(SQL, params)
            if fetch == FETCH_ALL:
                return cur.fetchall()
            elif fetch == FETCH_ONE:
                return cur.fetchone()
            # unrecognised fetch selector: nothing is fetched
            return None
    finally:
        conn.close()

In [4]:
# Count the number of nodes, ways, and relations in the database.

for table_name in ["Nodes","Ways","Relations"]:
    data = run_sql("SELECT COUNT(*) FROM " + table_name, fetch=FETCH_ONE)
    print data[0], "elements in the", table_name, "table."
    
## Also count nodes which have one or more tags:
tag_nodes = run_sql("SELECT COUNT(DISTINCT Nodes.id) from Nodes JOIN Node_tags \
                on Nodes.id = Node_tags.id WHERE Node_tags.id IS NOT NULL", fetch=FETCH_ONE)
print tag_nodes[0], "of the nodes have one or more tags."

622875 elements in the Nodes table.
42629 elements in the Ways table.
174 elements in the Relations table.
12521 of the nodes have one or more tags.


In [5]:
## One problem identified in initial audit was ways tagged as highway=residential which
## were not from the TIGER upload and had no name tag.  On visual inspection these appeared
## to be real roads in residential neighborhoods which did show names when I looked at the
## area on Google Maps.

## Select ways tagged as 'highway=residential', not from TIGER upload, no name=tag:
data = run_sql("SELECT DISTINCT Ways.id FROM Ways JOIN Way_tags on ways.id = way_tags.id \
    WHERE way_tags.key = 'highway' and way_tags.value = 'residential' \
    AND Ways.id NOT IN (SELECT DISTINCT id FROM Way_Tags \
    WHERE Way_tags.type = 'tiger') \
    AND Ways.id NOT IN (SELECT DISTINCT id from Way_tags WHERE \
    Way_tags.key = 'name' and Way_tags.type='regular')", \
               fetch=FETCH_ALL)

print len(data)

303


This is the same count I got when I did this programmatically.  One could pull these ways and their associated nodes, pass the latitude and longitude to the Google Maps API to see what comes back as a street name, and use that to update the name in the OSM database.  

In [6]:
## Find nodes without tags and make sure they are part of a way or relation
## These are "orphan" nodes which should be deleted.
SQL = "SELECT id from NODES WHERE (SELECT COUNT(*) FROM Node_Tags \
    WHERE Nodes.id = Node_Tags.id) = 0 AND \
    Nodes.id NOT IN (SELECT DISTINCT Way_nodes.node_id from Way_Nodes) AND \
    Nodes.id NOT IN (SELECT member_id from Relation_members where relation_members.type = 'node');"
orphan_nodes = run_sql(SQL, fetch = FETCH_ALL)
print len(orphan_nodes)

## Verified in separate queries that there are no ways or relations with these node id's, and
## that these nodes have no tags.

44


In [7]:
## Delete Nodes which are not part of a way or relation
SQL = "DELETE FROM Nodes WHERE id IN \
    (SELECT id from NODES WHERE (SELECT COUNT(*) FROM Node_Tags \
    WHERE Nodes.id = Node_Tags.id) = 0 AND \
    Nodes.id NOT IN (SELECT DISTINCT Way_nodes.node_id from Way_Nodes) AND \
    Nodes.id NOT IN (SELECT member_id from Relation_members where relation_members.type = 'node'))"
conn = sq3.connect(DBNAME)
cur = conn.cursor()
cur.execute(SQL)
print cur.rowcount, "orphan nodes deleted."
conn.commit()
conn.close()

44 orphan nodes deleted.


In [8]:
## Repeat SELECT query
SQL = "SELECT COUNT(*) FROM Nodes WHERE (SELECT COUNT(*) FROM Node_Tags \
    WHERE Nodes.id = Node_Tags.id) = 0 AND \
    Nodes.id NOT IN (SELECT DISTINCT Way_nodes.node_id from Way_Nodes) AND \
    Nodes.id NOT IN (SELECT member_id from Relation_members where relation_members.type = 'node');"
orphan_node_ct = run_sql(SQL, fetch = FETCH_ONE)
print orphan_node_ct[0], "orphan nodes found."


0 orphan nodes found.


In [9]:
## Check for ways with no tags which are not part of relations
## These are "orphan" ways which should also be deleted
SQL = "SELECT id from ways WHERE (SELECT COUNT(*) FROM Way_Tags WHERE \
    Ways.id = Way_tags.id) = 0 AND \
    Ways.id NOT IN (SELECT DISTINCT member_id from Relation_members WHERE \
    Relation_members.type = 'way');"
data = run_sql(SQL, fetch = FETCH_ALL)
print len(data), "orphan ways found in database."

25 orphan ways found in database.


In [10]:
## Delete these ways then check again for orphan nodes
SQL = "DELETE FROM Ways WHERE id IN \
    (SELECT id from ways WHERE (SELECT COUNT(*) FROM Way_Tags WHERE \
    Ways.id = Way_tags.id) = 0 AND \
    Ways.id NOT IN (SELECT DISTINCT member_id from Relation_members WHERE \
    Relation_members.type = 'way'));"
conn = sq3.connect(DBNAME)
cur = conn.cursor()
cur.execute(SQL)
print cur.rowcount, "orphan ways deleted from database."
conn.commit()
conn.close()


25 orphan ways deleted from database.


In [11]:
# Repeat query to confirm DELETE worked as expected
SQL = "SELECT COUNT(*) from ways WHERE (SELECT COUNT(*) FROM Way_Tags WHERE \
    Ways.id = Way_tags.id) = 0 AND \
    Ways.id NOT IN (SELECT DISTINCT member_id from Relation_members WHERE \
    Relation_members.type = 'way');"
orphan_way_ct = run_sql(SQL, fetch = FETCH_ONE)
print orphan_way_ct[0], "oprhan ways found in repeat query."

0 oprhan ways found in repeat query.


In [12]:
## Finally check again for orphan nodes:
SQL = "SELECT COUNT(*) from NODES WHERE (SELECT COUNT(*) FROM Node_Tags \
    WHERE Nodes.id = Node_Tags.id) = 0 AND \
    Nodes.id NOT IN (SELECT DISTINCT Way_nodes.node_id from Way_Nodes) AND \
    Nodes.id NOT IN (SELECT member_id from Relation_members where relation_members.type = 'node');"
orphan_node_ct = run_sql(SQL, fetch = FETCH_ONE)
print orphan_node_ct[0], "orphan nodes found after deleting orphan ways."
## and we didn't create any new orphans by getting rid of the orphan ways

0 orphan nodes found after deleting orphan ways.


In [13]:
## Counting open and closed ways. The definition in this query is the same as the one in
## the OSM Wiki, which is first node = last node for closed way, otherwise open.  Note there
## are ways which share nodes which may not be first,last and are therefore counted as "open".

SQL = "SELECT COUNT(*) from ways WHERE (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position Limit 1) = \
    (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position DESC LIMIT 1) ;"
way_ct = run_sql(SQL, fetch = FETCH_ONE)
print "There are now", way_ct[0], "closed ways in the database."

SQL = "SELECT COUNT(*) from ways WHERE (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position Limit 1) <> \
    (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position DESC LIMIT 1) ;"
way_ct = run_sql(SQL, fetch = FETCH_ONE)
print "There are now", way_ct[0], "open ways in the database."
## This numbers are lower than when counted during initial audit as orphan ways have been deleted.

There are now 17071 closed ways in the database.
There are now 25533 open ways in the database.


In [14]:
## Count the times each primary physical feature tag is used in the open and closed way groups
total_ways= 0
print "Closed Ways:"
for ft in PHYSICAL_FEATURE_TAGS:
    SQL = "SELECT COUNT(DISTINCT ways.id) FROM Ways JOIN Way_Tags on ways.id = way_tags.id WHERE \
    (Way_tags.key='"+ft+"') AND ((SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position Limit 1) = \
    (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position DESC LIMIT 1))"
    data = run_sql(SQL, fetch=FETCH_ONE)
    print ft, ':', data[0]
    total_ways += data[0]
print "total:", total_ways
print
total_ways = 0
print "Open Ways:"
for ft in PHYSICAL_FEATURE_TAGS:
    SQL = "SELECT COUNT(DISTINCT ways.id) FROM Ways JOIN Way_Tags on ways.id = way_tags.id WHERE \
    Way_tags.key='"+ft+"' AND Ways.id IN (SELECT id from ways WHERE \
    (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position Limit 1) <> \
    (SELECT node_id FROM Way_Nodes WHERE \
    Ways.id = Way_Nodes.id ORDER BY way_nodes.position DESC LIMIT 1));"
    data = run_sql(SQL, fetch=FETCH_ONE)
    print ft, ':', data[0]
    total_ways += data[0]
print "total:", total_ways

Closed Ways:
shop : 29
amenity : 499
waterway : 21
power : 42
aerialway : 0
man_made : 29
boundary : 48
highway : 3850
barrier : 44
leisure : 1462
historic : 9
tourism : 15
building : 7927
traffic_calming : 0
natural : 1040
aeroway : 12
place : 31
railway : 3
landuse : 7480
city : 87
emergency : 3
manhole : 0
office : 1
total: 22632

Open Ways:
shop : 0
amenity : 0
waterway : 1406
power : 242
aerialway : 1
man_made : 82
boundary : 8
highway : 23297
barrier : 106
leisure : 5
historic : 1
tourism : 1
building : 2
traffic_calming : 0
natural : 2
aeroway : 49
place : 0
railway : 155
landuse : 0
city : 4
emergency : 8
manhole : 0
office : 0
total: 25369


Interestingly, the count of primary physical feature tags in the closed way group far exceeds the number of ways. It turns out these numbers are correct and several thousand closed ways are tagged with 2 primary physical feature tags.  

There is one group of > 2000 closed ways which are tagged building=terrace and landuse=residential.  On examining several of these visually (using Google Maps satellite view), they all map an area contiguous with a private residence which appears to be a patio or pool deck.  According to the OSM Wiki the building=terrace tag should be used for a group of joined residences like row houses. The landuse=residential tag is supposed to be for mapping areas of land on which residential buildings are sited.  Both of these tags are used in this group of ways incorrectly, at least according to the documentation in the OSM Wiki. It is not clear to me from the documentation what the correct tagging should be for these elements.  I've decided to tag them man_made=patio, which is consistent with tagging other structures which are not buildings.  I'll also add a tag 'error_corrected=yes', which I can use in later analysis of the error rate in this database.

The other large group (about 3600) of double-tagged ways appear to be residential driveways in subdivisions, which are tagged highway=service and landuse=residential.  The highway=service tag is appropriate for these elements, but the landuse tag is not.  A service=driveway tag can be added to clarify what these are.  I discussed this on the OSM help forum, and it was suggested that adding area=yes would help to clarify it further.  Therefore I will add the service=driveway and area=yes tags to these elements, and delete the landuse=residential tag. I will also add the error_corrected=yes tag.    

In [15]:
## Select the ways with the double tag 'highway=service' and 'landuse = residential'
SQL = "SELECT ways.id FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE \
     way_tags.key = 'highway' AND way_tags.value = 'service' INTERSECT \
     SELECT DISTINCT id FROM way_tags WHERE \
     way_tags.key = 'landuse' AND way_tags.value = 'residential');"
## Ways defined this way are small residential driveways
smdways = run_sql(SQL, fetch=FETCH_ALL)

print "Found", len(smdways), "inappropriately tagged small residential driveways."


## Each of these way_ids should have a row inserted into the way_tags table with the way id,
## key=service, value=driveway, type=regular, and another with key=area value=yes type=regular
## Also add error_corrected=yes for my further analysis later.
## The key=landuse tag should be deleted.
with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    for smdway in smdways:
        way_id = smdway[0]
        SQL = "INSERT INTO Way_Tags (id, key, value, type) \
            VALUES (?,?,?,?);"
        cur.execute(SQL,(way_id,'service','driveway','regular'))
        SQL = "INSERT INTO Way_Tags (id, key, value, type) \
            VALUES (?,?,?,?);"
        cur.execute(SQL, (way_id,'area','yes','regular'))
        SQL = "INSERT INTO Way_Tags (id, key, value, type) \
            VALUES(?,?,?,?);"
        cur.execute(SQL, (way_id, 'error_corrected','yes','regular'))
        SQL = "DELETE FROM Way_Tags WHERE id = ? AND key = 'landuse';"
        cur.execute(SQL,(way_id,))

print 'Update complete.'


Found 3542 inappropriately tagged small residential driveways.
Update complete.


In [16]:
## Confirm updates have been done correctly
SQL = "SELECT COUNT(*) FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE \
     way_tags.key = 'highway' AND way_tags.value = 'service' INTERSECT \
     SELECT DISTINCT id from way_tags WHERE \
     way_tags.key = 'landuse' AND way_tags.value = 'residential');"
count = run_sql(SQL, fetch=FETCH_ONE)
print count[0], "inappropriately tagged residential driveways found."

SQL = "SELECT COUNT(*) FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE \
    way_tags.key = 'highway' and way_tags.value = 'service' INTERSECT \
    SELECT DISTINCT id FROM Way_Tags WHERE \
    way_tags.key = 'service' and way_tags.value = 'driveway' INTERSECT \
    SELECT DISTINCT id FROM Way_Tags WHERE \
    way_tags.key = 'area' and way_tags.value = 'yes' INTERSECT \
    SELECT DISTINCT id FROM Way_Tags WHERE \
    way_tags.key = 'error_corrected' AND way_tags.value = 'yes');"
count = run_sql(SQL, fetch=FETCH_ONE)
print count[0], "fixed residential driveways found."

0 inappropriately tagged residential driveways found.
3542 fixed residential driveways found.


In [17]:
## Ways tagged with building=terrace and landuse=residential appear to be backyard patios,
## verandas, pool decks, etc.  
SQL = "SELECT id FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE key = 'building' AND value = 'terrace' \
     INTERSECT SELECT DISTINCT id FROM way_tags WHERE key = 'landuse' AND \
     value = 'residential');"
patios = run_sql(SQL, fetch=FETCH_ALL)

## Tag these with man_made=patio for now, as this is most consistent with documentation I could
## find.  Add the error_corrected tag as well.
FIXME_NOTE = 'Should probably tag as man_made=patio, or similar.'
with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    for patio in patios:
        SQL = "INSERT INTO Way_tags (id,key,value,type) \
            VALUES(?,?,?,?);"
        cur.execute(SQL, (patio[0],'man_made','patio','regular'))
        SQL = "INSERT INTO Way_tags (id,key,value,type) \
            VALUES (?,?,?,?);"
        cur.execute(SQL, (patio[0],'error_corrected','yes','regular'))
        SQL = "DELETE FROM Way_Tags WHERE id = ? AND key = 'landuse';"
        cur.execute(SQL, (patio[0],))
        SQL = "DELETE FROM Way_Tags WHERE id =? AND key = 'building'"
        cur.execute(SQL, (patio[0],))
        
print len(patios), "inappropriately tagged backyard patios fixed."

2070 inappropriately tagged backyard patios fixed.


In [18]:
## Verify insert query 
SQL = "SELECT COUNT(*) FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM way_tags WHERE key = 'man_made' AND value = 'patio' INTERSECT \
     SELECT DISTINCT id from way_tags WHERE key = 'error_corrected' AND value = 'yes');"
fixme_patios = run_sql(SQL, fetch=FETCH_ONE)
print fixme_patios[0], 'backyard patios re-tagged.'

SQL = "SELECT COUNT (*) FROM Ways WHERE ways.id IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE key = 'building' AND value = 'terrace' \
     INTERSECT SELECT DISTINCT id FROM way_tags WHERE key = 'landuse' AND \
     value = 'residential');"
bad_patios = run_sql(SQL, fetch=FETCH_ONE)
print bad_patios[0], 'inappropriately-tagged patios now in database.'

2070 backyard patios re-tagged.
0 inappropriately-tagged patios now in database.


In [19]:
## In auditing addresses the default style in the dataset is to have the address number
## separate from the street name.  There are a few addresses where the number is part of the
## addr:street tag.  Find those and fix them.

# Define custom function to use in SQL query to select bad street address formatting:

# A leading house number: one or more digits, optionally followed by a single
# unit letter ("12B"), and then whitespace (checked with a lookahead so the
# match itself contains only the number).  Requiring the whitespace means
# single-digit numbers ("5 Main St") are recognised, while ordinal street
# names ("1st Street", "49er Road") and bare numbers are not.
re_number_first = re.compile(r'^\d+[A-Za-z]?(?=\s)')

def number_first(address):
    """Return 1 if `address` starts with a house number, else 0.

    Written to be registered as an SQLite user function, hence the integer
    result.  A None argument (SQL NULL) yields 0 instead of raising.
    """
    if address is None:
        return 0
    if re_number_first.match(address):
        return 1
    return 0
# For each tag table: find addr:street values flagged by number_first, split the
# leading number off into addr:housenumber, mark the element as corrected, then
# display the result and re-run the selection.
for Table in ["Node_Tags", "Way_Tags"]:
    # Select the (id, value) rows of this table whose addr:street value is
    # flagged by the number_first SQL function
    with sq3.connect(DBNAME) as conn:
        # user functions are per-connection, so register it on this connection
        conn.create_function("number_first", 1, number_first)
        cur = conn.cursor()
        cur.execute("SELECT id, value FROM "+Table+" WHERE key='street' \
            AND type = 'addr' AND number_first(value) = 1")
        bad_adds = cur.fetchall()
    # print the bad addresses
    print bad_adds


    # Apply the fix; the "with" block commits the changes when it exits cleanly
    with sq3.connect(DBNAME) as conn:
        cur = conn.cursor()
        for bad_add in bad_adds:
            # bad_add is (id, street value)
            m = re_number_first.match(bad_add[1])
            if m:
                # the matched prefix is the number; the rest is the street name
                st_num = m.group(0).strip()
                st_name = re_number_first.sub('',bad_add[1]).strip()
                # overwrite addr:street with the street name only
                cur.execute("UPDATE " +Table+" SET value = ? WHERE id = ? AND key = ? AND type = ?", \
                       (st_name, bad_add[0],'street','addr'))
                print cur.rowcount
                # check to see if there is already a housenumber row
                cur.execute("SELECT COUNT(*) FROM "+Table+" WHERE id=? AND key='housenumber' AND \
                        type = 'addr';", (bad_add[0],))
                has_number = cur.fetchone()
                # if not create one with the number extracted from the address string
                # (when one already exists, the extracted number is not stored)
                if has_number[0] == 0:
                    cur.execute("INSERT INTO "+Table+" (id,key,value,type) \
                        VALUES (?,?,?,?)",(bad_add[0],'housenumber',st_num,'addr'))
                # add tag to indicate a record where an error was corrected
                cur.execute("INSERT INTO "+Table+" (id,key,value,type) \
                    VALUES (?,'error_corrected','yes','regular')", (bad_add[0],))

    ## Verify the fix
    for bad_add in bad_adds:
        # get all the tags for the id and display them.
        SQL = "SELECT key, value from "+Table+" WHERE id=?;"
        fixed_add = run_sql(SQL, params=(bad_add[0],), fetch=FETCH_ALL)
        # original (id, value) pair, followed by the element's tags after the fix
        print bad_add
        print fixed_add

    with sq3.connect(DBNAME) as conn:
        # repeat the original query - should be empty list returned
        conn.create_function("number_first", 1, number_first)
        cur = conn.cursor()
        cur.execute("SELECT id, value FROM "+Table+" WHERE key='street' \
            AND type = 'addr' AND number_first(value) = 1")
        bad_adds = cur.fetchall()

    print bad_adds


[(1211580027, u'565 Brunswick Road'), (1234381058, u'10057 Gold Flat Road'), (1961968435, u'2885 Bell Road')]
1
1
1
(1211580027, u'565 Brunswick Road')
[(u'city', u'Grass Valley'), (u'country', u'US'), (u'error_corrected', u'yes'), (u'full', u'565 Brunswick Road'), (u'housenumber', u'565'), (u'state', u'CA'), (u'street', u'Brunswick Road')]
(1234381058, u'10057 Gold Flat Road')
[(u'attribution', u'Caltrans'), (u'county', u'Nevada'), (u'description', u'District 3 - Sutter/Sierra Reg'), (u'district', u'3'), (u'dynsegpm', u'NEV 20 R15.9'), (u'error_corrected', u'yes'), (u'housenumber', u'10057'), (u'landuse', u'industrial'), (u'name', u'Caltrans Sutter/Sierra Region Office Maintenance Station'), (u'phone', u'530-265-4290'), (u'postcode', u'95959'), (u'route', u'20'), (u'source', u'http://www.dot.ca.gov/hq/tsip/gis/datalibrary/gisdatalibrary.html, bing'), (u'street', u'Gold Flat Road'), (u'type', u'RMH,AS,HMS,LSMS,S/SS')]
(1961968435, u'2885 Bell Road')
[(u'amenity', u'bank'), (u'city', u'

In [20]:
## The tag addr:street is used in the ways, but not nodes, in a manner inconsistent with OSM
## documentation.  Most often used to describe an intersection ("I-80 & SR-49", for example).
## This type of address really should be in the tag addr:full=.  By selecting for the "@" or "&" 
## character in the addr:street name, a majority can be identified and converted to the correct
## tag.  The others can then be brought up for inspection and identified as y/n answer whether
## to reclassify.
re_inter = re.compile(r'[@&]')
def is_intersect(name):
    """Return 1 if `name` contains an "@" or "&" character, else 0.

    Written to be registered as an SQLite user function, hence the integer
    result.  A None argument (SQL NULL) yields 0 instead of raising.
    """
    if name is None:
        return 0
    if re_inter.search(name):
        return 1
    return 0

# This query selects those addr:street way tags which we know are intersections
SQL = "SELECT id, value FROM Way_Tags WHERE type = 'addr' AND \
    key='street' and is_intersect(value) = 1;"
with sq3.connect(DBNAME) as conn:
    conn.create_function("is_intersect",1,is_intersect)
    cur = conn.cursor()
    cur.execute(SQL)
    add_ints = cur.fetchall()
    # insert a new row with an addr:full tag for each intersection description
    SQL1 = "INSERT INTO Way_Tags (id,key,value,type) \
            VALUES (?,?,?,?)"
    # insert the 'error_corrected' tag also
    SQL2 = "INSERT INTO Way_Tags (id, key, value, type) \
            VALUES(?, 'error_corrected','yes','regular')"
    for add_int in add_ints:
        cur.execute(SQL1,(add_int[0],'full',add_int[1],'addr'))
        cur.execute(SQL2, (add_int[0],))
    print "INSERT query complete."
    
# Verify INSERT query:
SQL = "SELECT id, value FROM Way_Tags WHERE id = ? AND type = 'addr' AND key = 'full'"
for add_int in add_ints:
    print "addr:street tag:",add_int
    data = run_sql(SQL, params = (add_int[0],),fetch=FETCH_ALL)
    print "addr:full tag:", data


# Delete the addr:street tags:
SQL = "DELETE FROM Way_Tags WHERE type = 'addr' AND \
    key='street' and is_intersect(value) = 1;"
with sq3.connect(DBNAME) as conn:
    conn.create_function("is_intersect",1,is_intersect)
    cur = conn.cursor()
    cur.execute(SQL)
    print cur.rowcount, "rows deleted."

print "DELETE query complete."

# Verify the DELETE query
SQL = "SELECT COUNT(*) FROM Way_Tags WHERE type = 'addr' AND \
    key='street' and is_intersect(value) = 1;"
with sq3.connect(DBNAME) as conn:
    conn.create_function("is_intersect",1,is_intersect)
    cur = conn.cursor()
    cur.execute(SQL)
    data = cur.fetchone()
    print data[0], "rows match the repeated SELECT query."    

INSERT query complete.
addr:street tag: (107277657, u'Missouri Flat Rd & Mother Lode Drive Int Se Side')
addr:full tag: [(107277657, u'Missouri Flat Rd & Mother Lode Drive Int Se Side')]
addr:street tag: (107277669, u'@ Grassy Run Int')
addr:full tag: [(107277669, u'@ Grassy Run Int')]
addr:street tag: (107277675, u'S Single Rd & Durock Rd Int')
addr:full tag: [(107277675, u'S Single Rd & Durock Rd Int')]
addr:street tag: (107277690, u'Camino Heights Dr & Sierra Blanca Dr')
addr:full tag: [(107277690, u'Camino Heights Dr & Sierra Blanca Dr')]
addr:street tag: (107277694, u'Chaparrel & Shingle Road Ne Side')
addr:full tag: [(107277694, u'Chaparrel & Shingle Road Ne Side')]
addr:street tag: (107277700, u'Ponderosa Rd & Wild Chapparal Drive')
addr:full tag: [(107277700, u'Ponderosa Rd & Wild Chapparal Drive')]
addr:street tag: (107397908, u'Int Bell Rd & Bowman Rd Nw Side')
addr:full tag: [(107397908, u'Int Bell Rd & Bowman Rd Nw Side')]
addr:street tag: (107397914, u'Sr 193 @ Lincoln / O

In [21]:
## Now there are still ways which have a addr:street tag which should be addr:full.  Select
## all ways which have the addr:street tag but not addr:housenumber.  Ask user to choose which
## need to be reclassified as addr:full
SQL = "SELECT id, value from Way_Tags WHERE key='street' AND id IN \
    (SELECT id from Way_Tags WHERE key='street' and type='addr') and id NOT IN \
    (SELECT id from Way_Tags WHERE key='housenumber' and type='addr');"
data = run_sql(SQL, fetch=FETCH_ALL)
with sq3.connect(DBNAME) as conn:
    SQL1 = "INSERT INTO Way_Tags (id, key, value, type) VALUES (?,?,?,?)"
    SQL2 = "DELETE FROM Way_Tags WHERE id = ? AND key = 'street' AND type = 'addr';"
    SQL3 = "INSERT INTO Way_Tags(id,key,value,type) VALUES (?,'error_corrected','yes','regular')"
    cur = conn.cursor()
    for datum in data:
        ans=''
        while ans not in ['y','n']:
            print datum[1]
            ans=raw_input("Reclassify as addr:full?")
            if ans == 'y':
                print "Reclassifying:", datum[1]
                cur.execute(SQL1, (datum[0],'full',datum[1],'addr'))
                cur.execute(SQL2, (datum[0],))
                cur.execute(SQL3, (datum[0],))


Horseshoe Bar Rd Int S Side
Reclassify as addr:full?y
Reclassifying: Horseshoe Bar Rd Int S Side
Penryn Rd Int Nw Of I-80 On Frontage Road (Boyington Road)
Reclassify as addr:full?y
Reclassifying: Penryn Rd Int Nw Of I-80 On Frontage Road (Boyington Road)
Ne Side Merrychase Drive Int
Reclassify as addr:full?y
Reclassifying: Ne Side Merrychase Drive Int
Nw Cnr Shingle Springs Dr Int
Reclassify as addr:full?y
Reclassifying: Nw Cnr Shingle Springs Dr Int
Hwy 174 PM 0.6
Reclassify as addr:full?y
Reclassifying: Hwy 174 PM 0.6
N Side I-80
Reclassify as addr:full?y
Reclassifying: N Side I-80
Sr 20 / Sr 174 Ic Lt Under Sr 20 Structure
Reclassify as addr:full?y
Reclassifying: Sr 20 / Sr 174 Ic Lt Under Sr 20 Structure
Across Street From Factory Outlet Stores
Reclassify as addr:full?y
Reclassifying: Across Street From Factory Outlet Stores
E Side Newcastle Rd Int
Reclassify as addr:full?y
Reclassifying: E Side Newcastle Rd Int
I-80 4 Mi East of Baxter
Reclassify as addr:full?y
Reclassifying: I-8

In [22]:
## Check the queries worked as expected
SQL = "SELECT COUNT(*) FROM Way_Tags where key = 'street' and type = 'addr' AND \
    id NOT IN (SELECT id from Way_Tags where key = 'housenumber' and type ='addr')"
data = run_sql(SQL, fetch=FETCH_ONE)
print data[0]
SQL =  "SELECT DISTINCT id, value from way_tags where key = 'full' and type = 'addr'"
data = run_sql(SQL, fetch=FETCH_ALL)
for datum in data:
    print datum

11
(28845963, u'Horseshoe Bar Rd Int S Side')
(28865741, u'Penryn Rd Int Nw Of I-80 On Frontage Road (Boyington Road)')
(107277653, u'Ne Side Merrychase Drive Int')
(107277657, u'Missouri Flat Rd & Mother Lode Drive Int Se Side')
(107277669, u'@ Grassy Run Int')
(107277675, u'S Single Rd & Durock Rd Int')
(107277690, u'Camino Heights Dr & Sierra Blanca Dr')
(107277694, u'Chaparrel & Shingle Road Ne Side')
(107277700, u'Ponderosa Rd & Wild Chapparal Drive')
(107277704, u'Nw Cnr Shingle Springs Dr Int')
(107397900, u'Hwy 174 PM 0.6')
(107397902, u'N Side I-80')
(107397905, u'Sr 20 / Sr 174 Ic Lt Under Sr 20 Structure')
(107397907, u'Across Street From Factory Outlet Stores')
(107397908, u'Int Bell Rd & Bowman Rd Nw Side')
(107397909, u'E Side Newcastle Rd Int')
(107397913, u'I-80 4 Mi East of Baxter')
(107397914, u'Sr 193 @ Lincoln / Ophir I/C')
(107397915, u'Indian Hills Road & Newcastle Rd Int')
(107397916, u'S Of Us 50, E Side Of Latrobe')
(107397928, u'Hwy 80 PM 42.1')
(107397938, u'

In [24]:
## Finally fix, where possible, elements which do not have a primary physical feature tag.
## This code used interatively to list tagged ways without physical feature tag.  Where an
## appropriate physical feature tag can be added programmatically, this is done.  This code
## Used iteratively to find the type of ways which could be reclassified and add them to the 
## if .. elif block.

SQL = "SELECT DISTINCT ways.id from Ways WHERE \
    (SELECT COUNT(*) FROM Way_tags WHERE way_tags.id = ways.id) > 0 AND Ways.id NOT IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE key IN (" + pfs + "))"
data = run_sql(SQL)
print len(data)
with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    for i, datum in enumerate(data):
        print datum[0]
        #if i>25: break
        SQL1 = "SELECT key, value from Way_Tags where id = ?"
        cur.execute(SQL1,(datum[0],))
        tags = cur.fetchall()
        print tags
        SQL2 = "INSERT INTO way_tags (id,key,value,type) \
                VALUES(?,?,?,?)"
        SQL3 = "INSERT INTO way_tags (id,key,value,type) \
            VALUES (?,'error_corrected','yes','regular')"
        for tag in tags:
            if tag[0] == 'sport' and tag[1] == 'tennis':
                print "inserting leisure=pitch tag"
                cur.execute(SQL2,(datum[0],'leisure','pitch','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
            elif tag[0] == 'sport' and tag[1] == 'baseball':
                print "inserting leisure = pitch tag"
                cur.execute(SQL2,(datum[0],'leisure','pitch','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
            elif tag[0] == 'attribution' and tag[1] =='Farmland Mapping and Monitoring Program':
                print "inserting landuse=other tag"
                cur.execute(SQL2, (datum[0],'landuse','other','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
            elif tag[0] == 'sport' and tag[1] == 'volleyball':
                print "inserting leisure=pitch tag"
                cur.execute(SQL2, (datum[0],'leisure','pitch','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
            elif tag[0] == 'golf':
                print "inserting leisure=golf_course tag"
                cur.execute(SQL2, (datum[0],'leisure','golf_course','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
            elif tag[0] == 'service' and tag[1] == 'parking_aisle':
                print "inserting highway=service tag"
                cur.execute(SQL2, (datum[0], 'highway','service','regular'))
                cur.execute(SQL3,(datum[0],))
                print cur.rowcount, "rows inserted"
                

34
165786334
[(u'com_id', u'63430298'), (u'fcode', u'39009'), (u'ftype', u'390'), (u'reach_code', u'18020129002651'), (u'source', u'NHD')]
165786868
[(u'com_id', u'63434950'), (u'fcode', u'39009'), (u'ftype', u'390'), (u'reach_code', u'18020129001096'), (u'source', u'NHD')]
205519695
[(u'public_transport', u'platform')]
217681639
[(u'note', u'these alleged ways are not accessible to the public nor visible on Bing imagery')]
218147876
[(u'area', u'yes')]
221259890
[(u'area', u'yes'), (u'name', u'FM5 cafe patio')]
221259891
[(u'area', u'yes'), (u'name', u'FM7 cafe patio')]
221259892
[(u'area', u'yes'), (u'name', u'FM1 cafe patio')]
232279475
[(u'access', u'private'), (u'service', u'driveway')]
232279490
[(u'access', u'private')]
237626595
[(u'name', u'China Wall Staging Area'), (u'operator', u'Forest Service')]
247958150
[(u'access', u'private')]
260425766
[(u'area', u'yes'), (u'parking', u'surface')]
260425767
[(u'area', u'yes'), (u'parking', u'surface')]
260425768
[(u'area', u'yes'), (

In [16]:
# Repeat the process for nodes.  Select tagged nodes with no primary physical feature tag, and
# which are not part of a way.  It is OK for a node to be used to mark something on a way, 
# such as a 'fixme' spot, or a dead end on a highway, building entrance, and not have its own
# primary feature tag so assume we don't need to look at those nodes.
# Code is used iteratively to list the nodes, idenitfy and fix those
# where a physical feature tag can be added programmatically, without ambiguity.

# Select every node that has at least one tag, carries none of the primary
# physical feature keys listed in pfs, and is not referenced by any way.
SQL = "SELECT DISTINCT nodes.id from Nodes WHERE \
    (SELECT COUNT(*) FROM Node_tags WHERE Node_tags.id = nodes.id) > 0 \
    AND Nodes.id NOT IN \
    (SELECT DISTINCT id FROM Node_Tags WHERE key IN (" + pfs + ")) AND Nodes.id NOT IN \
    (SELECT DISTINCT Node_id from Way_Nodes)"
data = run_sql(SQL)
print(len(data))

# The statements below do not change per node, so build them once up front.
SQL1 = "SELECT key, value from Node_Tags where id = ?"
SQL2 = "INSERT INTO Node_tags (id,key,value,type) \
        VALUES(?,?,?,?)"
SQL3 = "INSERT INTO Node_tags (id, key, value, type) \
        VALUES(?, 'error_corrected', 'yes', 'regular')"

# Each rule is (existing key, required existing value or None for "any value",
#               feature key to add, feature value to add).
NODE_TAG_RULES = [
    # addresses with no building or other feature; assume they are just
    # address locators where place=plot is the appropriate tag
    ('housenumber', None, 'place', 'plot'),
    # features associated with a river
    ('whitewater', None, 'waterway', 'river'),
    # features associated with a highway
    ('crossing', None, 'highway', 'crossing'),
    # junction=roundabout is paired with highway=junction here
    # NOTE(review): confirm highway=junction is the intended value for these nodes
    ('junction', 'roundabout', 'highway', 'junction'),
]

with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    for datum in data:
        node_id = datum[0]
        cur.execute(SQL1, (node_id,))
        tags = cur.fetchall()
        print(tags)
        corrected = False
        for key, value in tags:
            for old_key, old_value, new_key, new_value in NODE_TAG_RULES:
                if key != old_key:
                    continue
                if old_value is not None and value != old_value:
                    continue
                print("inserting %s=%s tag" % (new_key, new_value))
                cur.execute(SQL2, (node_id, new_key, new_value, 'regular'))
                # Report the count for the feature-tag insert itself, in every branch.
                print("%d rows inserted" % cur.rowcount)
                corrected = True
                break  # at most one rule applies to a given tag
        if corrected:
            # Mark the node once, even when several of its tags matched a rule.
            cur.execute(SQL3, (node_id,))


34
[(u'bitcoin', u'yes'), (u'description', u'Affordable web hosting, web development, Wordpress customization & training, and more. 15% bitcoin discount.'), (u'error_corrected', u'no'), (u'name', u'SierraHosts.net'), (u'website', u'http://sierrahosts.net')]
[(u'error_corrected', u'no'), (u'name', u'Shiny Green Serpentine')]
[(u'error_corrected', u'no'), (u'name', u'Refinery Room'), (u'source', u'local_knowledge')]
[(u'error_corrected', u'no'), (u'name', u'Model Room'), (u'source', u'local_knowledge')]
[(u'error_corrected', u'no'), (u'note', u'There used to be another tower here per 2006-7 USGS Ortho, not seen in ~2009 Bing imagery nor during 2011 survey')]
[(u'advertising', u'billboard'), (u'error_corrected', u'no'), (u'source', u'bing')]
[(u'error_corrected', u'no'), (u'name', u'Blue Shield')]
[(u'error_corrected', u'no'), (u'name', u'Nevada County Government Center')]
[(u'error_corrected', u'no'), (u'name', u'Hills Flat Lumber Co.')]
[(u'error_corrected', u'no'), (u'name', u'Lathrope

In [27]:
# Now add a tag error_corrected=no to the ways and nodes where the correct physical feature tag
# can not be added programmatically.  These would need to be investigated individually to 
# identify the nature of the feature.
# Select the tagged nodes that still have no primary physical feature key
# (see pfs) and are not referenced by any way.
SQL = "SELECT DISTINCT nodes.id from Nodes WHERE \
    (SELECT COUNT(*) FROM Node_tags WHERE Node_tags.id = nodes.id) > 0 \
    AND Nodes.id NOT IN \
    (SELECT DISTINCT id FROM Node_Tags WHERE key IN (" + pfs + ")) \
    AND Nodes.id NOT IN \
    (SELECT DISTINCT node_id FROM Way_Nodes)"
No_pft_nodes = run_sql(SQL, fetch=FETCH_ALL)
print(len(No_pft_nodes))

# Insert error_corrected=no only when the node does not already carry it, so
# that re-running this cell against the same database adds no duplicate rows.
SQL = "INSERT INTO Node_tags (id,key,value,type) \
    SELECT ?, 'error_corrected','no','regular' \
    WHERE NOT EXISTS (SELECT 1 FROM Node_tags \
        WHERE id = ? AND key = 'error_corrected' AND value = 'no')"
with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    # The node id is bound twice: once for the new row, once for the guard.
    cur.executemany(SQL, [(node[0], node[0]) for node in No_pft_nodes])
print('done')

34
done


In [28]:
# Select the tagged ways that have none of the primary physical feature keys
# listed in pfs.
SQL = "SELECT DISTINCT ways.id from Ways WHERE \
    (SELECT COUNT(*) FROM Way_tags WHERE way_tags.id = ways.id) > 0 AND Ways.id NOT IN \
    (SELECT DISTINCT id FROM Way_Tags WHERE key IN (" + pfs + "))"
No_pft_ways = run_sql(SQL, fetch=FETCH_ALL)

print(len(No_pft_ways))

# Insert error_corrected=no only when the way does not already carry it, so
# that re-running this cell against the same database adds no duplicate rows.
SQL = "INSERT INTO Way_tags (id, key, value, type) \
        SELECT ?, 'error_corrected','no','regular' \
        WHERE NOT EXISTS (SELECT 1 FROM Way_tags \
            WHERE id = ? AND key = 'error_corrected' AND value = 'no')"
with sq3.connect(DBNAME) as conn:
    cur = conn.cursor()
    # The way id is bound twice: once for the new row, once for the guard.
    cur.executemany(SQL, [(way[0], way[0]) for way in No_pft_ways])

print('done')

34
done


And that's it.  All but 68 of the tagged ways and nodes have a valid physical feature tag.  Where tagging errors could be corrected using OSM documentation guidelines, they have been fixed.  Where there isn't a clear answer to correct tagging, a FIXME= tag has been added.  Address formatting has been cleaned up.  Zip codes have been corrected.  Applying the results of this project back to the OSM database could provide a significant improvement in map quality in the covered area.  The last few ways and nodes, which can't be easily fixed, have been tagged 'error_corrected=no'.