In [7]:
# -*- coding: utf-8 -*-


import json
import cchardet as chardet
from pprint import pprint
from tqdm import tnrange, tqdm_notebook
from time import sleep
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.mongo_client import database


def tqdm_ipython_test():
    for i in tnrange(3, desc='1st loop'):
        for j in tqdm_notebook(range(100), desc='2nd loop'):
            sleep(0.01)


def read_osm_file(filename: str):
    with open(filename, "r", encoding='UTF-8') as f:
        msg = f.read()
        # result = chardet.detect(msg)
    return msg


def get_soup(file, tags):
    soup = BeautifulSoup(file, 'xml')
    return [{tag: soup.find_all(tag)} for tag in tqdm_notebook(tags)]


def get_dict_data(result_set_item):
    list_of_dicts = []
    for k, v in result_set_item:
        primary_tag = k
        result_set = v
    for entry in result_set:
        entry_data_dict = {}
        entry_data_dict['type'] = primary_tag
        for k, v in entry.attrs.items():
            entry_data_dict[k] = v
        for tag in entry.find_all('tag'):
            entry_data_dict[tag['k']] = tag['v']
        list_of_dicts.append(entry_data_dict)
    return list_of_dicts


def json_to_mongo(col: database.Collection, json_file: str ="rochester_osm.json" ):
    # data = []
    with open(json_file) as f:
        data = json.load(f)
        #for line in f:
        #    data.append(json.loads(line))
    for node_dict in data:
        col.insert_many(node_dict)
    # return col.insert_many(data)
    


def get_db(db_name="udacity", collection="rochester_osm"):
    from pymongo import MongoClient
    client = MongoClient('localhost:27017')
    db = client[db_name]
    col = db[collection]
    return col


def main():
    file_name = "rochester_ny.osm"
    osm_file = read_osm_file(filename=file_name)
    tag_list = ['node', 'way']
    result_set_list = get_soup(file=osm_file, tags=tag_list)

    osm_dicts = [get_dict_data(res) for res in result_set_list.values()]

    with open("osm_dicts", 'w') as f:
        f.write(osm_dicts)
    # json_file = "twitter.json"
    # db = get_db("udacity")
    # col = db.twitter
    # json_to_mongo(json_file=json_file, col=col)



In [None]:
if __name__ == "__main__":
    main()


### Code to convert the osm to json in preparation for insertion to MongoDB
```python
# Loading The osm file
file_name = r"rochester_ny.osm"
osm_file = read_osm_file(filename=file_name)
# Loading the osm file into beautiful soup and grabbing all node and way tags
tag_list = ['node', 'way']
%time result_set_list = get_soup(file=osm_file, tags=tag_list)
# make list of dictionaries containing the attribute and tag data for the result set
osm_dicts = [get_dict_data(res.items()) for res in result_set_list]
# osm_dicts = [get_dict_data(res) for res in result_set_list.values()]
# dump this dict data to a json object so that parsing doesn't need to be re-run
json_osm = json.dumps(osm_dicts)
# write the json to file
with open('rochester_osm.json', 'w') as f:
    f.write(json_osm)
```

In [11]:
pwd()

'C:\\Users\\Riley\\PycharmProjects\\DataWrangling\\Final_Project'

In [15]:
# read the json file we just read to verify it's working
from importlib import import_module
j2m = import_module('json_to_mongo')
%time j2m.main()

Wall time: 13.3 s


In [16]:
# connect to the database/collection we'll be storing the osm data in

from pymongo.mongo_client import BulkWriteError
db = get_db("udacity")
col = db.rochester_osm
for dl in osm_dicts:
    try:
        col.insert_many(dl)
    except BulkWriteError as BWE:
        details = BWE.details
        pprint(details['writeErrors'][0]['errmsg'])
       # pprint([err['writeErrors'][0]['errmsg'] for err in BWE.details])

('E11000 duplicate key error collection: udacity.rochester_osm index: '
 'index_type_id dup key: { : "18598788", : "node" }')
('E11000 duplicate key error collection: udacity.rochester_osm index: '
 'index_type_id dup key: { : "15089724", : "way" }')


## May get a bulk write error above if the records already exist

A unique compound index is built on the 'id'(not to be confused with '_id') and 'type' fields

This should help prevent duplicate entries since the 'id' field seems to be unique within each node type

In [31]:
# use function we defined to upload our json data to the collection
json_file = "rochester_osm.json"
try:
    json_to_mongo(json_file=json_file, col=col)
except BulkWriteError as BWE:
        details = BWE.details
        pprint(details['writeErrors'][0]['errmsg'])

('E11000 duplicate key error collection: udacity.rochester_osm index: '
 'index_type_id dup key: { : "18598788", : "node" }')


In [1]:
# setup connection for data exploration and cleaning

from pymongo import MongoClient
database_name = "udacity"
collection_name = "rochester_osm"

client = MongoClient('localhost:27017')
db = client[database_name]
osm_col = db[collection_name]

### Below we get a list of all unique keys in the rochester_osm database:

In [17]:

# Run a query to get a list of unique keys for the osm Data we uploaded
from pprint import pprint

pipeline = [
    {
        '$project': {
            'arrayofkeyvalue': {
                '$objectToArray': '$$ROOT'
            }
        }
    }, {
        '$unwind': {
            'path': '$arrayofkeyvalue'
        }
    }, {
        '$group': {
            '_id': None,
            'allkeys': {
                '$addToSet': '$arrayofkeyvalue.k'
            }
        }
    }
]

# unique_osm_keys = list(osm_col.aggregate(pipeline=pipeline))


unique_osm_keys = list(osm_col.aggregate(pipeline=pipeline))[0]['allkeys'] # type : list
print(f'{len(unique_osm_keys)} unique keys in {collection_name}')
pprint(unique_osm_keys)


606 unique keys in rochester_osm
['dist:gold',
 'abandoned:building',
 'seamark:type',
 'seamark:harbour:category',
 'managed',
 'phases',
 'tomb',
 'min_height',
 'psv',
 'floating',
 'bulk_purchase',
 'historic_1',
 'payment:tap_to_pay',
 'memorial:text',
 'monitoring:air_traffic',
 'room',
 'beds',
 'maxspeed:backward',
 'rooftop',
 'surface:colour',
 'line',
 'roller_coaster',
 'denotation',
 'note:lanes',
 'source:imagery',
 'contact:facebook',
 'ref:store_number',
 'road',
 'monument',
 'memorial_1',
 'content',
 'wall',
 'bridge:structure',
 'site_ownership',
 'protection_object',
 'protect_class',
 'historic:tunnel',
 'source:addr',
 'payment:electronic_purses',
 'stormwater',
 'check_date',
 'wildlife',
 'green',
 'addr:housenumber_6',
 'addr:street_3',
 'addr:street_2',
 'alt_name_2',
 'addr:city_1',
 'addr:housenumber_3',
 'addr:housenumber_1',
 'turn:lanes',
 'way',
 'OBJECTID',
 'addr:street_1',
 'addr:housenumber_2',
 'industrial',
 'abandoned',
 'store_ref',
 'waterway_1

### code snippet to find the count of a particular field

In [29]:
osm_col.find({"lit": {"$exists": True}}, {"lit": 1, "_id": 0}).count()

  """Entry point for launching an IPython kernel.


497

### code snippet to find the counts of each particular field

In [31]:
key_counts_dict = dict()

for field_key in unique_osm_keys:
   key_counts_dict[field_key] = osm_col.find({f"{field_key}": {"$exists": True}}, {f"{field_key}": 1, "_id": 0 }).count() 
   pprint(key_counts_dict[field_key])
   

  after removing the cwd from sys.path.


{'FIXME': 9,
 'FIXME:hgv:state_network': 3,
 'FIXME:motorboat': 1,
 'FIXME:oneway': 2,
 'FIXME:ship': 1,
 'NHS': 136,
 'OBJECTID': 73,
 'Routes': 1,
 '_id': 457947,
 'abandoned': 1,
 'abandoned:building': 1,
 'access': 2052,
 'access:conditional': 1,
 'addr:city': 5104,
 'addr:city_1': 2,
 'addr:country': 1994,
 'addr:floor': 2,
 'addr:floot': 1,
 'addr:full': 1,
 'addr:housename': 33,
 'addr:housenumber': 5238,
 'addr:housenumber_1': 34,
 'addr:housenumber_2': 5,
 'addr:housenumber_3': 5,
 'addr:housenumber_4': 5,
 'addr:housenumber_5': 5,
 'addr:housenumber_6': 1,
 'addr:housenumber_7': 1,
 'addr:place': 2,
 'addr:pobox': 1,
 'addr:postcode': 5123,
 'addr:province': 1,
 'addr:state': 4612,
 'addr:street': 5588,
 'addr:street_1': 4,
 'addr:street_2': 2,
 'addr:street_3': 1,
 'addr:unit': 68,
 'admin_level': 53,
 'aeroway': 104,
 'alt_name': 117,
 'alt_name_1': 2,
 'alt_name_2': 1,
 'amenity': 3739,
 'amenity_1': 5,
 'animal_shelter': 1,
 'area': 186,
 'artist_name': 9,
 'artwork_type'

### Get a list of fields that begin with address

In [32]:

address_fields = [x for x in unique_osm_keys[0]['allkeys'] if str(x).startswith('addr')]
pprint(sorted(address_fields))



TypeError: string indices must be integers

### find all address codes in collection

In [2]:
return_field = {'_id': False,
                'addr:postcode': True}

address_code_list = list(osm_col.find({'addr:postcode': {'$exists': True}}, {'addr:postcode': 1, '_id': 0}))
pprint(address_code_list)




NameError: name 'pprint' is not defined

In [3]:
# get unique zip codes
unique_zip_codes = set([x['addr:postcode'] for x in address_code_list])
pprint(unique_zip_codes)


NameError: name 'pprint' is not defined

In [5]:

testersss = list(osm_col.find({'addr:postcode': "1445033"}, {'_id': 0}))


In [None]:
# fix the outlier record with googled zipcode
testersss = list(osm_col.find({'addr:postcode': "West Main Street"}))

myquery = {'id': "1609006999"}
newvalues = [{"$set": {"addr:postcode": "14614"}},  # zip code from googling address
             {"$set": {"addr:street": "West Main Street"}}]  # upserting street address

for value in newvalues:
    update_s = osm_col.update_one(myquery, value)
    pprint(update_s.raw_result)
    pprint(update_s.upserted_id)


In [None]:
# Validate update worked
pprint(list(osm_col.find({'id': '1609006999'})))

In [None]:
# get a list of zip codes that aren't the standard 5 digit format
zips_to_fix = [x for x in unique_zip_codes if len(x) > 5 and str(x)[0:5].isdigit()]


In [None]:
# loop to fix the malformed zipcodes
for value in zips_to_fix:
    myquery = {"addr:postcode": value}
    value = {"$set": {"addr:postcode": str(value[0:5])}}
    update_s = osm_col.update_one(myquery, value)
    pprint(update_s.raw_result)
    pprint(update_s.upserted_id)
    pprint(update_s.acknowledged)
    pprint(update_s.matched_count)


In [None]:
#
updated_address_code_list = list(osm_col.find({'addr:postcode': {'$exists': True}}, {'addr:postcode': 1, '_id': 0}))
set([x['addr:postcode'] for x in updated_address_code_list])
