In [24]:
# -*- coding: utf-8 -*-


import json
import cchardet as chardet
from pprint import pprint
from tqdm import tnrange, tqdm_notebook
from time import sleep
from bs4 import BeautifulSoup
from pymongo import MongoClient
from pymongo.mongo_client import database
from pymongo.collection import Collection
from pprint import pprint
from pymongo import IndexModel, ASCENDING, DESCENDING


def tqdm_ipython_test():
    """Smoke-test nested tqdm notebook progress bars.

    Drives an outer bar of 3 steps; each outer step runs an inner bar of
    100 steps that sleep 10 ms apiece, so both widgets can be watched
    while they update.
    """
    inner_steps = range(100)
    for _outer in tnrange(3, desc='1st loop'):
        inner_bar = tqdm_notebook(inner_steps, desc='2nd loop')
        for _inner in inner_bar:
            sleep(0.01)


def read_osm_file(filename: str):
    """Return the complete text of ``filename``, decoded as UTF-8.

    Args:
        filename: Path of the file to read.

    Returns:
        The whole file content as a single string.
    """
    with open(filename, mode="r", encoding='UTF-8') as osm_handle:
        return osm_handle.read()


def get_soup(file, tags):
    """Parse ``file`` as XML and collect the elements matching each tag name.

    Args:
        file: Markup handed to ``BeautifulSoup`` together with the 'xml' parser.
        tags: Iterable of tag names to search for.

    Returns:
        A list holding one single-key dict per entry of ``tags``, in order,
        mapping the tag name to the result of ``find_all`` for that name.
    """
    parsed = BeautifulSoup(file, 'xml')
    matches_per_tag = []
    # The progress bar advances once per tag name, not once per element.
    for tag_name in tqdm_notebook(tags):
        matches_per_tag.append({tag_name: parsed.find_all(tag_name)})
    return matches_per_tag


def get_dict_data(result_set_item):
    """Flatten parsed elements into plain dictionaries.

    Args:
        result_set_item: Iterable of ``(primary_tag, result_set)`` pairs, e.g.
            the ``.items()`` view of one dict returned by ``get_soup``. Each
            entry of ``result_set`` must expose an ``attrs`` mapping and a
            ``find_all('tag')`` method whose results can be indexed with
            ``'k'`` and ``'v'``.

    Returns:
        A list with one dict per entry, in input order. Each dict holds
        ``'type'`` (the primary tag), every attribute of the entry, and one
        ``k -> v`` item per child ``<tag>``. An empty input yields ``[]``.

    Note:
        Keys are written in the order type, attributes, child tags, and later
        writes win: an attribute or child tag that is itself named ``'type'``
        (or that repeats an attribute name) replaces the earlier value.
    """
    list_of_dicts = []
    # Handle every pair. Previously only the last pair was processed and an
    # empty input raised UnboundLocalError.
    for primary_tag, result_set in result_set_item:
        for entry in result_set:
            entry_data_dict = {'type': primary_tag}
            entry_data_dict.update(entry.attrs)
            for tag in entry.find_all('tag'):
                entry_data_dict[tag['k']] = tag['v']
            list_of_dicts.append(entry_data_dict)
    return list_of_dicts


def json_to_mongo(col: Collection, json_file: str = "rochester_osm.json"):
    """Load batches of documents from a JSON file and insert them into ``col``.

    The file must contain a JSON list of batches, each batch being a list of
    documents. Every non-empty batch is sent to ``col.insert_many`` in file
    order.

    Args:
        col: Target collection; only its ``insert_many`` method is used.
        json_file: Path of the JSON file to load.

    Returns:
        int: Total number of documents handed to ``insert_many``.
    """
    with open(json_file, encoding="utf-8") as f:
        batches = json.load(f)

    inserted = 0
    for batch in batches:
        # insert_many rejects an empty document list, so skip empty batches.
        if not batch:
            continue
        col.insert_many(batch)
        inserted += len(batch)
    return inserted
    


def get_col(db_name="udacity", collection="rochester_osm", host="localhost:27017"):
    """Connect to MongoDB and return a handle to one collection.

    Args:
        db_name: Name of the database to open.
        collection: Name of the collection inside that database.
        host: Address passed to ``MongoClient``; defaults to the local server.

    Returns:
        The collection object ``client[db_name][collection]``.
    """
    # MongoClient is imported at the top of the file; no local import needed.
    client = MongoClient(host)
    return client[db_name][collection]


def main():
    """Convert ``rochester_ny.osm`` into ``rochester_osm.json``.

    Reads the OSM file, collects every ``node`` and ``way`` element with
    ``get_soup``, flattens each result set with ``get_dict_data`` and writes
    the resulting list (one list of dicts per tag) as JSON. The output name
    matches the default input of ``json_to_mongo``.
    """
    file_name = "rochester_ny.osm"
    osm_file = read_osm_file(filename=file_name)
    tag_list = ['node', 'way']
    result_set_list = get_soup(file=osm_file, tags=tag_list)
    # get_soup returns a list of {tag: result_set} dicts, and get_dict_data
    # expects (tag, result_set) pairs, so pass each dict's items().
    osm_dicts = [get_dict_data(res.items()) for res in result_set_list]
    with open("rochester_osm.json", 'w') as f:
        # Serialise before writing: file.write() only accepts strings.
        json.dump(osm_dicts, f)

### Code to convert the OSM file to JSON in preparation for insertion into MongoDB
```python
# Load the raw OSM file into memory as one string
file_name = r"rochester_ny.osm"
osm_file = read_osm_file(filename=file_name)
# Parse the file with BeautifulSoup and grab all node and way tags
tag_list = ['node', 'way']
%time result_set_list = get_soup(file=osm_file, tags=tag_list)
# Build one list of dictionaries per tag, holding the attribute and child-tag data of each entry
osm_dicts = [get_dict_data(res.items()) for res in result_set_list]
# Serialise the dictionaries to a JSON string so that parsing doesn't need to be re-run
json_osm = json.dumps(osm_dicts)
# Write the JSON string to file
with open('rochester_osm.json', 'w') as f:
    f.write(json_osm)
```

### Initial MongoDB collection creation
- Insert all records from json file
- Create compound unique index on 'id' and 'type' fields

In [25]:
# Run main() from the json_to_mongo module and time it. Per the output below,
# it inserts the records from rochester_osm.json and lists the collection's indexes.
from importlib import import_module
j2m = import_module('json_to_mongo')
%time j2m.main()

457947 records inserted from rochester_osm.json
[{'key': SON([('_id', 1)]),
  'name': '_id_',
  'ns': 'udacity.rochester_osm',
  'v': 2},
 {'key': SON([('id', 1), ('type', -1)]),
  'name': 'id_type_unique_index',
  'ns': 'udacity.rochester_osm',
  'unique': True,
  'v': 2}]
Wall time: 15.1 s


In [26]:
# Set up the collection handle used for data exploration and cleaning
osm_col = get_col() # type: Collection

### Query total document count

In [27]:
# The empty filter {} matches every document, so this is the collection size
total_docs = osm_col.count_documents({})
total_docs

457947

### Get count of each key in collection

In [28]:
# Tally how many documents carry each top-level key
key_counts_dict = {}
for document in tqdm_notebook(osm_col.find(), total=total_docs):
    for field_name in document:
        key_counts_dict[field_name] = key_counts_dict.get(field_name, 0) + 1

HBox(children=(IntProgress(value=0, max=457947), HTML(value='')))




In [29]:
# itemgetter(1) picks the count out of each (key, count) pair, so the keys are
# printed from most frequent to least frequent
from operator import itemgetter
pprint(sorted(key_counts_dict.items(), key=itemgetter(1), reverse=True))

[('_id', 457947),
 ('type', 457947),
 ('id', 457947),
 ('version', 457947),
 ('timestamp', 457947),
 ('changeset', 457947),
 ('uid', 457947),
 ('user', 457947),
 ('lat', 405420),
 ('lon', 405420),
 ('highway', 32238),
 ('name', 17997),
 ('building', 14960),
 ('tiger:county', 12249),
 ('tiger:cfcc', 12242),
 ('tiger:name_base', 11459),
 ('tiger:name_type', 9816),
 ('tiger:zip_left', 8696),
 ('tiger:zip_right', 8478),
 ('tiger:reviewed', 8253),
 ('service', 6392),
 ('addr:street', 5587),
 ('addr:housenumber', 5238),
 ('addr:postcode', 5123),
 ('addr:city', 5104),
 ('addr:state', 4612),
 ('surface', 3766),
 ('amenity', 3739),
 ('oneway', 3347),
 ('power', 2814),
 ('source', 2228),
 ('access', 2052),
 ('addr:country', 1994),
 ('ref', 1887),
 ('footway', 1880),
 ('leisure', 1528),
 ('lanes', 1439),
 ('landuse', 1400),
 ('railway', 1242),
 ('tiger:source', 1225),
 ('tiger:upload_uuid', 1224),
 ('tiger:name_base_1', 1217),
 ('tiger:tlid', 1211),
 ('operator', 1021),
 ('hgv', 1005),
 ('parking

### Get a list of fields whose name contains `addr`

In [30]:

# Substring match on 'addr', so keys such as 'source:addr' are kept as well
address_fields = {
    field_name: count
    for field_name, count in key_counts_dict.items()
    if 'addr' in field_name
}
pprint(sorted(address_fields.items(), key=itemgetter(1), reverse=True))



[('addr:street', 5587),
 ('addr:housenumber', 5238),
 ('addr:postcode', 5123),
 ('addr:city', 5104),
 ('addr:state', 4612),
 ('addr:country', 1994),
 ('addr:unit', 68),
 ('addr:housenumber_1', 34),
 ('addr:housename', 33),
 ('addr:housenumber_2', 5),
 ('addr:housenumber_3', 5),
 ('addr:housenumber_4', 5),
 ('addr:housenumber_5', 5),
 ('addr:street_1', 4),
 ('addr:floor', 2),
 ('addr:place', 2),
 ('addr:city_1', 2),
 ('addr:street_2', 2),
 ('addr:province', 1),
 ('addr:floot', 1),
 ('addr:pobox', 1),
 ('addr:full', 1),
 ('addr:street_3', 1),
 ('addr:housenumber_6', 1),
 ('addr:housenumber_7', 1),
 ('source:addr', 1)]


In [31]:
# Get the list of distinct 'addr:street' values stored in the collection
distinct_streets = osm_col.distinct('addr:street')

In [32]:
# Show the distinct final words of the street names, i.e. the street-type
# suffixes together with any stray endings such as unit numbers
pprint({street.split()[-1] for street in distinct_streets})

{'#102',
 '#2',
 '#A-2',
 '31',
 '92',
 'Apartment',
 'Ave',
 'Ave.',
 'Avenu',
 'Avenue',
 'Bend',
 'Blvd',
 'Boulelvard',
 'Boulevard',
 'Bridge',
 'Center',
 'Cir',
 'Circle',
 'Court',
 'Crescent',
 'Ct',
 'Dr',
 'Drive',
 'Drop',
 'East',
 'Green',
 'Highway',
 'Hill',
 'Homes',
 'Landing',
 'Lane',
 'Manor',
 'Market',
 'Meadows',
 'N',
 'North',
 'Oaks',
 'PW',
 'Park',
 'Parkway',
 'Passage',
 'Place',
 'Race',
 'Rd',
 'Rd.',
 'Rise',
 'Road',
 'Run',
 'S',
 'South',
 'Spruce',
 'Square',
 'St',
 'St.',
 'Stree',
 'Street',
 'Trail',
 'Villas',
 'W',
 'Way',
 'West',
 'Woods',
 'ave',
 'line'}


### Find all postal codes (`addr:postcode`) in the collection

In [33]:
# List every distinct 'addr:postcode' value so malformed entries stand out
unique_zip_codes = osm_col.distinct('addr:postcode')
pprint(unique_zip_codes)

['14607',
 '14624',
 '14617',
 '14623',
 '14622',
 '14612',
 '14626',
 '14450',
 '14618',
 '14616',
 '14526',
 '14502',
 '14514',
 '14615',
 '14580',
 '14620',
 '14625',
 '14445',
 '14608',
 '14609',
 '14606',
 '14559',
 '14621',
 '14613',
 '14534',
 '14604',
 '14614',
 '14620-1327',
 'West Main Street',
 '14694',
 '14605',
 '14610',
 '14611',
 '14468',
 '14607-2082',
 '14519',
 '14642',
 '14627',
 '14624-4721',
 '14617-1822',
 '14467',
 '14692',
 '14568',
 '14543',
 '14586',
 '14428',
 '1445033',
 '14424',
 '14619']


In [34]:
# Normalise every distinct 'addr:postcode' value to a five-digit ZIP code.
# 'modified' and 'deleted' count documents; 'good' counts distinct values.
update_dict = {'modified': 0,
              'deleted': 0,
              'good': 0}
# `postcode` replaces the old loop name `zip`, which shadowed the builtin.
for postcode in tqdm_notebook(unique_zip_codes):
    if len(postcode) == 5 and postcode.isdigit():
        # Already a plain five-digit code: nothing to do.
        update_dict['good'] += 1
    elif len(postcode) > 5 and postcode[:5].isdigit():
        # ZIP+4 or over-long value: keep only the leading five digits.
        result = osm_col.update_many({'addr:postcode': postcode},
                                     {"$set": {'addr:postcode': postcode[:5]}})
        update_dict['modified'] += result.modified_count
    else:
        # Everything else is not repairable. This catch-all also covers values
        # the old `elif ... and ...` chain silently skipped: non-digit strings
        # of length 5 and digit strings shorter than 5.
        # NOTE(review): delete_many removes the whole document, not just the
        # bad field - confirm that dropping the element is intended.
        result = osm_col.delete_many({'addr:postcode': postcode})
        update_dict['deleted'] += result.deleted_count

pprint(update_dict)

HBox(children=(IntProgress(value=0, max=49), HTML(value='')))


{'deleted': 1, 'good': 43, 'modified': 6}


In [35]:
# Re-read every remaining 'addr:postcode' value and show the distinct ones
postcode_filter = {'addr:postcode': {'$exists': True}}
postcode_projection = {'addr:postcode': 1, '_id': 0}
updated_address_code_list = list(osm_col.find(postcode_filter, postcode_projection))
{doc['addr:postcode'] for doc in updated_address_code_list}


{'14424',
 '14428',
 '14445',
 '14450',
 '14467',
 '14468',
 '14502',
 '14514',
 '14519',
 '14526',
 '14534',
 '14543',
 '14559',
 '14568',
 '14580',
 '14586',
 '14604',
 '14605',
 '14606',
 '14607',
 '14608',
 '14609',
 '14610',
 '14611',
 '14612',
 '14613',
 '14614',
 '14615',
 '14616',
 '14617',
 '14618',
 '14619',
 '14620',
 '14621',
 '14622',
 '14623',
 '14624',
 '14625',
 '14626',
 '14627',
 '14642',
 '14692',
 '14694'}

[Rochester Zip codes](https://www.zip-codes.com/city/ny-rochester.asp)
> After running the cleanup loop above, we can see that every unique ZIP code remaining in the database is a valid five-digit Rochester-area ZIP code.

# User Counts

In [49]:
def get_single_users(col: Collection):
    """Print the users that appear on exactly one document in ``col``.

    Groups the documents by their ``user`` field, orders the groups by
    ascending count, and collects the leading run of users whose count is 1.
    Pretty-prints the first five such users followed by a summary line;
    nothing is returned.

    Args:
        col: Collection to aggregate; only its ``aggregate`` method is used.
    """
    pipeline = [
        {'$sortByCount': '$user'},
        {'$sort': {'count': 1}},
    ]
    per_user_counts = list(col.aggregate(pipeline))

    single_doc_user = []
    for record in per_user_counts:
        # Counts arrive in ascending order, so the first count other than 1
        # ends the run of single-document users.
        if record['count'] != 1:
            break
        single_doc_user.append(record['_id'])

    pprint(single_doc_user[:5])
    pprint(f"{len(single_doc_user)} users with only one post out of {len(per_user_counts)}")
# Report the users that appear on exactly one document of the collection
get_single_users(osm_col)


['dgitto', 'Takuto', 'lonvia', 'glglgl', 'ayazhaider9']
'146 users with only one post out of 719'
