Skip to content

Commit

Permalink
Merge pull request #1868 from SEED-platform/1867-bldg-parsing
Browse files Browse the repository at this point in the history
1867 bldg parsing
  • Loading branch information
nllong committed May 4, 2019
2 parents 7606ca2 + 0939ce9 commit 837f414
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 72 deletions.
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ raven==6.9.0
jellyfish==0.6.1
Markdown==3.0.1
python-dateutil==2.7.3
street-address==0.3.0
street-address==0.4.0
unicodecsv==0.14.1
unidecode==1.0.22
usaddress==0.5.10
Expand Down
37 changes: 37 additions & 0 deletions seed/migrations/0102_auto_20190503_1251.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.11.20 on 2019-05-03 19:51
from __future__ import unicode_literals

from django.db import migrations, transaction

from seed.utils.address import normalize_address_str


def _renormalize_addresses(model):
    """Recompute ``normalized_address`` for each row of *model* with an address.

    Only rows whose ``address_line_1`` is not NULL are visited. Each row is
    saved with ``update_fields`` so that no other column is rewritten.

    :param model: historical model class exposing ``address_line_1`` and
        ``normalized_address`` fields.
    """
    # .iterator() streams the rows instead of caching the whole queryset in
    # memory, which keeps the migration's footprint flat on large tables.
    rows = model.objects.filter(address_line_1__isnull=False).iterator()
    for index, state in enumerate(rows):
        # Lightweight progress indicator for long-running migrations.
        if index % 1000 == 0:
            print('iterating ... %s' % index)

        state.normalized_address = normalize_address_str(state.address_line_1)
        state.save(update_fields=["normalized_address"])


def forwards(apps, schema_editor):
    """Re-normalize stored addresses on PropertyState and TaxLotState rows.

    :param apps: historical app registry supplied by the migration framework.
    :param schema_editor: schema editor supplied by the migration framework
        (unused).
    """
    # Use the historical models, not the current ones, as required inside
    # migrations.
    PropertyState = apps.get_model("seed", "PropertyState")
    TaxLotState = apps.get_model("seed", "TaxLotState")

    # One transaction so a failure part-way through leaves no partial update.
    with transaction.atomic():
        _renormalize_addresses(PropertyState)
        _renormalize_addresses(TaxLotState)


class Migration(migrations.Migration):
    """Data migration that runs ``forwards`` after migration 0101."""

    dependencies = [
        ('seed', '0101_auto_20190318_1835'),
    ]

    operations = [
        # Without reverse_code the operation is irreversible and rolling back
        # past this migration raises an error. The forward step only rewrites
        # derived data, so a no-op reverse is sufficient.
        migrations.RunPython(forwards, reverse_code=migrations.RunPython.noop),
    ]
126 changes: 61 additions & 65 deletions seed/tests/test_address_normalization.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,72 +9,68 @@
from seed.utils.address import normalize_address_str


def make_method(message, expected):
    """Build a test method checking that *message* normalizes to *expected*.

    :param message: raw address value passed to ``normalize_address_str``.
    :param expected: normalized value the result must equal.
    :returns: a function taking ``self`` (a test case instance) that performs
        the assertion when called.
    """
    def run(self):
        result = normalize_address_str(message)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(expected, result)
    return run
class TestColumnListSettings(TestCase):

def test_adding_columns(self):
cases = [
# case, test str, expected resulting string, actual response
('simple', '123 Test St.', '123 test st'),
('none input', None, None),
('empty input', '', None),
('missing number', 'Test St.', 'test st'),
('missing street', '123', '123'),
('integer address', 123, '123'),
('strip leading zeros', '0000123', '123'),
('street 1', 'STREET', 'st'),
('street 2', 'Street', 'st'),
('boulevard', 'Boulevard', 'blvd'),
('avenue', 'avenue', 'ave'),
('trailing direction', '123 Test St. NE', '123 test st ne'),
('prefix direction', '123 South Test St.', '123 s test st'),
('verbose direction', '123 Test St. Northeast', '123 test st ne'),
('two directions', '111 S West Main', '111 s west main'),
('numeric street and direction', '555 11th St. NW', '555 11th st nw'),
('direction 1', '100 Main S', '100 main s'),
('direction 2', '100 Main South', '100 main s'),
('direction 3', '100 Main S.', '100 main s'),
('direction 4', '100 Main', '100 main'),
# Found edge cases
# https://github.com/SEED-platform/seed/issues/378
('regression 1', '100 Peach Ave. East', '100 peach ave e'),
('regression 2', '100 Peach Avenue E.', '100 peach ave e'),
('multiple addresses', 'M St., N St., 4th St., Delaware St., SW',
'm st., n st., 4th st., delaware st., sw'),
# House numbers declared as ranges
('no range separator', '300 322 S Green St', '300-322 s green st'),
('- as separator no whitespace', '300-322 S Green St', '300-322 s green st'),
('/ as separator no whitespace', '300/322 S Green St', '300-322 s green st'),
('\\ as separator no whitespace', '300\\322 S Green St', '300-322 s green st'),
('- as separator whitespace', '300 - 322 S Green St', '300-322 s green st'),
('/ as separator whitespace', '300 / 322 S Green St', '300-322 s green st'),
('\\ as separator whitespace', '300 \\ 322 S Green St', '300-322 s green st'),
# Ranges which leave off common prefix.
('end of range leaves off common prefix', '300-22 S Green St', '300-322 s green st'),
# Odd characters
('unicode characters', '123 Main St\uFFFD', '123 main st'),
# Straight numbers
('straight numbers', 56195600100, '56195600100'),
# bytestrings
('bytestring', b'123456 Main St', '123456 main st'),
# Suites / building ids
('suite 1', '2655 SEELY AV Suite 9', '2655 seely av suite 9'),
('suite 2', '2655 SEELY AV Ste 9', '2655 seely av suite 9'),
('bldg 1', '2655 SEELY AV BLDG 9', '2655 seely av building 9'),
('bldg 2', '2655 SEELY AV BUILDING 9', '2655 seely av building 9'),
('b+s 1', '2655 SEELY AV BUILDING 9a ste 50', '2655 seely av building 9a suite 50'),
('b+s 2', '2655 SEELY AV BUILDING 9 suite 50', '2655 seely av building 9 suite 50'),
]

# Metaclass to create individual test methods per test case.
class NormalizeAddressTester(type):
results = []
expected_results = [c[2] for c in cases]

def __new__(cls, name, bases, attrs):
cases = attrs.get('cases', [])
for case in cases:
results.append(normalize_address_str(case[1]))

for doc, message, expected in cases:
test = make_method(message, expected)
test_name = 'test_normalize_address_%s' % doc.lower().replace(' ', '_')
if test_name in attrs:
raise KeyError("Test name {0} duplicated".format(test_name))
test.__name__ = test_name
test.__doc__ = doc
attrs[test_name] = test
return super(NormalizeAddressTester, cls).__new__(cls, name, bases, attrs)


class NormalizeStreetAddressTests(TestCase):
__metaclass__ = NormalizeAddressTester

# test name, input, expected output
cases = [
('simple', '123 Test St.', '123 test st'),
('none input', None, None),
('empty input', '', None),
('missing number', 'Test St.', 'test st'),
('missing street', '123', '123'),
('integer address', 123, '123'),
('strip leading zeros', '0000123', '123'),
('street 1', 'STREET', 'st'),
('street 2', 'Street', 'st'),
('boulevard', 'Boulevard', 'blvd'),
('avenue', 'avenue', 'ave'),
('trailing direction', '123 Test St. NE', '123 test st ne'),
('prefix direction', '123 South Test St.', '123 s test st'),
('verbose direction', '123 Test St. Northeast', '123 test st ne'),
('two directions', '111 S West Main', '111 s west main'),
('numeric street and direction', '555 11th St. NW', '555 11th st nw'),
('direction 1', '100 Main S', '100 main s'),
('direction 2', '100 Main South', '100 main s'),
('direction 3', '100 Main S.', '100 main s'),
('direction 4', '100 Main', '100 main'),
# Found edge cases
# https://github.com/SEED-platform/seed/issues/378
('regression 1', '100 Peach Ave. East', '100 peach ave e'),
('regression 2', '100 Peach Avenue E.', '100 peach ave e'),
('multiple addresses', 'M St., N St., 4th St., Delaware St., SW', 'm st., n st., 4th st., delaware st., sw'),
# House numbers declared as ranges
('no range separator', '300 322 S Green St', '300-322 s green st'),
('- as separator no whitespace', '300-322 S Green St', '300-322 s green st'),
('/ as separator no whitespace', '300/322 S Green St', '300-322 s green st'),
('\\ as separator no whitespace', '300\\322 S Green St', '300-322 s green st'),
('- as separator whitespace', '300 - 322 S Green St', '300-322 s green st'),
('/ as separator whitespace', '300 / 322 S Green St', '300-322 s green st'),
('\\ as separator whitespace', '300 \\ 322 S Green St', '300-322 s green st'),
# Ranges which leave off common prefix.
('end of range leaves off common prefix', '300-22 S Green St', '300-322 s green st'),
# Odd characters
('unicode characters', '123 Main St\uFFFD', '123 main st'),
# Straight numbers
('straight numbers', 56195600100, '56195600100'),
]
# print(results)
# print(expected_results)
self.assertListEqual(results, expected_results)
41 changes: 35 additions & 6 deletions seed/utils/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,32 @@
import re

import usaddress
from past.builtins import basestring
# from past.builtins import basestring
from streetaddress import StreetAddressFormatter


def _normalize_subaddress_type(subaddress_type):
subaddress_type = subaddress_type.lower().replace('.', '')
map = {
'bldg': 'building',
'blg': 'building',
}
if subaddress_type in map:
return map[subaddress_type]
return subaddress_type


def _normalize_occupancy_type(occupancy_id):
occupancy_id = occupancy_id.lower().replace('.', '')
map = {
'ste': 'suite',
'suite': 'suite',
}
if occupancy_id in map:
return map[occupancy_id]
return occupancy_id


def _normalize_address_direction(direction):
direction = direction.lower().replace('.', '')
direction_map = {
Expand Down Expand Up @@ -83,17 +105,18 @@ def normalize_address_str(address_val):
If a valid address is provided, a normalized version is returned.
"""

# If this string is empty, the regular expression in the sa won't
# accept it and will fail, so return None early.
if not address_val:
return None

# encode the string as utf-8
if not isinstance(address_val, basestring):
# if this is a byte string, then convert to a string-string
if isinstance(address_val, bytes):
address_val = address_val.decode('utf-8')
elif not isinstance(address_val, str):
address_val = str(address_val)
else:
address_val = str(address_val.encode('utf-8'))
pass

# Do some string replacements to remove odd characters that we come across
replacements = {
Expand Down Expand Up @@ -137,8 +160,14 @@ def normalize_address_str(address_val):
normalized_address = normalized_address + ' ' + _normalize_address_direction(
addr['StreetNamePostDirectional']) # NOQA

if 'SubaddressType' in addr and addr['SubaddressType'] is not None:
normalized_address = normalized_address + ' ' + _normalize_subaddress_type(addr['SubaddressType']) # NOQA

if 'SubaddressIdentifier' in addr and addr['SubaddressIdentifier'] is not None:
normalized_address = normalized_address + ' ' + addr['SubaddressIdentifier']

if 'OccupancyType' in addr and addr['OccupancyType'] is not None:
normalized_address = normalized_address + ' ' + addr['OccupancyType']
normalized_address = normalized_address + ' ' + _normalize_occupancy_type(addr['OccupancyType'])

if 'OccupancyIdentifier' in addr and addr['OccupancyIdentifier'] is not None:
normalized_address = normalized_address + ' ' + addr['OccupancyIdentifier']
Expand Down

0 comments on commit 837f414

Please sign in to comment.