Merge pull request #1868 from SEED-platform/1867-bldg-parsing

1867 bldg parsing
SEED-platform · May 4, 2019 · 837f414 · 837f414
2 parents 7606ca2 + 0939ce9
commit 837f414
Show file tree

Hide file tree

Showing 4 changed files with 134 additions and 72 deletions.
diff --git a/requirements/base.txt b/requirements/base.txt
@@ -44,7 +44,7 @@ raven==6.9.0
 jellyfish==0.6.1
 Markdown==3.0.1
 python-dateutil==2.7.3
-street-address==0.3.0
+street-address==0.4.0
 unicodecsv==0.14.1
 unidecode==1.0.22
 usaddress==0.5.10

diff --git a/seed/migrations/0102_auto_20190503_1251.py b/seed/migrations/0102_auto_20190503_1251.py
@@ -0,0 +1,37 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11.20 on 2019-05-03 19:51
+from __future__ import unicode_literals
+
+from django.db import migrations, transaction
+
+from seed.utils.address import normalize_address_str
+
+
+def forwards(apps, schema_editor):
+    """Recompute ``normalized_address`` for every state that has an address.
+
+    Runs ``normalize_address_str`` over ``address_line_1`` of each
+    ``PropertyState`` and ``TaxLotState`` whose ``address_line_1`` is not
+    NULL, and saves only the ``normalized_address`` column. All updates
+    happen inside a single transaction.
+
+    :param apps: historical app registry passed in by ``RunPython``
+    :param schema_editor: schema editor passed in by ``RunPython`` (unused)
+    """
+    with transaction.atomic():
+        for model_name in ("PropertyState", "TaxLotState"):
+            model = apps.get_model("seed", model_name)
+            states = model.objects.filter(address_line_1__isnull=False)
+
+            # iterator() streams rows instead of caching the whole queryset,
+            # which keeps memory use flat on large tables.
+            for index, state in enumerate(states.iterator()):
+                if index % 1000 == 0:
+                    print('iterating ... %s' % index)
+
+                state.normalized_address = normalize_address_str(state.address_line_1)
+                state.save(update_fields=["normalized_address"])
+
+
+class Migration(migrations.Migration):
+    """Data migration that recomputes ``normalized_address`` via ``forwards``."""
+
+    dependencies = [
+        ('seed', '0101_auto_20190318_1835'),
+    ]
+
+    operations = [
+        # A no-op reverse keeps the migration reversible: unapplying it simply
+        # leaves the recomputed values in place instead of raising an error.
+        migrations.RunPython(forwards, migrations.RunPython.noop),
+    ]
diff --git a/seed/tests/test_address_normalization.py b/seed/tests/test_address_normalization.py
@@ -9,72 +9,68 @@
 from seed.utils.address import normalize_address_str
 
 
-def make_method(message, expected):
-    def run(self):
-        result = normalize_address_str(message)
-        self.assertEquals(expected, result)
-    return run
+class TestColumnListSettings(TestCase):
 
+    def test_adding_columns(self):
+        cases = [
+            # case name, input value, expected normalized string
+            ('simple', '123 Test St.', '123 test st'),
+            ('none input', None, None),
+            ('empty input', '', None),
+            ('missing number', 'Test St.', 'test st'),
+            ('missing street', '123', '123'),
+            ('integer address', 123, '123'),
+            ('strip leading zeros', '0000123', '123'),
+            ('street 1', 'STREET', 'st'),
+            ('street 2', 'Street', 'st'),
+            ('boulevard', 'Boulevard', 'blvd'),
+            ('avenue', 'avenue', 'ave'),
+            ('trailing direction', '123 Test St. NE', '123 test st ne'),
+            ('prefix direction', '123 South Test St.', '123 s test st'),
+            ('verbose direction', '123 Test St. Northeast', '123 test st ne'),
+            ('two directions', '111 S West Main', '111 s west main'),
+            ('numeric street and direction', '555 11th St. NW', '555 11th st nw'),
+            ('direction 1', '100 Main S', '100 main s'),
+            ('direction 2', '100 Main South', '100 main s'),
+            ('direction 3', '100 Main S.', '100 main s'),
+            ('direction 4', '100 Main', '100 main'),
+            # Found edge cases
+            # https://github.com/SEED-platform/seed/issues/378
+            ('regression 1', '100 Peach Ave. East', '100 peach ave e'),
+            ('regression 2', '100 Peach Avenue E.', '100 peach ave e'),
+            ('multiple addresses', 'M St., N St., 4th St., Delaware St., SW',
+             'm st., n st., 4th st., delaware st., sw'),
+            # House numbers declared as ranges
+            ('no range separator', '300 322 S Green St', '300-322 s green st'),
+            ('- as separator no whitespace', '300-322 S Green St', '300-322 s green st'),
+            ('/ as separator no whitespace', '300/322 S Green St', '300-322 s green st'),
+            ('\\ as separator no whitespace', '300\\322 S Green St', '300-322 s green st'),
+            ('- as separator whitespace', '300 - 322 S Green St', '300-322 s green st'),
+            ('/ as separator whitespace', '300 / 322 S Green St', '300-322 s green st'),
+            ('\\ as separator whitespace', '300 \\ 322 S Green St', '300-322 s green st'),
+            # Ranges which leave off common prefix.
+            ('end of range leaves off common prefix', '300-22 S Green St', '300-322 s green st'),
+            # Odd characters
+            ('unicode characters', '123 Main St\uFFFD', '123 main st'),
+            # Straight numbers
+            ('straight numbers', 56195600100, '56195600100'),
+            # bytestrings
+            ('bytestring', b'123456 Main St', '123456 main st'),
+            # Suites / building ids
+            ('suite 1', '2655   SEELY AV Suite 9', '2655 seely av suite 9'),
+            ('suite 2', '2655   SEELY AV Ste 9', '2655 seely av suite 9'),
+            ('bldg 1', '2655   SEELY AV BLDG 9', '2655 seely av building 9'),
+            ('bldg 2', '2655   SEELY AV BUILDING 9', '2655 seely av building 9'),
+            ('b+s 1', '2655   SEELY AV BUILDING 9a ste 50', '2655 seely av building 9a suite 50'),
+            ('b+s 2', '2655   SEELY AV BUILDING 9 suite 50', '2655 seely av building 9 suite 50'),
+        ]
 
-# Metaclass to create individual test methods per test case.
-class NormalizeAddressTester(type):
+        results = []
+        expected_results = [c[2] for c in cases]
 
-    def __new__(cls, name, bases, attrs):
-        cases = attrs.get('cases', [])
+        for case in cases:
+            results.append(normalize_address_str(case[1]))
 
-        for doc, message, expected in cases:
-            test = make_method(message, expected)
-            test_name = 'test_normalize_address_%s' % doc.lower().replace(' ', '_')
-            if test_name in attrs:
-                raise KeyError("Test name {0} duplicated".format(test_name))
-            test.__name__ = test_name
-            test.__doc__ = doc
-            attrs[test_name] = test
-        return super(NormalizeAddressTester, cls).__new__(cls, name, bases, attrs)
-
-
-class NormalizeStreetAddressTests(TestCase):
-    __metaclass__ = NormalizeAddressTester
-
-    # test name, input, expected output
-    cases = [
-        ('simple', '123 Test St.', '123 test st'),
-        ('none input', None, None),
-        ('empty input', '', None),
-        ('missing number', 'Test St.', 'test st'),
-        ('missing street', '123', '123'),
-        ('integer address', 123, '123'),
-        ('strip leading zeros', '0000123', '123'),
-        ('street 1', 'STREET', 'st'),
-        ('street 2', 'Street', 'st'),
-        ('boulevard', 'Boulevard', 'blvd'),
-        ('avenue', 'avenue', 'ave'),
-        ('trailing direction', '123 Test St. NE', '123 test st ne'),
-        ('prefix direction', '123 South Test St.', '123 s test st'),
-        ('verbose direction', '123 Test St. Northeast', '123 test st ne'),
-        ('two directions', '111 S West Main', '111 s west main'),
-        ('numeric street and direction', '555 11th St. NW', '555 11th st nw'),
-        ('direction 1', '100 Main S', '100 main s'),
-        ('direction 2', '100 Main South', '100 main s'),
-        ('direction 3', '100 Main S.', '100 main s'),
-        ('direction 4', '100 Main', '100 main'),
-        # Found edge cases
-        # https://github.com/SEED-platform/seed/issues/378
-        ('regression 1', '100 Peach Ave. East', '100 peach ave e'),
-        ('regression 2', '100 Peach Avenue E.', '100 peach ave e'),
-        ('multiple addresses', 'M St., N St., 4th St., Delaware St., SW', 'm st., n st., 4th st., delaware st., sw'),
-        # House numbers declared as ranges
-        ('no range separator', '300 322 S Green St', '300-322 s green st'),
-        ('- as separator no whitespace', '300-322 S Green St', '300-322 s green st'),
-        ('/ as separator no whitespace', '300/322 S Green St', '300-322 s green st'),
-        ('\\ as separator no whitespace', '300\\322 S Green St', '300-322 s green st'),
-        ('- as separator whitespace', '300 - 322 S Green St', '300-322 s green st'),
-        ('/ as separator whitespace', '300 / 322 S Green St', '300-322 s green st'),
-        ('\\ as separator whitespace', '300 \\ 322 S Green St', '300-322 s green st'),
-        # Ranges which leave off common prefix.
-        ('end of range leaves off common prefix', '300-22 S Green St', '300-322 s green st'),
-        # Odd characters
-        ('unicode characters', '123 Main St\uFFFD', '123 main st'),
-        # Straight numbers
-        ('straight numbers', 56195600100, '56195600100'),
-    ]
+        # print(results)
+        # print(expected_results)
+        self.assertListEqual(results, expected_results)
diff --git a/seed/utils/address.py b/seed/utils/address.py
@@ -8,10 +8,32 @@
 import re
 
 import usaddress
-from past.builtins import basestring
+# from past.builtins import basestring
 from streetaddress import StreetAddressFormatter
 
 
+def _normalize_subaddress_type(subaddress_type):
+    """Return the canonical lowercase form of a subaddress type.
+
+    Lowercases the value, strips periods, and expands the known building
+    abbreviations (``bldg``, ``blg``) to ``building``. Any other value is
+    returned lowercased and period-free.
+
+    :param subaddress_type: string, e.g. ``'BLDG.'``
+    :return: string, e.g. ``'building'``
+    """
+    subaddress_type = subaddress_type.lower().replace('.', '')
+    # Named so that it does not shadow the ``map`` builtin.
+    subaddress_map = {
+        'bldg': 'building',
+        'blg': 'building',
+    }
+    return subaddress_map.get(subaddress_type, subaddress_type)
+
+
+def _normalize_occupancy_type(occupancy_id):
+    """Return the canonical lowercase form of an occupancy type.
+
+    Lowercases the value, strips periods, and expands ``ste`` to ``suite``.
+    Any other value is returned lowercased and period-free.
+
+    :param occupancy_id: string holding the occupancy type, e.g. ``'Ste.'``
+    :return: string, e.g. ``'suite'``
+    """
+    occupancy_type = occupancy_id.lower().replace('.', '')
+    # Named so that it does not shadow the ``map`` builtin.
+    occupancy_map = {
+        'ste': 'suite',
+        'suite': 'suite',
+    }
+    return occupancy_map.get(occupancy_type, occupancy_type)
+
+
 def _normalize_address_direction(direction):
     direction = direction.lower().replace('.', '')
     direction_map = {
@@ -83,17 +105,18 @@ def normalize_address_str(address_val):
 
     If a valid address is provided, a normalized version is returned.
     """
-
     # if this string is empty the regular expression in the sa wont
     # like it, and fail, so leave returning nothing
     if not address_val:
         return None
 
-    # encode the string as utf-8
-    if not isinstance(address_val, basestring):
+    # if this is a byte string, decode it into a text (unicode) string
+    if isinstance(address_val, bytes):
+        address_val = address_val.decode('utf-8')
+    elif not isinstance(address_val, str):
         address_val = str(address_val)
     else:
-        address_val = str(address_val.encode('utf-8'))
+        pass
 
     # Do some string replacements to remove odd characters that we come across
     replacements = {
@@ -137,8 +160,14 @@ def normalize_address_str(address_val):
             normalized_address = normalized_address + ' ' + _normalize_address_direction(
                 addr['StreetNamePostDirectional'])  # NOQA
 
+        if 'SubaddressType' in addr and addr['SubaddressType'] is not None:
+            normalized_address = normalized_address + ' ' + _normalize_subaddress_type(addr['SubaddressType'])  # NOQA
+
+        if 'SubaddressIdentifier' in addr and addr['SubaddressIdentifier'] is not None:
+            normalized_address = normalized_address + ' ' + addr['SubaddressIdentifier']
+
         if 'OccupancyType' in addr and addr['OccupancyType'] is not None:
-            normalized_address = normalized_address + ' ' + addr['OccupancyType']
+            normalized_address = normalized_address + ' ' + _normalize_occupancy_type(addr['OccupancyType'])
 
         if 'OccupancyIdentifier' in addr and addr['OccupancyIdentifier'] is not None:
             normalized_address = normalized_address + ' ' + addr['OccupancyIdentifier']