Merge pull request #6 from TTitcombe/identify_street_names

Identify street names
TTitcombe · Feb 25, 2020 · 10176fc
2 parents f8a35b0 + bee3029
commit 10176fc
Show file tree

Hide file tree

Showing 5 changed files with 38 additions and 29 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
     rev: 19.3b0
     hooks:
     -   id: black
-- repo: https://github.com/pre-commit/mirrors-isort
+-   repo: https://github.com/pre-commit/mirrors-isort
     rev: v4.3.20
     hooks:
     - id: isort
diff --git a/environment.yml b/environment.yml
@@ -13,6 +13,7 @@ dependencies:
 # unit testing
   - pytest >=4.4  # for unit testing
 # code quality
+  - pre-commit  # for automatic code quality checking
   - black  # for automatic code formatting
   - isort  # for import standardization
   - flake8  # for linting

diff --git a/privacypanda/__init__.py b/privacypanda/__init__.py
@@ -1,3 +1,3 @@
-from .addresses import check_addresses
+from .addresses import *
 
 __version__ = "0.1.0dev"
diff --git a/privacypanda/addresses.py b/privacypanda/addresses.py
@@ -7,29 +7,34 @@
 import numpy as np
 import pandas as pd
 
+__all__ = ["check_addresses"]
+
 OBJECT_DTYPE = np.dtype("O")
 
-# Regex constants
+# ----- Regex constants ----- #
 LETTER = "[a-zA-Z]"
+
+# UK Postcode
 UK_POSTCODE_PATTERN = re.compile(
     LETTER + LETTER + "\\d{1,2}" + "\\s+" + "\\d" + LETTER + LETTER
 )
 
+# Street names: a non-capturing alternation, so one whole suffix word must match
+STREET_ENDINGS = "(?:street|road|way|avenue)"
+
+# Simple address is up to a four digit number + street name with 1-10 characters
+# + one of "road", "street", "way", "avenue"
+SIMPLE_ADDRESS_PATTERN = re.compile(
+    "[0-9]{1,4}\\s[a-z]{1,10}\\s" + STREET_ENDINGS, re.I
+)
+
 
 def check_addresses(df: pd.DataFrame) -> List:
     """
     Check a dataframe for columns containing addresses. Returns a list of column
     names which contain at least one address
 
-    "Addresses" currently only concerns UK postcodes, which are of the form:
-    * Two letters
-    * One or two digits
-    * Whitespace
-    * One digit
-    * Two letters
-    E.g.:
-    * AB1 1AB
-    * AB12 1AB
+    "Addresses" currently only concerns UK postcodes and simple street names.
     This implementation does not consider whether the addresses are real.
 
     Parameters
@@ -41,7 +46,6 @@ def check_addresses(df: pd.DataFrame) -> List:
     -------
     List
         The names of columns which contain at least one address
-
     """
     private_cols = []
 
@@ -52,7 +56,9 @@ def check_addresses(df: pd.DataFrame) -> List:
         if row.dtype == OBJECT_DTYPE:
             for item in row:
 
-                if UK_POSTCODE_PATTERN.match(item):
+                if UK_POSTCODE_PATTERN.match(item) or SIMPLE_ADDRESS_PATTERN.match(
+                    item
+                ):
                     private_cols.append(col)
                     break  # 1 failure is enough
 

diff --git a/tests/test_address_identification.py b/tests/test_address_identification.py
@@ -7,9 +7,10 @@
 import privacypanda as pp
 
 
-def test_can_identify_column_containing_simple_UK_postcode():
+@pytest.mark.parametrize("postcode", ["AB1 1AB", "AB12 1AB", "AB1    1AB"])
+def test_can_identify_column_containing_UK_postcode(postcode):
     df = pd.DataFrame(
-        {"privateColumn": ["a", "AB1 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
+        {"privateColumn": ["a", postcode, "c"], "nonPrivateColumn": ["a", "b", "c"]}
     )
 
     actual_private_columns = pp.check_addresses(df)
@@ -18,9 +19,21 @@ def test_can_identify_column_containing_simple_UK_postcode():
     assert actual_private_columns == expected_private_columns
 
 
-def test_can_identify_column_containing_simple_UK_postcode_with_extra_digit():
+@pytest.mark.parametrize(
+    "address",
+    [
+        "10 Downing Street",
+        "10 downing street",
+        "1 the Road",
+        "01 The Road",
+        "1234 The Road",
+        "55 Maple Avenue",
+        "4 Python Way",
+    ],
+)
+def test_can_identify_column_containing_simple_street_names(address):
     df = pd.DataFrame(
-        {"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
+        {"privateColumn": ["a", address, "c"], "nonPrivateColumn": ["a", "b", "c"]}
     )
 
     actual_private_columns = pp.check_addresses(df)
@@ -38,14 +51,3 @@ def test_address_check_returns_empty_list_if_no_addresses_found():
     expected_private_columns = []
 
     assert actual_private_columns == expected_private_columns
-
-
-def test_identifies_UK_postcode_with_tab_separated_sections():
-    df = pd.DataFrame(
-        {"privateColumn": ["a", "AB12   1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
-    )
-
-    actual_private_columns = pp.check_addresses(df)
-    expected_private_columns = ["privateColumn"]
-
-    assert actual_private_columns == expected_private_columns