diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ef992f4..ff98b7b 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
   rev: 19.3b0
   hooks:
   - id: black
-- repo: https://github.com/pre-commit/mirrors-isort
+- repo: https://github.com/pre-commit/mirrors-isort
   rev: v4.3.20
   hooks:
   - id: isort
diff --git a/environment.yml b/environment.yml
index eba697d..9547fa0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -13,6 +13,7 @@ dependencies:
   # unit testing
   - pytest >=4.4  # for unit testing
   # code quality
+  - pre-commit  # for automatic code quality checking
   - black  # for automatic code formatting
   - isort  # for import standardization
   - flake8  # for linting
diff --git a/privacypanda/__init__.py b/privacypanda/__init__.py
index 6a4a8db..a13b45a 100644
--- a/privacypanda/__init__.py
+++ b/privacypanda/__init__.py
@@ -1,3 +1,3 @@
-from .addresses import check_addresses
+from .addresses import *

 __version__ = "0.1.0dev"
diff --git a/privacypanda/addresses.py b/privacypanda/addresses.py
index 05c4128..5198ecc 100644
--- a/privacypanda/addresses.py
+++ b/privacypanda/addresses.py
@@ -7,29 +7,34 @@
 import numpy as np
 import pandas as pd

+__all__ = ["check_addresses"]
+
 OBJECT_DTYPE = np.dtype("O")

-# Regex constants
+# ----- Regex constants ----- #
 LETTER = "[a-zA-Z]"
+
+# UK Postcode
 UK_POSTCODE_PATTERN = re.compile(
     LETTER + LETTER + "\\d{1,2}" + "\\s+" + "\\d" + LETTER + LETTER
 )

+# Street-name endings: an alternation group (a [...] class would match one char)
+STREET_ENDINGS = "(?:street|road|way|avenue)"
+
+# Simple address is up to a four digit number + street name with 1-10 characters
+# + one of "road", "street", "way", "avenue"
+SIMPLE_ADDRESS_PATTERN = re.compile(
+    "[0-9]{1,4}\\s[a-z]{1,10}\\s" + STREET_ENDINGS, re.I
+)
+

 def check_addresses(df: pd.DataFrame) -> List:
     """
     Check a dataframe for columns containing addresses.
     Returns a list of column names which contain at least one address

-    "Addresses" currently only concerns UK postcodes, which are of the form:
-      * Two letters
-      * One or two digits
-      * Whitespace
-      * One digit
-      * Two letters
-    E.g.:
-      * AB1 1AB
-      * AB12 1AB
+    "Addresses" currently only concerns UK postcodes and simple street names.
     This implementation does not consider whether the addresses are real.

     Parameters
@@ -41,7 +46,6 @@ def check_addresses(df: pd.DataFrame) -> List:
     -------
     List
         The names of columns which contain at least one address
-
     """

     private_cols = []
@@ -52,7 +56,9 @@ def check_addresses(df: pd.DataFrame) -> List:

         if row.dtype == OBJECT_DTYPE:
             for item in row:
-                if UK_POSTCODE_PATTERN.match(item):
+                if UK_POSTCODE_PATTERN.match(item) or SIMPLE_ADDRESS_PATTERN.match(
+                    item
+                ):
                     private_cols.append(col)
                     break  # 1 failure is enough

diff --git a/tests/test_address_identification.py b/tests/test_address_identification.py
index 09bf2e3..f6100c0 100644
--- a/tests/test_address_identification.py
+++ b/tests/test_address_identification.py
@@ -7,9 +7,10 @@
 import privacypanda as pp


-def test_can_identify_column_containing_simple_UK_postcode():
+@pytest.mark.parametrize("postcode", ["AB1 1AB", "AB12 1AB", "AB1\t1AB"])
+def test_can_identify_column_containing_UK_postcode(postcode):
     df = pd.DataFrame(
-        {"privateColumn": ["a", "AB1 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
+        {"privateColumn": ["a", postcode, "c"], "nonPrivateColumn": ["a", "b", "c"]}
     )

     actual_private_columns = pp.check_addresses(df)
@@ -18,9 +19,21 @@ def test_can_identify_column_containing_simple_UK_postcode():
     assert actual_private_columns == expected_private_columns


-def test_can_identify_column_containing_simple_UK_postcode_with_extra_digit():
+@pytest.mark.parametrize(
+    "address",
+    [
+        "10 Downing Street",
+        "10 downing street",
+        "1 the Road",
+        "01 The Road",
+        "1234 The Road",
+        "55 Maple Avenue",
+        "4 Python Way",
+    ],
+)
+def test_can_identify_column_containing_simple_street_names(address):
     df = pd.DataFrame(
-        {"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
+        {"privateColumn": ["a", address, "c"], "nonPrivateColumn": ["a", "b", "c"]}
     )

     actual_private_columns = pp.check_addresses(df)
@@ -38,14 +51,3 @@ def test_address_check_returns_empty_list_if_no_addresses_found():
     expected_private_columns = []

     assert actual_private_columns == expected_private_columns
-
-
-def test_identifies_UK_postcode_with_tab_separated_sections():
-    df = pd.DataFrame(
-        {"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
-    )
-
-    actual_private_columns = pp.check_addresses(df)
-    expected_private_columns = ["privateColumn"]
-
-    assert actual_private_columns == expected_private_columns