Skip to content

Commit

Permalink
Merge pull request #6 from TTitcombe/identify_street_names
Browse files Browse the repository at this point in the history
Identify street names
  • Loading branch information
TTitcombe committed Feb 25, 2020
2 parents f8a35b0 + bee3029 commit 10176fc
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 29 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ repos:
rev: 19.3b0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-isort
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.20
hooks:
- id: isort
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
# unit testing
- pytest >=4.4 # for unit testing
# code quality
- pre-commit # for automatic code quality checking
- black # for automatic code formatting
- isort # for import standardization
- flake8 # for linting
Expand Down
2 changes: 1 addition & 1 deletion privacypanda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .addresses import check_addresses
from .addresses import *

__version__ = "0.1.0dev"
30 changes: 18 additions & 12 deletions privacypanda/addresses.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,34 @@
import numpy as np
import pandas as pd

__all__ = ["check_addresses"]

OBJECT_DTYPE = np.dtype("O")

# Regex constants
# ----- Regex constants ----- #
LETTER = "[a-zA-Z]"

# UK postcode: two letters, one or two digits, whitespace, one digit, two letters
# (e.g. "AB1 1AB" or "AB12 1AB"). The pieces are listed in order and joined so
# each component of the pattern can be read on its own.
UK_POSTCODE_PATTERN = re.compile(
    "".join([LETTER, LETTER, r"\d{1,2}", r"\s+", r"\d", LETTER, LETTER])
)

# Street names
# NOTE: this must be an alternation group, not a character class. Written as
# "[street|road|way|avenue]" it would match any *single* character out of
# that set (e.g. "t" or "|"), so almost any "<number> <word> <word>" string
# would be treated as an address.
STREET_ENDINGS = r"(?:street|road|way|avenue)"

# A simple address is a number of up to four digits, whitespace, a street name
# of 1-10 letters, whitespace, then one of "street", "road", "way" or "avenue".
# Matching is case-insensitive.
SIMPLE_ADDRESS_PATTERN = re.compile(
    r"[0-9]{1,4}\s[a-z]{1,10}\s" + STREET_ENDINGS, re.I
)


def check_addresses(df: pd.DataFrame) -> List:
"""
Check a dataframe for columns containing addresses. Returns a list of column
names which contain at least one address
"Addresses" currently only concerns UK postcodes, which are of the form:
* Two letters
* One or two digits
* Whitespace
* One digit
* Two letters
E.g.:
* AB1 1AB
* AB12 1AB
"Addresses" currently only concerns UK postcodes and simple street names.
This implementation does not consider whether the addresses are real.
Parameters
Expand All @@ -41,7 +46,6 @@ def check_addresses(df: pd.DataFrame) -> List:
-------
List
The names of columns which contain at least one address
"""
private_cols = []

Expand All @@ -52,7 +56,9 @@ def check_addresses(df: pd.DataFrame) -> List:
if row.dtype == OBJECT_DTYPE:
for item in row:

if UK_POSTCODE_PATTERN.match(item):
if UK_POSTCODE_PATTERN.match(item) or SIMPLE_ADDRESS_PATTERN.match(
item
):
private_cols.append(col)
break # 1 failure is enough

Expand Down
32 changes: 17 additions & 15 deletions tests/test_address_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import privacypanda as pp


def test_can_identify_column_containing_simple_UK_postcode():
@pytest.mark.parametrize("postcode", ["AB1 1AB", "AB12 1AB", "AB1 1AB"])
def test_can_identify_column_containing_UK_postcode(postcode):
df = pd.DataFrame(
{"privateColumn": ["a", "AB1 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
{"privateColumn": ["a", postcode, "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
Expand All @@ -18,9 +19,21 @@ def test_can_identify_column_containing_simple_UK_postcode():
assert actual_private_columns == expected_private_columns


def test_can_identify_column_containing_simple_UK_postcode_with_extra_digit():
@pytest.mark.parametrize(
"address",
[
"10 Downing Street",
"10 downing street",
"1 the Road",
"01 The Road",
"1234 The Road",
"55 Maple Avenue",
"4 Python Way",
],
)
def test_can_identify_column_containing_simple_street_names(address):
df = pd.DataFrame(
{"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
{"privateColumn": ["a", address, "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
Expand All @@ -38,14 +51,3 @@ def test_address_check_returns_empty_list_if_no_addresses_found():
expected_private_columns = []

assert actual_private_columns == expected_private_columns


def test_identifies_UK_postcode_with_tab_separated_sections():
df = pd.DataFrame(
{"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
expected_private_columns = ["privateColumn"]

assert actual_private_columns == expected_private_columns

0 comments on commit 10176fc

Please sign in to comment.