Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Identify street names #6

Merged
merged 5 commits into from
Feb 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ repos:
rev: 19.3b0
hooks:
- id: black
- repo: https://github.com/pre-commit/mirrors-isort
- repo: https://github.com/pre-commit/mirrors-isort
rev: v4.3.20
hooks:
- id: isort
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies:
# unit testing
- pytest >=4.4 # for unit testing
# code quality
- pre-commit # for automatic code quality checking
- black # for automatic code formatting
- isort # for import standardization
- flake8 # for linting
Expand Down
2 changes: 1 addition & 1 deletion privacypanda/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .addresses import check_addresses
from .addresses import *

__version__ = "0.1.0dev"
30 changes: 18 additions & 12 deletions privacypanda/addresses.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,34 @@
import numpy as np
import pandas as pd

__all__ = ["check_addresses"]

OBJECT_DTYPE = np.dtype("O")

# Regex constants
# ----- Regex constants ----- #
LETTER = "[a-zA-Z]"

# UK postcode, assembled piece by piece: two letters, one or two digits,
# one or more whitespace characters, one digit, then two letters.
UK_POSTCODE_PATTERN = re.compile(
    "".join([LETTER, LETTER, "\\d{1,2}", "\\s+", "\\d", LETTER, LETTER])
)

# Street names
# Non-capturing alternation of whole words. Square brackets would instead build
# a character class that matches any single one of these letters (or "|").
STREET_ENDINGS = "(?:street|road|way|avenue)"

# Simple address is up to a four digit number + street name with 1-10 characters
# + one of "road", "street", "way", "avenue" (case-insensitive)
SIMPLE_ADDRESS_PATTERN = re.compile(
    "[0-9]{1,4}\\s[a-z]{1,10}\\s" + STREET_ENDINGS, re.I
)


def check_addresses(df: pd.DataFrame) -> List:
"""
Check a dataframe for columns containing addresses. Returns a list of column
names which contain at least one address

"Addresses" currently only concerns UK postcodes, which are of the form:
* Two letters
* One or two digits
* Whitespace
* One digit
* Two letters
E.g.:
* AB1 1AB
* AB12 1AB
"Addresses" currently only concerns UK postcodes and simple street names.
This implementation does not consider whether the addresses are real.

Parameters
Expand All @@ -41,7 +46,6 @@ def check_addresses(df: pd.DataFrame) -> List:
-------
List
The names of columns which contain at least one address

"""
private_cols = []

Expand All @@ -52,7 +56,9 @@ def check_addresses(df: pd.DataFrame) -> List:
if row.dtype == OBJECT_DTYPE:
for item in row:

if UK_POSTCODE_PATTERN.match(item):
if UK_POSTCODE_PATTERN.match(item) or SIMPLE_ADDRESS_PATTERN.match(
item
):
private_cols.append(col)
break # 1 failure is enough

Expand Down
32 changes: 17 additions & 15 deletions tests/test_address_identification.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import privacypanda as pp


def test_can_identify_column_containing_simple_UK_postcode():
@pytest.mark.parametrize("postcode", ["AB1 1AB", "AB12 1AB", "AB1 1AB"])
def test_can_identify_column_containing_UK_postcode(postcode):
df = pd.DataFrame(
{"privateColumn": ["a", "AB1 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
{"privateColumn": ["a", postcode, "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
Expand All @@ -18,9 +19,21 @@ def test_can_identify_column_containing_simple_UK_postcode():
assert actual_private_columns == expected_private_columns


def test_can_identify_column_containing_simple_UK_postcode_with_extra_digit():
@pytest.mark.parametrize(
"address",
[
"10 Downing Street",
"10 downing street",
"1 the Road",
"01 The Road",
"1234 The Road",
"55 Maple Avenue",
"4 Python Way",
],
)
def test_can_identify_column_containing_simple_street_names(address):
df = pd.DataFrame(
{"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
{"privateColumn": ["a", address, "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
Expand All @@ -38,14 +51,3 @@ def test_address_check_returns_empty_list_if_no_addresses_found():
expected_private_columns = []

assert actual_private_columns == expected_private_columns


def test_identifies_UK_postcode_with_tab_separated_sections():
df = pd.DataFrame(
{"privateColumn": ["a", "AB12 1AB", "c"], "nonPrivateColumn": ["a", "b", "c"]}
)

actual_private_columns = pp.check_addresses(df)
expected_private_columns = ["privateColumn"]

assert actual_private_columns == expected_private_columns