# Token Signature Outliers for Street Names

Find street names that do not include at least one token from a token signature that represents street names in U.S. address columns. Uses the **NYC Parking Violations Issued - Fiscal Year 2014** dataset.

In [1]:
# Download the full 'NYC Parking Violations Issued - Fiscal Year 2014'
# dataset (Socrata identifier 'jt7v-77mi') into a gzip-compressed file.

import gzip
import os

from openclean.data.source.socrata import Socrata

datafile = './jt7v-77mi.tsv.gz'

# The dataset handle is created unconditionally so that `ds` stays defined
# for any later cell, whether or not the download below is skipped.
ds = Socrata().dataset('jt7v-77mi')

# Skip the (large) download if the file is already present from an earlier
# run. The data is written to a temporary file first and only renamed to its
# final name after the download finished, so an interrupted download never
# leaves a truncated archive that would be mistaken for a complete one.
if not os.path.isfile(datafile):
    tmpfile = datafile + '.part'
    try:
        with gzip.open(tmpfile, 'wb') as f:
            ds.write(f)
        os.replace(tmpfile, datafile)
    finally:
        # Only present here if the download failed before the rename.
        if os.path.exists(tmpfile):
            os.remove(tmpfile)


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'

# Setup the environment for this demo. All downloaded reference
# data files will be stored in a subfolder refdata_tmp.

from refdata.config import ENV_BASEDIR

os.environ[ENV_BASEDIR] = './refdata_tmp'

In [2]:
# Fetch the USPS street abbreviation reference dataset so that it can be
# loaded by the cells below.

from openclean.data import refdata

refdata.download('usps:street_abbrev')

In [3]:
# Open the downloaded file with the openclean streaming function instead of
# reading it eagerly, to avoid having to load the full dataset into memory.
#
# NOTE(review): despite its name, `df` is presumably a stream object rather
# than a pandas DataFrame -- confirm against the openclean documentation.

from openclean.pipeline import stream

df = stream(datafile)

In [4]:
# Reduce the data stream to the distinct values of the 'Street' column.
# The following cells work on this distinct set only, so that a street name
# that occurs many times in the dataset is not evaluated more than once.

street_column = df.select('Street')
streets = street_column.distinct()

street_count = len(streets)
print('{} distinct street names'.format(street_count))

115567 distinct street names


In [5]:
# Build the token signature from the USPS street abbreviation reference data.

from openclean.operator.map.groupby import groupby
from openclean.operator.transform.apply import apply
from openclean.profiling.pattern.token_signature import token_signature

# Load the reference data and convert the values in all columns to lower case.
abbrev_raw = refdata.load('usps:street_abbrev').df()
street_abbrev = apply(abbrev_raw, columns=abbrev_raw.columns, func=str.lower)

# Group the rows by their primary suffix; the signature gets one entry for
# each unique primary suffix, built from the values in all columns.
groups = groupby(street_abbrev, columns='primary_suffix')
abbrev_columns = list(street_abbrev.columns)
signature = token_signature(groups, columns=abbrev_columns)

In [6]:
# Display the token signature. As the output below shows, it is a list of
# sets, each holding the lower-case spellings and abbreviations that belong
# to one street suffix (e.g. {'rd', 'road'}).
signature

[{'hllw', 'hollow', 'hollows', 'holw', 'holws'},
 {'ramp'},
 {'wall'},
 {'clfs', 'cliffs'},
 {'plaza', 'plz', 'plza'},
 {'gardens', 'gdns', 'grdns'},
 {'bch', 'beach'},
 {'lake', 'lk'},
 {'shoars', 'shores', 'shrs'},
 {'unions', 'uns'},
 {'ranch', 'ranches', 'rnch', 'rnchs'},
 {'glens', 'glns'},
 {'div', 'divide', 'dv', 'dvd'},
 {'haven', 'hvn'},
 {'sq', 'sqr', 'sqre', 'squ', 'square'},
 {'rd', 'road'},
 {'pr', 'prairie', 'prr'},
 {'brdge', 'brg', 'bridge'},
 {'exp', 'expr', 'express', 'expressway', 'expw', 'expy'},
 {'isle', 'isles'},
 {'forges', 'frgs'},
 {'skwy', 'skyway'},
 {'bgs', 'burgs'},
 {'point', 'pt'},
 {'glen', 'gln'},
 {'park', 'parks'},
 {'ldg', 'ldge', 'lodg', 'lodge'},
 {'groves', 'grvs'},
 {'knls', 'knolls'},
 {'cen', 'cent', 'center', 'centr', 'centre', 'cnter', 'cntr', 'ctr'},
 {'route', 'rte'},
 {'manor', 'mnr'},
 {'spgs', 'spngs', 'springs', 'sprngs'},
 {'courts', 'cts'},
 {'stream', 'streme', 'strm'},
 {'smt', 'sumit', 'sumitt', 'summit'},
 {'flat', 'flt'},
 {'pik

In [7]:
# Flag every distinct street name that does not match at least one entry of
# the token signature as an outlier.

from openclean.profiling.anomalies.pattern import TokenSignatureOutliers

detector = TokenSignatureOutliers(signature=signature)
outliers = detector.process(streets)

summary = 'found {} outliers in list of {} street names'
print(summary.format(len(outliers), len(streets)))

found 27250 outliers in list of 115567 street names


In [8]:
# Print 100 randomly drawn outliers in sorted order. The generator is seeded
# so that the same values are drawn on every run. Note that choices() draws
# with replacement, i.e. a value may appear more than once in the output.

from random import Random

rng = Random(41)
drawn = rng.choices(outliers, k=100)
drawn.sort()
for street in drawn:
    print(street)

100  C/N OF SCHERMER
100FT PARKING LOT OF
137 STRET
15 FEET EAST OF 5TH
2  S/W C/O DAHILL
37 AVENUE+
53STREET
ATLANTIC AVE3
ATLANTICA
B 121
BENTON
BLDG 72 JFK AIRPORT
BRIGHTON 14
BRONX P EAST
C/O 64 CIRLCE
C/O B 60
C/O E 167 T
C/O E 21
C/O HURON
C/O LENEVAR
C/O W 127
C/O W 188ST
C/O WYCOFF
CNETRAL PK WEST
COLUMBUS NYCH
COMMONWEALTH BL
E 108
E 174ST
E/S 32 P
FIB
FLUSHING MUPPY LOT
FORSYTH STQ
GROTE
HANGAR 1 - 16 A
HERRING
HICKS
I/O CARROLL
INSIDE 188 LINCOLN A
INSIDE CONNISNSHON P
INSIDE FMCP INSIDE
IO E 135
LYDIG
MAC KAY
MANHATTAN AVW
MMPW
N 6
N/B FRANCIS LEWIS BL
N/E C/O E 88
N/E C/O FORSYTHE
N/E C/O YORK
N/E CRNR BAYRIDGE PK
N/S C/R OF MILFORD S
N/W C/O E 35
N/W W 239 STRET
N/W/C JAMAICA
NORDSTRAN
ODELL
OPP 104
P K LOT 566 HAMILTON
PARKING LOT #7
PIKTIN
Q 83 BUS OF
Q4 BUS STOP S/S ARCH
R/O 1455 HARROD
R/O 245-10 FRANCIS L
R/O 2946
R/SIDE BUILDING OF 3
REAR OF 131
REAR OF 48 MONUMENT
REAR OF EVANS
REV JMS PLT
S/B WEBSTER 450  FRO
S/E ARCHER 50FT
S/E C/O RIVINGTON
S/E GOUVERNEUR
S/O 81