# Identify Potential Misspellings in Street Name Column

Find groups of different street names that might be alternative representations of the same street. Uses the **NYC Parking Violations Issued - Fiscal Year 2014** dataset.

In [1]:
# Download the full 'NYC Parking Violations Issued - Fiscal Year 2014' dataset.

import gzip

from openclean.data.source.socrata import Socrata

# Local path of the gzip-compressed, tab-separated copy of the dataset.
datafile = './jt7v-77mi.tsv.gz'

# Resolve the Socrata dataset handle *before* opening the output file.
# Opening with mode 'wb' creates (or truncates) the archive right away, so
# if the lookup raised inside the `with` block an empty file would be left
# behind for the cells that read `datafile` afterwards.
ds = Socrata().dataset('jt7v-77mi')

# Write the dataset into the compressed file.
with gzip.open(datafile, 'wb') as f:
    ds.write(f)


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'

In [2]:
# Get the distinct set of street names. By computing the distinct set of
# street names first, we avoid computing the key for the same street
# name multiple times.

from openclean.pipeline import stream

# Open the downloaded file as a data stream.
df = stream(datafile)

# Distinct values of the 'Street' column. The result is used below like a
# mapping from street name to its number of occurrences (len() gives the
# number of distinct names, values() the per-name counts).
streets = df.select('Street').distinct()

n_distinct = len(streets)
n_total = sum(streets.values())
print('{} distinct streets (for {} total values)'.format(n_distinct, n_total))


115567 distinct streets (for 9100278 total values)


In [3]:
# Cluster street names using key collision (with the default key generator).
# Remove clusters that contain fewer than seven distinct values (for display
# purposes). Use multiple threads (4) to generate value keys in parallel.

from openclean.cluster.key import key_collision

# Minimum cluster size. Use seven as the default for the full dataset (to
# limit the number of clusters that are printed in the next cell).
minsize = 7

# Use minimum cluster size of 2 when using the dataset sample
# minsize = 2

# Group street names whose generated keys collide; only clusters with at
# least `minsize` distinct values are kept. Keys are computed with 4 threads.
clusters = key_collision(
    values=streets,
    minsize=minsize,
    threads=4
)

cluster_count = len(clusters)
print('{} clusters of size {} or greater'.format(cluster_count, minsize))

13 clusters of size 7 or greater


In [4]:
# For each cluster print cluster values, their frequency counts,
# and the suggested common value for the cluster.

def print_cluster(cnumber, cluster):
    """Print a single cluster in a human-readable form.

    The output consists of a header line with the cluster number and size,
    one line per cluster member in the form ``value (count)``, and a final
    line with the value suggested as the common representation.

    Parameters
    ----------
    cnumber: int
        Number that is shown in the cluster header.
    cluster: iterable of (value, count) pairs
        Cluster to print. Must support ``len()`` and provide a
        ``suggestion()`` method that returns the suggested common value.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    members = ['{} ({})'.format(val, count) for val, count in cluster]
    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    # Joining with newlines reproduces the line breaks that separate
    # print() calls for header, members, and footer would emit.
    print('\n'.join([header] + members + [footer]))
    
# Order clusters so that those with the most distinct values come first.
clusters.sort(key=len, reverse=True)

# Print every cluster, numbering them from 1 in sorted order.
for rank, cluster in enumerate(clusters, start=1):
    print_cluster(rank, cluster)


Cluster 1 (of size 8)

2ND AVE (4075)
2nd Ave (67751)
2ND  AVE (5)
2ND AVE. (1)
AVE 2ND (1)
2ND      AVE (1)
2ND    AVE (2)
2ND       AVE (1)

Suggested value: 2nd Ave


Cluster 2 (of size 8)

ST NICHOLAS AVE (2451)
ST. NICHOLAS AVE (125)
St Nicholas Ave (23462)
ST, NICHOLAS AVE (1)
ST NICHOLAS  AVE (9)
ST NICHOLAS   AVE (1)
ST  NICHOLAS AVE (4)
ST. NICHOLAS  AVE (1)

Suggested value: St Nicholas Ave


Cluster 3 (of size 8)

LAWRENCE ST (165)
ST LAWRENCE (34)
LAWRENCE  ST (1)
Lawrence St (2368)
ST. LAWRENCE (2)
ST LAWRENCE ST (1)
LAWRENCE ST. (1)
ST. LAWRENCE ST (1)

Suggested value: Lawrence St


Cluster 4 (of size 8)

ST NICHOLAS (847)
ST NICHOLAS ST (31)
NICHOLAS ST (27)
ST. NICHOLAS (27)
ST  NICHOLAS (2)
ST NICHOLAS  ST (1)
Nicholas St (79)
ST. NICHOLAS ST (1)

Suggested value: ST NICHOLAS


Cluster 5 (of size 7)

W 125 ST (3365)
W 125    ST (1)
W. 125 ST. (1)
W .125 ST (5)
W  125 ST (2)
W 125  ST (1)
W. 125 ST (3)

Suggested value: W 125 ST


Cluster 6 (of size 7)

FERRY LOT 2 (74

In [5]:
# Create mapping of common street type abbreviations to a
# standardized abbreviation. Generated from
# https://github.com/VIDA-NYU/openclean-pattern/blob/master/resources/data/street_abvs.csv
#
# Keys are upper-case spelling variants (full words, misspellings, and
# alternative abbreviations); values are the single form that each variant
# is replaced with. The dictionary is passed to Standardize() when the
# custom key generator is built in a later cell.
#
# Standardized forms such as 'ST' or 'AVE' are not keys themselves;
# presumably tokens without an entry are left unchanged by Standardize()
# (TODO confirm against the openclean documentation).

mapping = {
    'ALLEE': 'ALY',
    'ALLEY': 'ALY',
    'ALLY': 'ALY',
    'ANEX': 'ANX',
    'ANNEX': 'ANX',
    'ANNX': 'ANX',
    'ARCADE': 'ARC',
    'AV': 'AVE',
    'AVEN': 'AVE',
    'AVENU': 'AVE',
    'AVENUE': 'AVE',
    'AVN': 'AVE',
    'AVNUE': 'AVE',
    'BAYOO': 'BYU',
    'BAYOU': 'BYU',
    'BEACH': 'BCH',
    'BEND': 'BND',
    'BLUF': 'BLF',
    'BLUFF': 'BLF',
    'BLUFFS': 'BLFS',
    'BOT': 'BTM',
    'BOTTM': 'BTM',
    'BOTTOM': 'BTM',
    'BOUL': 'BLVD',
    'BOULEVARD': 'BLVD',
    'BOULV': 'BLVD',
    'BRNCH': 'BR',
    'BRANCH': 'BR',
    'BRDGE': 'BRG',
    'BRIDGE': 'BRG',
    'BROOK': 'BRK',
    'BROOKS': 'BRKS',
    'BURG': 'BG',
    'BURGS': 'BGS',
    'BYPA': 'BYP',
    'BYPAS': 'BYP',
    'BYPASS': 'BYP',
    'BYPS': 'BYP',
    'CAMP': 'CP',
    'CMP': 'CP',
    'CANYN': 'CYN',
    'CANYON': 'CYN',
    'CNYN': 'CYN',
    'CAPE': 'CPE',
    'CAUSEWAY': 'CSWY',
    'CAUSWA': 'CSWY',
    'CEN': 'CTR',
    'CENT': 'CTR',
    'CENTER': 'CTR',
    'CENTR': 'CTR',
    'CENTRE': 'CTR',
    'CNTER': 'CTR',
    'CNTR': 'CTR',
    'CENTERS': 'CTRS',
    'CIRC': 'CIR',
    'CIRCL': 'CIR',
    'CIRCLE': 'CIR',
    'CRCL': 'CIR',
    'CRCLE': 'CIR',
    'CIRCLES': 'CIRS',
    'CLIFF': 'CLF',
    'CLIFFS': 'CLFS',
    'CLUB': 'CLB',
    'COMMON': 'CMN',
    'COMMONS': 'CMNS',
    'CORNER': 'COR',
    'CORNERS': 'CORS',
    'COURSE': 'CRSE',
    'COURT': 'CT',
    'COURTS': 'CTS',
    'COVE': 'CV',
    'COVES': 'CVS',
    'CREEK': 'CRK',
    'CRESCENT': 'CRES',
    'CRSENT': 'CRES',
    'CRSNT': 'CRES',
    'CREST': 'CRST',
    'CROSSING': 'XING',
    'CRSSNG': 'XING',
    'CROSSROAD': 'XRD',
    'CROSSROADS': 'XRDS',
    'CURVE': 'CURV',
    'DALE': 'DL',
    'DAM': 'DM',
    'DIV': 'DV',
    'DIVIDE': 'DV',
    'DVD': 'DV',
    'DRIV': 'DR',
    'DRIVE': 'DR',
    'DRV': 'DR',
    'DRIVES': 'DRS',
    'ESTATE': 'EST',
    'ESTATES': 'ESTS',
    'EXP': 'EXPY',
    'EXPR': 'EXPY',
    'EXPRESS': 'EXPY',
    'EXPRESSWAY': 'EXPY',
    'EXPW': 'EXPY',
    'EXTENSION': 'EXT',
    'EXTN': 'EXT',
    'EXTNSN': 'EXT',
    'FALLS': 'FLS',
    'FERRY': 'FRY',
    'FRRY': 'FRY',
    'FIELD': 'FLD',
    'FIELDS': 'FLDS',
    'FLAT': 'FLT',
    'FLATS': 'FLTS',
    'FORD': 'FRD',
    'FORDS': 'FRDS',
    'FOREST': 'FRST',
    'FORESTS': 'FRST',
    'FORG': 'FRG',
    'FORGE': 'FRG',
    'FORGES': 'FRGS',
    'FORK': 'FRK',
    'FORKS': 'FRKS',
    'FORT': 'FT',
    'FRT': 'FT',
    'FREEWAY': 'FWY',
    'FREEWY': 'FWY',
    'FRWAY': 'FWY',
    'FRWY': 'FWY',
    'GARDEN': 'GDN',
    'GARDN': 'GDN',
    'GRDEN': 'GDN',
    'GRDN': 'GDN',
    'GARDENS': 'GDNS',
    'GRDNS': 'GDNS',
    'GATEWAY': 'GTWY',
    'GATEWY': 'GTWY',
    'GATWAY': 'GTWY',
    'GTWAY': 'GTWY',
    'GLEN': 'GLN',
    'GLENS': 'GLNS',
    'GREEN': 'GRN',
    'GREENS': 'GRNS',
    'GROV': 'GRV',
    'GROVE': 'GRV',
    'GROVES': 'GRVS',
    'HARB': 'HBR',
    'HARBOR': 'HBR',
    'HARBR': 'HBR',
    'HRBOR': 'HBR',
    'HARBORS': 'HBRS',
    'HAVEN': 'HVN',
    'HT': 'HTS',
    'HIGHWAY': 'HWY',
    'HIGHWY': 'HWY',
    'HIWAY': 'HWY',
    'HIWY': 'HWY',
    'HWAY': 'HWY',
    'HILL': 'HL',
    'HILLS': 'HLS',
    'HLLW': 'HOLW',
    'HOLLOW': 'HOLW',
    'HOLLOWS': 'HOLW',
    'HOLWS': 'HOLW',
    'ISLAND': 'IS',
    'ISLND': 'IS',
    'ISLANDS': 'ISS',
    'ISLNDS': 'ISS',
    'ISLES': 'ISLE',
    'JCTION': 'JCT',
    'JCTN': 'JCT',
    'JUNCTION': 'JCT',
    'JUNCTN': 'JCT',
    'JUNCTON': 'JCT',
    'JCTNS': 'JCTS',
    'JUNCTIONS': 'JCTS',
    'KEY': 'KY',
    'KEYS': 'KYS',
    'KNOL': 'KNL',
    'KNOLL': 'KNL',
    'KNOLLS': 'KNLS',
    'LAKE': 'LK',
    'LAKES': 'LKS',
    'LANDING': 'LNDG',
    'LNDNG': 'LNDG',
    'LANE': 'LN',
    'LIGHT': 'LGT',
    'LIGHTS': 'LGTS',
    'LOAF': 'LF',
    'LOCK': 'LCK',
    'LOCKS': 'LCKS',
    'LDGE': 'LDG',
    'LODG': 'LDG',
    'LODGE': 'LDG',
    'LOOPS': 'LOOP',
    'MANOR': 'MNR',
    'MANORS': 'MNRS',
    'MEADOW': 'MDW',
    # NOTE(review): 'MEADOW' is standardized to 'MDW' (previous entry), but
    # 'MDW' itself is mapped on to 'MDWS'. If the mapping is applied once per
    # token, 'MEADOW' and 'MDW' end up with different standardized forms.
    # Presumably inherited from the source CSV -- verify that this is intended.
    'MDW': 'MDWS',
    'MEADOWS': 'MDWS',
    'MEDOWS': 'MDWS',
    'MILL': 'ML',
    'MILLS': 'MLS',
    'MISSN': 'MSN',
    'MSSN': 'MSN',
    'MOTORWAY': 'MTWY',
    'MNT': 'MT',
    'MOUNT': 'MT',
    'MNTAIN': 'MTN',
    'MNTN': 'MTN',
    'MOUNTAIN': 'MTN',
    'MOUNTIN': 'MTN',
    'MTIN': 'MTN',
    'MNTNS': 'MTNS',
    'MOUNTAINS': 'MTNS',
    'NECK': 'NCK',
    'ORCHARD': 'ORCH',
    'ORCHRD': 'ORCH',
    'OVL': 'OVAL',
    'OVERPASS': 'OPAS',
    'PRK': 'PARK',
    'PARKS': 'PARK',
    'PARKWAY': 'PKWY',
    'PARKWY': 'PKWY',
    'PKWAY': 'PKWY',
    'PKY': 'PKWY',
    'PARKWAYS': 'PKWY',
    'PKWYS': 'PKWY',
    'PASSAGE': 'PSGE',
    'PATHS': 'PATH',
    'PIKES': 'PIKE',
    'PINE': 'PNE',
    'PINES': 'PNES',
    'PLAIN': 'PLN',
    'PLAINS': 'PLNS',
    'PLAZA': 'PLZ',
    'PLZA': 'PLZ',
    'POINT': 'PT',
    'POINTS': 'PTS',
    'PORT': 'PRT',
    'PORTS': 'PRTS',
    'PRAIRIE': 'PR',
    'PRR': 'PR',
    'RAD': 'RADL',
    'RADIAL': 'RADL',
    'RADIEL': 'RADL',
    'RANCH': 'RNCH',
    'RANCHES': 'RNCH',
    'RNCHS': 'RNCH',
    'RAPID': 'RPD',
    'RAPIDS': 'RPDS',
    'REST': 'RST',
    'RDGE': 'RDG',
    'RIDGE': 'RDG',
    'RIDGES': 'RDGS',
    'RIVER': 'RIV',
    'RVR': 'RIV',
    'RIVR': 'RIV',
    'ROAD': 'RD',
    'ROADS': 'RDS',
    'ROUTE': 'RTE',
    'SHOAL': 'SHL',
    'SHOALS': 'SHLS',
    'SHOAR': 'SHR',
    'SHORE': 'SHR',
    'SHOARS': 'SHRS',
    'SHORES': 'SHRS',
    'SKYWAY': 'SKWY',
    'SPNG': 'SPG',
    'SPRING': 'SPG',
    'SPRNG': 'SPG',
    'SPNGS': 'SPGS',
    'SPRINGS': 'SPGS',
    'SPRNGS': 'SPGS',
    'SPURS': 'SPUR',
    'SQR': 'SQ',
    'SQRE': 'SQ',
    'SQU': 'SQ',
    'SQUARE': 'SQ',
    'SQRS': 'SQS',
    'SQUARES': 'SQS',
    'STATION': 'STA',
    'STATN': 'STA',
    'STN': 'STA',
    'STRAV': 'STRA',
    'STRAVEN': 'STRA',
    'STRAVENUE': 'STRA',
    'STRAVN': 'STRA',
    'STRVN': 'STRA',
    'STRVNUE': 'STRA',
    'STREAM': 'STRM',
    'STREME': 'STRM',
    'STREET': 'ST',
    'STRT': 'ST',
    'STR': 'ST',
    'STREETS': 'STS',
    'SUMIT': 'SMT',
    'SUMITT': 'SMT',
    'SUMMIT': 'SMT',
    'TERR': 'TER',
    'TERRACE': 'TER',
    'THROUGHWAY': 'TRWY',
    'TRACE': 'TRCE',
    'TRACES': 'TRCE',
    'TRACK': 'TRAK',
    'TRACKS': 'TRAK',
    'TRK': 'TRAK',
    'TRKS': 'TRAK',
    'TRAFFICWAY': 'TRFY',
    'TRAIL': 'TRL',
    'TRAILS': 'TRL',
    'TRLS': 'TRL',
    'TRAILER': 'TRLR',
    'TRLRS': 'TRLR',
    'TUNEL': 'TUNL',
    'TUNLS': 'TUNL',
    'TUNNEL': 'TUNL',
    'TUNNELS': 'TUNL',
    'TUNNL': 'TUNL',
    'TRNPK': 'TPKE',
    'TURNPIKE': 'TPKE',
    'TURNPK': 'TPKE',
    'UNDERPASS': 'UPAS',
    'UNION': 'UN',
    'UNIONS': 'UNS',
    'VALLEY': 'VLY',
    'VALLY': 'VLY',
    'VLLY': 'VLY',
    'VALLEYS': 'VLYS',
    'VDCT': 'VIA',
    'VIADCT': 'VIA',
    'VIADUCT': 'VIA',
    'VIEW': 'VW',
    'VIEWS': 'VWS',
    'VILL': 'VLG',
    'VILLAG': 'VLG',
    'VILLAGE': 'VLG',
    'VILLG': 'VLG',
    'VILLIAGE': 'VLG',
    'VILLAGES': 'VLGS',
    'VILLE': 'VL',
    'VIST': 'VIS',
    'VISTA': 'VIS',
    'VST': 'VIS',
    'VSTA': 'VIS',
    'WALKS': 'WALK',
    'WY': 'WAY',
    'WELL': 'WL',
    'WELLS': 'WLS',
    # Compass directions: unlike the street types above, single-letter
    # abbreviations are expanded to the full word.
    'E': 'EAST',
    'W': 'WEST',
    'N': 'NORTH',
    'S': 'SOUTH'
}

In [6]:
# Use a custom key generator that normalizes street type abbreviations and
# splits strings by character type, e.g., 'W35ST' is tokenized as ['W', '35', 'ST']

from openclean.function.token.base import Tokens, UpdateTokens
from openclean.function.token.split import ChartypeSplit
from openclean.function.value.mapping import Standardize

# Split values into tokens by character type (alphabetic vs. digit), so
# that letters and digits that are written together become separate tokens.
splitter = ChartypeSplit(chartypes=[str.isalpha, str.isdigit])

# Replace tokens that have an entry in the abbreviation mapping with their
# standardized form.
normalizer = UpdateTokens(Standardize(mapping))

# Key generator: tokens are standardized, sorted, de-duplicated, and
# concatenated with a single blank as the delimiter.
tokenizer = Tokens(
    tokenizer=splitter,
    transformer=normalizer,
    delim=' ',
    sort=True,
    unique=True
)

# Re-run key collision clustering with the custom key generator.
clusters = key_collision(
    values=streets,
    func=tokenizer,
    minsize=minsize,
    threads=4
)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))

222 clusters of size 7 or greater


In [7]:
# Print the top 10 clusters (in number of distinct values)

    
# Sort clusters by decreasing number of distinct values.
clusters.sort(key=lambda c: len(c), reverse=True)

# Maximum number of clusters to print.
top_n = 10

# Bound the loop by the number of clusters that were actually found. A fixed
# range(10) raises an IndexError whenever fewer than ten clusters exist
# (e.g., when running on the smaller dataset sample).
for i in range(min(top_n, len(clusters))):
    print_cluster(i + 1, clusters[i])


Cluster 1 (of size 18)

EAST 21 ST (72)
E 21 ST (720)
E 21ST ST (66)
EAST 21 STREET (39)
E 21ST STREET (21)
EAST 21ST STREET (11)
E 21 STREET (87)
EAST 21ST ST (14)
E 21 ST STREET (4)
E 21 STR (2)
EAST 21ST (2)
E 21 ST ST (10)
E21 ST (6)
E21ST ST (1)
E 21ST (6)
EAST 21 ST STREET (1)
EAST21ST ST (1)
EAST E 21 ST (1)

Suggested value: E 21 ST


Cluster 2 (of size 18)

WEST 31 ST ST (3)
W 31 ST (552)
W 31ST ST (50)
WEST 31 STREET (118)
WEST 31ST STREET (12)
WEST 31ST ST (26)
WEST 31 ST (99)
W 31 STREET (66)
W 31 STRT (11)
W 31ST STREET (17)
W 31ST (8)
WEST 31ST (1)
WEST 31 ST STREET (2)
W 31 ST ST (1)
WEST W 31 ST (1)
W 31 STR (10)
W31 ST (1)
W 31 ST STREET (1)

Suggested value: W 31 ST


Cluster 3 (of size 17)

E 31 ST (341)
E 31ST ST (33)
EAST 31 ST (55)
EAST 31 STREET (44)
E 31 STREET (48)
EAST 31ST STREET (5)
E 31ST STREET (8)
EAST 31ST (3)
E 31 STR (6)
E 31ST (3)
EAST 31ST ST (4)
E 31 ST STREET (1)
E 31STREET (1)
E 31 STRT (1)
E31 ST (2)
E 31 ST ST (3)
EAST E 31 ST (1)

Suggested val