# Clustering and Standardization of Street Names

This example demonstrates the specialized collision key generator and standardization operator for U.S. street names. It uses the **NYC Parking Violations Issued - Fiscal Year 2014** dataset.

In [1]:
# Download the full 'NYC Parking Violations' dataset.
# Note that the dataset file is over 380MB in size.

import gzip
import os

from openclean.data.source.socrata import Socrata

dataset_id = 'jt7v-77mi'
dataset = Socrata().dataset(dataset_id)
print(dataset.name)

datafile = './{}.tsv.gz'.format(dataset_id)

# Download file only if it does not exist already.
if not os.path.isfile(datafile):
    # Write to a temporary file first and move it into place only after the
    # download finished. Otherwise an interrupted download would leave a
    # truncated file behind that the check above treats as complete.
    partial_file = datafile + '.part'
    with gzip.open(partial_file, 'wb') as f:
        print('Downloading ...\n')
        # Write via the Socrata dataset handle created above. (The stream
        # variable `ds` is only defined in a later cell and must not be
        # used here, or the cell fails on a fresh kernel.)
        dataset.write(f)
    os.replace(partial_file, datafile)
        


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'

Parking Violations Issued - Fiscal Year 2014


In [2]:
# Create a data stream for the downloaded file to avoid having
# to load the whole file into main memory.

from openclean.pipeline import stream

# `ds` is the stream object that the following cells use to inspect the
# schema and to select and transform the 'Street' column.
ds = stream(datafile)

In [3]:
# Show the column names in the dataset schema.

ds.columns

['Summons Number',
 'Plate ID',
 'Registration State',
 'Plate Type',
 'Issue Date',
 'Violation Code',
 'Vehicle Body Type',
 'Vehicle Make',
 'Issuing Agency',
 'Street Code1',
 'Street Code2',
 'Street Code3',
 'Vehicle Expiration Date',
 'Violation Location',
 'Violation Precinct',
 'Issuer Precinct',
 'Issuer Code',
 'Issuer Command',
 'Issuer Squad',
 'Violation Time',
 'Time First Observed',
 'Violation County',
 'Violation In Front Of Or Opposite',
 'Number',
 'Street',
 'Intersecting Street',
 'Date First Observed',
 'Law Section',
 'Sub Division',
 'Violation Legal Code',
 'Days Parking In Effect    ',
 'From Hours In Effect',
 'To Hours In Effect',
 'Vehicle Color',
 'Unregistered Vehicle?',
 'Vehicle Year',
 'Meter Number',
 'Feet From Curb',
 'Violation Post Code',
 'Violation Description',
 'No Standing or Stopping Violation',
 'Hydrant Violation',
 'Double Parking Violation']

In [4]:
# Cluster street names using 'Key Collision' clustering with the
# default fingerprint key generator.

# Restrict the stream to the 'Street' column.
streets = ds.select('Street')

from openclean.cluster.key import KeyCollision
from openclean.function.value.key.fingerprint import Fingerprint

clusters = streets.cluster(clusterer=KeyCollision(func=Fingerprint()))

In [5]:
# Define simple helper method to print the k largest clusters.

def print_k_clusters(clusters, k=10):
    """Print a summary line followed by the k largest clusters.

    Clusters are ranked by their number of distinct values (``len``),
    largest first. For each printed cluster, every value is listed together
    with the count yielded by the cluster's ``items()``. The empty string is
    shown as ``''`` so that it remains visible in the output.

    Parameters
    ----------
    clusters: iterable
        Clusters to print. Each cluster has to support ``len()`` and
        ``items()``.
    k: int, default=10
        Maximum number of clusters that are printed.
    """
    ranked = sorted(clusters, key=len, reverse=True)
    total_values = sum(len(cluster) for cluster in ranked)
    print('Total number of clusters is {} with {} values'.format(len(ranked), total_values))
    # Clamp at zero so that a negative k prints no clusters at all.
    for rank, cluster in enumerate(ranked[:max(k, 0)], start=1):
        print('\nCluster {}'.format(rank))
        for value, count in cluster.items():
            label = "''" if value == '' else value
            print('  {} (x {})'.format(label, count))

In [6]:
# Print the largest clusters (at most 10, the helper's default for k) that
# the fingerprint key produced for the unmodified street names.
print_k_clusters(clusters)

Total number of clusters is 8478 with 18836 values

Cluster 1
  2ND AVE (x 4075)
  2nd Ave (x 67751)
  2ND  AVE (x 5)
  2ND AVE. (x 1)
  AVE 2ND (x 1)
  2ND      AVE (x 1)
  2ND    AVE (x 2)
  2ND       AVE (x 1)

Cluster 2
  ST NICHOLAS AVE (x 2451)
  ST. NICHOLAS AVE (x 125)
  St Nicholas Ave (x 23462)
  ST, NICHOLAS AVE (x 1)
  ST NICHOLAS  AVE (x 9)
  ST NICHOLAS   AVE (x 1)
  ST  NICHOLAS AVE (x 4)
  ST. NICHOLAS  AVE (x 1)

Cluster 3
  LAWRENCE ST (x 165)
  ST LAWRENCE (x 34)
  LAWRENCE  ST (x 1)
  Lawrence St (x 2368)
  ST. LAWRENCE (x 2)
  ST LAWRENCE ST (x 1)
  LAWRENCE ST. (x 1)
  ST. LAWRENCE ST (x 1)

Cluster 4
  ST NICHOLAS (x 847)
  ST NICHOLAS ST (x 31)
  NICHOLAS ST (x 27)
  ST. NICHOLAS (x 27)
  ST  NICHOLAS (x 2)
  ST NICHOLAS  ST (x 1)
  Nicholas St (x 79)
  ST. NICHOLAS ST (x 1)

Cluster 5
  W 125 ST (x 3365)
  W 125    ST (x 1)
  W. 125 ST. (x 1)
  W .125 ST (x 5)
  W  125 ST (x 2)
  W 125  ST (x 1)
  W. 125 ST (x 3)

Cluster 6
  FERRY LOT 2 (x 743)
  FERRY LOT #2 

In [7]:
# Normalize case first: map every street name to upper case, then
# cluster the result with the default fingerprint key generator.

streets = (
    ds
    .select('Street')
    .update('Street', str.upper)
)

clusters = streets.cluster(
    clusterer=KeyCollision(func=Fingerprint())
)

In [8]:
# Print the largest clusters (at most 10, the helper's default for k) for
# the upper-cased street names.
print_k_clusters(clusters)

Total number of clusters is 4119 with 9164 values

Cluster 1
  W 125 ST (x 3365)
  W 125    ST (x 1)
  W. 125 ST. (x 1)
  W .125 ST (x 5)
  W  125 ST (x 2)
  W 125  ST (x 1)
  W. 125 ST (x 3)

Cluster 2
  FERRY LOT 2 (x 743)
  FERRY LOT #2 (x 140)
  FERRY  LOT #2 (x 1)
  FERRY LOT  2 (x 3)
  FERRY LOT # 2 (x 121)
  FERRY LOT  # 2 (x 2)
  FERRY LOT  #2 (x 1)

Cluster 3
  2ND AVE (x 71826)
  2ND  AVE (x 5)
  2ND AVE. (x 1)
  AVE 2ND (x 1)
  2ND      AVE (x 1)
  2ND    AVE (x 2)
  2ND       AVE (x 1)

Cluster 4
  ST NICHOLAS AVE (x 25913)
  ST. NICHOLAS AVE (x 125)
  ST, NICHOLAS AVE (x 1)
  ST NICHOLAS  AVE (x 9)
  ST NICHOLAS   AVE (x 1)
  ST  NICHOLAS AVE (x 4)
  ST. NICHOLAS  AVE (x 1)

Cluster 5
  LGA TERMINAL B (x 26)
  LGA, TERMINAL B (x 1)
  LGA/ TERMINAL B (x 1)
  TERMINAL B LGA (x 20)
  TERMINAL B - LGA (x 2)
  TERMINAL B -LGA (x 1)
  LGA TERMINAL B, (x 1)

Cluster 6
  EL GRANT HWY (x 67)
  E.L GRANT HWY (x 10)
  E.L. GRANT HWY (x 19)
  EL GRANT    HWY (x 1)
  EL. GRANT HWY (x 2

In [9]:
# Use a key generator that was specifically designed for street names.

# NOTE(review): KeyCollision is already imported in an earlier cell; the
# repeated import is redundant but harmless.
from openclean.cluster.key import KeyCollision
from openclean_geo.address.usstreet import USStreetNameKey

# `streets` is not reassigned here, so it still refers to the upper-cased
# street names selected in the preceding clustering cell.
clusters = streets.cluster(clusterer=KeyCollision(func=USStreetNameKey()))

In [10]:
# Print the largest clusters (at most 10, the helper's default for k) that
# the street-name-specific key generator produced.
print_k_clusters(clusters)

Total number of clusters is 10386 with 33342 values

Cluster 1
  W 43 STREET (x 200)
  W 43 ST (x 1666)
  W 43RD ST (x 19864)
  WEST 43 STREET (x 425)
  W 43RD STREET (x 52)
  WEST 43RD ST (x 147)
  WEST 43 ST (x 366)
  WEST 43RD STREET (x 210)
  W 43ST (x 11)
  W 43 RD ST (x 8)
  W 43TH ST (x 10)
  WEST 43RD  STREET (x 1)
  WEST  43 ST (x 1)
  W.43 RD ST (x 1)
  W.43 STREET (x 3)
  W.43RD ST (x 1)
  W.43 ST (x 9)
  WEST 43ST (x 10)
  W43 ST (x 9)
  W. 43 STREET (x 3)
  W43ST (x 1)
  W. 43RD ST (x 1)
  WEST 43TH ST (x 1)
  W .43RD ST (x 2)
  W 43RD  ST (x 1)
  W. 43 ST (x 1)
  W 43 RD STREET (x 1)
  W 43  STREET (x 1)
  W  43 ST (x 1)
  W43RD ST (x 1)
  WEST 43  STREET (x 1)
  W.43 TH  ST (x 1)
  W.43 TH ST (x 1)
  WEST 43TH STREET (x 1)
  WEST 43  ST (x 1)
  W .43 ST (x 1)
  WEST  43ST (x 1)
  WEST 43 RD ST (x 1)

Cluster 2
  W 125 ST (x 3365)
  W 125    ST (x 1)
  W 125 STREET (x 451)
  WEST 125 ST (x 522)
  WEST 125TH ST (x 81)
  W 125TH ST (x 11611)
  WEST 125 STREET (x 354)
  W 12

In [11]:
# Use street name standardization operator to modify street names
# before clustering them using the default fingerprint operator.

from openclean_geo.address.usstreet import StandardizeUSStreetName

streets = (
    ds
    .select('Street')
    .update('Street', StandardizeUSStreetName(characters='upper'))
)

clusters = streets.cluster(
    clusterer=KeyCollision(func=Fingerprint())
)

In [12]:
# Print the largest clusters (at most 10, the helper's default for k) for
# the standardized, upper-cased street names.
print_k_clusters(clusters)

Total number of clusters is 2354 with 5075 values

Cluster 1
  LGA TERMINAL B (x 26)
  LGA , TERMINAL B (x 1)
  LGA / TERMINAL B (x 1)
  TERMINAL B LGA (x 20)
  TERMINAL B - LGA (x 6)
  LGA TERMINAL B , (x 1)

Cluster 2
  B WAY (x 211)
  B - WAY (x 11)
  B / WAY (x 2)
  B . WAY (x 42)
  B . WAY . (x 1)
  B ; WAY (x 1)

Cluster 3
  LGA , CTB (x 1)
  LGA / CTB (x 1)
  LGA CTB (x 10)
  CTB LGA (x 3)
  LGA - CTB (x 1)
  CTB - LGA (x 1)

Cluster 4
  EAST L GRANT HWY (x 48)
  EAST . L GRANT HWY (x 18)
  EAST . L . GRANT HWY (x 25)
  EAST L . GRANT HWY (x 1)
  EAST / L / GRANT HWY (x 1)
  EAST - L GRANT HWY (x 1)

Cluster 5
  JOHN ST (x 4395)
  ST JOHN (x 10)
  ST JOHN ST (x 8)
  ST . JOHN ST (x 1)
  ST . JOHN (x 1)
  JOHN ST . (x 1)

Cluster 6
  LAWRENCE ST (x 2549)
  ST LAWRENCE (x 34)
  ST . LAWRENCE (x 3)
  ST LAWRENCE ST (x 1)
  LAWRENCE ST . (x 1)
  ST . LAWRENCE ST (x 1)

Cluster 7
  WEST 146 ST (x 1916)
  WEST . 146 ST (x 5)
  WEST 146 ST . (x 1)
  WEST / 146 ST (x 1)
  WEST 146 ST ST

In [13]:
# Standardize street names again, this time with the option that removes
# special characters (keeps only alpha-numeric tokens) switched on.

standardize_alphanum = StandardizeUSStreetName(characters='upper', alphanum=True)
streets = (
    ds
    .select('Street')
    .update('Street', standardize_alphanum)
)

clusters = streets.cluster(
    clusterer=KeyCollision(func=Fingerprint())
)

In [14]:
# Print the largest clusters (at most 10, the helper's default for k) after
# standardizing with alphanum=True.
print_k_clusters(clusters)

Total number of clusters is 901 with 1824 values

Cluster 1
  SOUTH E C O 14 ST (x 1)
  SOUTH E C O E 14 ST (x 1)
  SOUTH O C O E 14 ST (x 1)

Cluster 2
  20 FT FROM C O S W (x 1)
  20 FT FROM S W C O (x 1)
  20 FT FROM S W C O C (x 1)

Cluster 3
  NORTH W C O NORTH 4 ST (x 1)
  NORTH W C O W 4 ST (x 1)
  NORTH W C O 4 ST (x 1)

Cluster 4
  SOUTH W C O W 41 (x 5)
  SOUTH W C O 41 (x 3)
  SOUTH W W C O 41 (x 1)

Cluster 5
  SOUTH S 5 ST (x 6)
  SOUTH S SOUTH 5 ST (x 1)
  SOUTH S S 5 ST (x 1)

Cluster 6
  NORTH E C O E 71 (x 5)
  NORTH O C O E 71 (x 1)
  NORTH E C O 71 (x 1)

Cluster 7
  WEST 8 ST (x 6097)
  WEST 8 8 ST (x 1)
  WEST 8 ST ST (x 1)

Cluster 8
  ANN ST (x 1171)
  ST ANN ST (x 1)
  ST ANN (x 2)

Cluster 9
  NORTH E C O AVE C (x 1)
  NORTH E C O AVE O (x 2)
  NORTH E C O AVE (x 1)

Cluster 10
  SOUTH E C O AVE (x 2)
  SOUTH E C O AVE O (x 2)
  SOUTH E C O AVE C (x 1)


In [15]:
# Additionally pass repeated=False to the standardizer so that repeated
# tokens are removed.

streets = (
    ds
    .select('Street')
    .update(
        'Street',
        StandardizeUSStreetName(characters='upper', alphanum=True, repeated=False)
    )
)

clusters = streets.cluster(
    clusterer=KeyCollision(func=Fingerprint())
)

In [16]:
# Print the largest clusters (at most 10, the helper's default for k) after
# standardizing with alphanum=True and repeated=False.
print_k_clusters(clusters)

Total number of clusters is 761 with 1541 values

Cluster 1
  SOUTH E C O 14 ST (x 1)
  SOUTH E C O E 14 ST (x 1)
  SOUTH O C O E 14 ST (x 1)

Cluster 2
  20 FT FROM C O S W (x 1)
  20 FT FROM S W C O (x 1)
  20 FT FROM S W C O C (x 1)

Cluster 3
  NORTH W C O NORTH 4 ST (x 1)
  NORTH W C O W 4 ST (x 1)
  NORTH W C O 4 ST (x 1)

Cluster 4
  NORTH E C O E 71 (x 5)
  NORTH O C O E 71 (x 1)
  NORTH E C O 71 (x 1)

Cluster 5
  ANN ST (x 1171)
  ST ANN ST (x 1)
  ST ANN (x 2)

Cluster 6
  NORTH E C O AVE C (x 1)
  NORTH E C O AVE O (x 2)
  NORTH E C O AVE (x 1)

Cluster 7
  SOUTH E C O AVE (x 2)
  SOUTH E C O AVE O (x 2)
  SOUTH E C O AVE C (x 1)

Cluster 8
  C O AVE O (x 9)
  C O AVE (x 3)
  C O AVE C (x 3)

Cluster 9
  LGA TERMINAL C (x 19)
  TERMINAL C LGA (x 16)
  LGA C TERMINAL (x 1)

Cluster 10
  ST FELIX ST (x 865)
  ST FELIX (x 3)
  FELIX ST (x 1)
