# Anomalous Patterns in Street Names

This notebook finds groups of street names whose syntactic patterns make them likely outliers. It is an example of the anomaly detection supported by **openclean-pattern** and uses the **NYC Parking Violations Issued - Fiscal Year 2014** dataset.

In [1]:
# Download the full 'NYC Parking Violations Issued - Fiscal Year 2014'
# dataset (the dataset named in the notebook introduction; Socrata
# identifier 'jt7v-77mi').

import gzip

from openclean.data.source.socrata import Socrata

datafile = './jt7v-77mi.tsv.gz'

# The download itself is disabled. Uncomment the three lines below to fetch
# the dataset and write it, gzip-compressed, to `datafile`.
# with gzip.open(datafile, 'wb') as f:
#     ds = Socrata().dataset('jt7v-77mi')
#     ds.write(f)


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'

In [2]:
# Use the streaming function to avoid having to load the full dataset
# into memory.

from openclean.pipeline import stream

# `df` is whatever stream() returns for the data file; the sampling cell
# chains select(), sample() and to_df() on it.
df = stream(datafile)

In [3]:
import sys

# Sample size, derived from the interpreter's recursion limit because the
# later grouping/alignment steps must stay below that limit per group.
# Floor division keeps `n` an int; true division (`/`) would produce a
# float, which is not a valid count of rows to sample.
n = sys.getrecursionlimit() // 10

# Name of the column that holds the street names.
c = 'Street'

In [4]:
# Select the street column, draw a sample of n values with a fixed
# random_state (42) so the run is repeatable, and convert it with to_df().
sample = df.select(c).sample(n=n, random_state=42).to_df()
# Display the sampled street names.
sample[c]

1105615               HAWTHORNE
8308530                Grand St
1954320              Barnes Ave
948801                   3rd St
5262992             Bedford Ave
                   ...         
5352610              W 114th St
5451063                Avenue U
30101      ATLANTIC AVE. (W/B)@
3430618               W 72nd St
496365                  3rd Ave
Name: Street, Length: 300, dtype: object

In [7]:
# Tokenize the sampled street names and encode each token with a type.

from openclean_pattern.datatypes.resolver import AddressDesignatorResolver, DefaultTypeResolver
from openclean_pattern.tokenize.regex import RegexTokenizer

# Default type resolver with the address designator resolver plugged in as
# an interceptor; in the output below, designators such as 'st' and 'ave'
# are tagged as STREET.
tr = DefaultTypeResolver(interceptors=AddressDesignatorResolver())
rt = RegexTokenizer(type_resolver=tr)

# One tuple of typed tokens per sampled street name.
tokenized = rt.encode(sample[c].tolist())

# Show the first five encoded values.
tokenized[:5]

[(_'ALPHA'_(9,'hawthorne'),),
 (_'ALPHA'_(5,'grand'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
 (_'ALPHA'_(6,'barnes'), _'\\S'_(1,' '), _'STREET'_(3,'ave')),
 (_'ALPHANUM'_(3,'3rd'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
 (_'ALPHA'_(7,'bedford'), _'\\S'_(1,' '), _'STREET'_(3,'ave'))]

In [10]:
# Cluster the closest values into groups.

from openclean_pattern.collect.group import Group
from openclean_pattern.collect.cluster import Cluster


# Group is the alternative collector; swap the comment between the next
# two lines to use it instead of Cluster.
# collector = Group()
collector = Cluster()
groups = collector.collect(tokenized)

# Remember that, per group, the number of recursions should not exceed the
# system recursion limit.
len(groups)

10

In [11]:
# Align the tokenized values within each of the collected groups.

from openclean_pattern.align.progressive import ProgressiveAligner
from pprint import pprint

# NOTE(review): gap_penalty=20 is presumably the cost of inserting a gap
# during alignment; confirm against the ProgressiveAligner documentation.
aligner = ProgressiveAligner(gap_penalty=20)
aligned = aligner.align(tokenized, groups)
# Show the first five aligned groups.
aligned[:5]

[((_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(9,'hawthorne')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(6,'banyer')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(6,'bowery')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'broadway')),
  (_'ALPHA'_(8,'steinway')),
  (_'ALPHA'_(8,'broadway'))),
 ((_'ALPHA'_(6,'barnes'), _'\\S'_(1,' '), _'STREET'_(3,'ave')),
  (_'ALPHA'_(5,'grand'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
  (_'ALPHA'_(7,'bedford'), _'\\S'_(1,' '), _'STREET'_(3,'ave')),
  (_'ALPHA'_(7,'madison'), _'\\S'_(1,' '), _'STREET'_(3,'ave')),
  (_'ALPHA'_(6,'bowery'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
  (_'ALPHA'_(4,'bell'), _'\\S'_(1,' '), _'STREET'_(4,'blvd')),
  (_'ALPHA'_(6,'darcey'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
  (_'ALPHA'_(8,'berriman'), _'\\S'_(1,' '), _'STREET'_(2,'st')),
  (_'ALPHA'_(5,'ocean'), _'\

In [12]:
# Create the regex compiler that derives patterns from the aligned groups.

from openclean_pattern.regex.compiler import DefaultRegexCompiler

compiler = DefaultRegexCompiler(method='col') # use method='col' when compiling alignments. TODO: remove gaps from rowwise keygen

In [13]:
# For every aligned group, print a separator, the group's first row and the
# result of patterns.top(1, True), followed by a blank line.
for al in aligned:
    print('######')
    print(al[0])
    patterns = compiler.compile_each(al)
    # NOTE(review): presumably selects the single top-ranked pattern; the
    # meaning of the second argument (True) is not visible here, so confirm.
    p = patterns.top(1, True)
    print(p)
    print()

######
(_'ALPHA'_(8,'broadway'),)
[ALPHA(6-9)]

######
(_'ALPHA'_(6,'barnes'), _'\\S'_(1,' '), _'STREET'_(3,'ave'))
[ALPHA(1-14), SPACE_REP(1-1), STREET(2-6)]

######
(_'ALPHANUM'_(4,'62nd'), _'\\S'_(1,' '), _'STREET'_(2,'dr'))
[ALPHANUM(3-5), SPACE_REP(1-1), STREET(2-3)]

######
(_'ALPHA'_(1,'e'), _'\\S'_(1,' '), _'ALPHANUM'_(4,'71st'), _'\\S'_(1,' '), _'STREET'_(2,'st'))
[ALPHA(1-1), SPACE_REP(1-1), ALPHANUM(3-5), SPACE_REP(1-1), STREET(2-2)]

######
(_'G'_(0,''), _'ALPHANUM'_(4,'12th'), _'\\S'_(1,' '), _'STREET'_(3,'ave'), _'PUNC'_(1,'.'), _'PUNC'_(1,'('), _'ALPHA'_(1,'n'), _'PUNC'_(1,'/'), _'ALPHA'_(1,'b'), _'PUNC'_(1,')'), _'PUNC'_(1,'@'), _'ALPHA'_(1,'w'), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'G'_(0,''), _'PUNC'_(1,'.'), _'G'_(0,''), 