# Anomalous Patterns in Street Addresses

This notebook finds groups of street names that may be outliers based on their syntactic representation. It is an example of the anomaly detection supported by **openclean-pattern** and uses the **NYC Parking Violations Issued - Fiscal Year 2014** dataset.

In [1]:
# Local path of the gzipped copy of the full
# 'Parking Violations Issued - Fiscal Year 2014' dataset (Socrata id jt7v-77mi).

datafile = './jt7v-77mi.tsv.gz'

# Uncomment the lines below to download the dataset to `datafile` first.
#
# import gzip
# from openclean.data.source.socrata import Socrata
#
# with gzip.open(datafile, 'wb') as f:
#     ds = Socrata().dataset('jt7v-77mi')
#     ds.write(f)

In [2]:
import sys

# Name of the column that holds the street names; used by the cells below.
c = 'Street'

In [3]:
# Use streaming function to avoid having to load the full dataset
# into memory.

from openclean.pipeline import stream
from openclean.function.value.text import to_lower

# We use a 10k-row sample (fixed random_state) for this example and apply
# to_lower to the street column.
df = (
    stream(datafile)
    .sample(n=10000, random_state=41)
    .update(columns=c, func=to_lower)
)

In [4]:
# we learn patterns from a random sample

# Cap the sample size at a fifth of the interpreter's recursion limit to
# limit recursions in the progressive aligner's neighbor joining later on.
pattern_n = sys.getrecursionlimit() // 5

# Draw pattern_n rows of the street column (fixed random_state) and renumber
# them from zero; drop=True discards the previous index.
sample = (
    df.select(c)
    .sample(n=pattern_n, random_state=42)
    .to_df()
    .reset_index(drop=True)
)

In [5]:
# tokenize and encode the sample

from openclean_pattern.datatypes.resolver import AddressDesignatorResolver, DefaultTypeResolver
from openclean_pattern.tokenize.regex import RegexTokenizer

# Type resolver configured with an AddressDesignatorResolver interceptor;
# the tokenizer below is built on top of it.
tr = DefaultTypeResolver(interceptors=AddressDesignatorResolver())
rt = RegexTokenizer(type_resolver=tr)

# Encode the sampled street values, passed as a plain Python list.
tokenized = rt.encode(sample[c].tolist())

# Preview the first five encoded rows.
tokenized[:5]

[['e', ' ', '63rd', ' ', 'st'],
 ['e', ' ', '48th', ' ', 'st'],
 ['4th', ' ', 'ave'],
 ['9th', ' ', 'ave'],
 ['4th', ' ', 'ave']]

In [6]:
# cluster the closest values into groups. There should be a dominant pattern per group (excluding the noise group)

from openclean_pattern.collect.group import Group
from openclean_pattern.collect.cluster import Cluster

collector = Cluster()
groups = collector.collect(tokenized)

# Keep in mind that, per group, the number of recursions should not exceed
# the system recursion limit.
# Number of groups found; the noise group has not been removed yet at this point.
len(groups)

12

In [7]:
# Print every group label followed by the street values assigned to it.

street_values = sample[c]
for g in groups:
    print(f"#### {g} ####")
    for i in groups[g]:
        print(street_values[i])
    print()

#### 0 ####
e 63rd st
e 48th st
w 11th st
w 64th st
e 210th st
e 68th st
w 41st st
e 74th st
w 153rd st
e 47th st
e 15th st
e 112th st
e 58th st
e 58th st
e 30th st
e 109th st
w 40th st
e 100th st
e 7th st
e 58th st
w 58th st
w 33rd st
bay 8th st
w 31st st
w 18th st
w 49th st
west 38th st
e 201st st
e 22nd st
w 41st st
e 53rd st
w 38th st
e 59th st
w 28th st
e 52nd st
w 33rd st
e 73rd st
w 21st st
w 35th st
w 19th st
w 44th st
w 52nd st
w 160th st
w 29th st
e 45th st
e 205th st
e 188th st
w 91st st
w 104th st
e 12th st
w 27th st
w 21st st
w 164th st
e 50th st
w 141st st
e 8th st
w 56th st
e 55th st
e 62nd st
w 14th st
e 39th st
w 28th st
e 38th st
w 87th st
w 125th st
w 47th st
s 4th st
e 46th st
w 156th street
w 53rd st
w 9th st
w 153rd st
e 13th st
e 9th st
e 92nd st
e 38th st
e 38th st
w 49th st
w 60th st
w 35th st
w 56th st
e 19th st
w 23rd st
e 164th st
w 57th st
w 30th st
w 10th st
w 9th st
w 40th st
w 146th st
w 20th st
e 82nd st
w 47th st
w 94th st
w 51st st
w 46th st
e 53rd st

In [8]:
# Remove the noisy rows from the dict (label -1 marks noise as per DBSCAN).
# The popped entry is the value displayed by this cell; it is None when no
# -1 key exists.

groups.pop(-1, None)

[16,
 22,
 30,
 32,
 55,
 59,
 68,
 75,
 80,
 91,
 95,
 102,
 104,
 105,
 112,
 114,
 130,
 138,
 146,
 147,
 149,
 172,
 178,
 185,
 186,
 215,
 223,
 234,
 252,
 254,
 277,
 278,
 304,
 305,
 312,
 316,
 321,
 338,
 339,
 352,
 353,
 357,
 372,
 374,
 379,
 393,
 411,
 414,
 419,
 421,
 432,
 446,
 451,
 473,
 480,
 496,
 509,
 526,
 535,
 547,
 557,
 560,
 577,
 597]

In [9]:
# align the groups (introduce gaps if values can be sequence aligned)

from openclean_pattern.align.progressive import ProgressiveAligner
from pprint import pprint

# NOTE(review): the choice of gap_penalty=20 is not explained in this
# notebook - confirm against the openclean-pattern documentation.
aligner = ProgressiveAligner(gap_penalty=20)
aligned = aligner.align(tokenized, groups)
# Preview the first five entries of the alignment result.
aligned[:5]

[(('e', ' ', '48th', ' ', 'st'),
  ('e', ' ', '63rd', ' ', 'st'),
  ('w', ' ', '11th', ' ', 'st'),
  ('w', ' ', '64th', ' ', 'st'),
  ('e', ' ', '210th', ' ', 'st'),
  ('e', ' ', '68th', ' ', 'st'),
  ('w', ' ', '41st', ' ', 'st'),
  ('e', ' ', '74th', ' ', 'st'),
  ('w', ' ', '153rd', ' ', 'st'),
  ('e', ' ', '47th', ' ', 'st'),
  ('e', ' ', '15th', ' ', 'st'),
  ('e', ' ', '112th', ' ', 'st'),
  ('e', ' ', '58th', ' ', 'st'),
  ('e', ' ', '58th', ' ', 'st'),
  ('e', ' ', '30th', ' ', 'st'),
  ('e', ' ', '109th', ' ', 'st'),
  ('w', ' ', '40th', ' ', 'st'),
  ('e', ' ', '100th', ' ', 'st'),
  ('e', ' ', '7th', ' ', 'st'),
  ('e', ' ', '58th', ' ', 'st'),
  ('w', ' ', '58th', ' ', 'st'),
  ('w', ' ', '33rd', ' ', 'st'),
  ('bay', ' ', '8th', ' ', 'st'),
  ('w', ' ', '31st', ' ', 'st'),
  ('w', ' ', '18th', ' ', 'st'),
  ('w', ' ', '49th', ' ', 'st'),
  ('west', ' ', '38th', ' ', 'st'),
  ('e', ' ', '201st', ' ', 'st'),
  ('e', ' ', '22nd', ' ', 'st'),
  ('w', ' ', '41st', ' ', 'st'),
 

In [10]:
# compile a pattern per group

from openclean_pattern.regex.compiler import DefaultRegexCompiler

# Use method='col' when compiling patterns from alignments.
compiler = DefaultRegexCompiler(method='col')

In [11]:
# For every aligned group, print its first row and the dominant pattern
# compiled for that group; the patterns are collected in `pats`.

pats = []
for al in aligned:
    print('######')
    print(al[0])
    patterns = compiler.compile_each(al)
    p = patterns.top(1, True)
    pats.append(p)
    print(p, end='\n\n')

######
('e', ' ', '48th', ' ', 'st')
[ALPHA(1-4), \S(1-1), ALPHANUM(3-5), \S(1-1), STREET(2-6)]

######
('9th', ' ', 'ave')
[ALPHANUM(3-5), \S(1-1), STREET(2-3)]

######
('main', ' ', 'st')
[ALPHA(4-12), \S(1-1), STREET(2-5)]

######
('avenue', ' ', 'x')
[STREET(2-6), \S(1-1), ALPHA(1-12)]

######
('chrystie', ' ', 'st', '.', '(', 'n', '/', 'b', ')', '@', 'e', '.')
[ALPHA(6-8), \S(1-1), STREET(2-4), PUNC(1-1), PUNC(1-1), ALPHA(1-1), PUNC(1-1), ALPHA(1-1), PUNC(1-1), PUNC(1-1), ALPHA(1-5), GAP(0-0)]

######
('w', ' ', '43', ' ', 'st')
[ALPHA(1-1), \S(1-1), NUMERIC(1-3), \S(1-1), STREET(2-2)]

######
('broadway',)
[ALPHA(6-8)]

######
('cross', ' ', 'island', ' ', 'pky')
[ALPHA(5-10), \S(1-1), STREET(4-6), \S(1-1), STREET(2-6)]

######
('n', ' ', 'moore', ' ', 'st')
[ALPHA(1-6), \S(1-1), ALPHA(3-8), \S(1-1), STREET(2-4)]

######
('41', ' ', 'st')
[NUMERIC(1-3), \S(1-1), STREET(2-6)]

######
('union', ' ', 'tpke')
[STREET(4-7), \S(1-1), STREET(2-4)]



In [12]:
# we find values not contained within any pattern by creating an openclean operator (eval function) 
# and applying it on the data stream

def is_match(x):
    """Return True if ``x`` matches at least one pattern in ``pats``.

    Each pattern's ``compare`` is called with ``x`` and the tokenizer ``rt``;
    evaluation stops at the first truthy result. Returns False when no
    pattern matches (including when ``pats`` is empty).
    """
    return any(p.compare(x, rt) for p in pats)

from openclean.function.eval.base import Eval

# Wrap is_match as a unary openclean Eval over the street column so that it
# can be applied to the data stream.
non_anomalous_pattern = Eval(columns=[c], func=is_match, is_unary=True)

In [13]:
# Keep the rows for which the predicate holds and materialize them as a data frame.
non_anom = df.where(non_anomalous_pattern).to_df()

In [14]:
# non anomalous values (8650 rows out of 10k)
# Select the column through `c` rather than a hard-coded name so that this
# cell stays in sync with the column chosen at the top of the notebook.
non_anom[c]

1532741    amsterdam ave
1619344           3rd st
4721758     merrick blvd
1141807         122nd st
8165367          42nd st
               ...      
2185565          7th ave
603308       mckibben st
3125639         e 7th st
7211433     sutphin blvd
6351600         avenue u
Name: Street, Length: 8650, dtype: object

In [15]:
# anomalous values (rows that didn't match any of the learned patterns - 1350 rows)
# Rows are identified by index label: everything in the sampled frame whose
# index does not appear in `non_anom` is reported. The column is selected
# through `c` (not a hard-coded name) in a single .loc step.
dfs = df.to_df()
dfs.loc[~dfs.index.isin(non_anom.index), c]

3145095           mcgraw avenue
6311880    wb 4th ave @ 41st st
5701324    astoria blvd.(w/b)@s
5836442           e gun hill rd
4940514    rockaway pkwy. (s/b)
                   ...         
2034664    queens blvd.(e/b)@70
1964869          s/s freeman st
73738      e. fordham rd.(w/b)@
8518236          s/w crnr 65 st
8225637    wb bch channel dr bc
Name: Street, Length: 1350, dtype: object