## Pattern Exercises

Goals:
1. pattern generation
2. basic anomaly detection
3. non-basic/advanced anomaly detection

In [1]:
import pandas as pd
import pprint

pp = pprint.PrettyPrinter(indent=4)

In [2]:
# load data
df = pd.read_csv('../resources/dev/urban.csv')
pp.pprint(df.shape)
df.head()

(31450, 17)


Unnamed: 0,Registry Number,Business Name,Entity Type,Registry Date,Associated Name Type,First Name,Middle Name,Last Name,Suffix,Not of Record Entity,Entity of Record Reg Number,Entity of Record Name,Address,Address Continued,City,State,Zip Code
0,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,MAILING ADDRESS,,,,,,,,413 N MISSION ST,,WENATCHEE,WA,98801
1,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,PRINCIPAL PLACE OF BUSINESS,,,,,,,,413 N MISSION ST,,WENATCHEE,WA,98801
2,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,REGISTERED AGENT,,,,,,51329093.0,NORTHWEST REGISTERED AGENT LLC,5305 RIVER RD N,STE B,KEIZER,OR,97303
3,168957992,BLACK CROWS CORP.,FOREIGN BUSINESS CORPORATION,07/01/2020,MAILING ADDRESS,,,,,,,,C/O PRAMEX INTERNATIONAL,"1251 AVENUE OF THE AMERICAS, FL3",NEW YORK,NY,10020
4,168957992,BLACK CROWS CORP.,FOREIGN BUSINESS CORPORATION,07/01/2020,PRESIDENT,ERIC,,BASCLE,,,,,C/O PRAMEX INTERNATIONAL,"1251 AVENUE OF THE AMERICAS, FL3",NEW YORK,NY,10020


### Generate Patterns

In [3]:
# using the default patternfinder to quickly understand column patterns
from openclean_pattern.patternfinder import PatternFinder
column = 'Entity Type'

In [4]:
# create a new PatternFinder object with the data series
pf = PatternFinder(
    series = df[column]
)

In [5]:
# find patterns
patterns = pf.find()

In [6]:
pp.pprint(patterns)

{   5: [PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)],
    7: [PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)],
    9: [PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)]}


In [7]:
# analyze all identified patterns from the column 
for i in range(1,len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(len(pf.top(i).idx))
    pp.pprint([pf.series[c] for i, c in enumerate(list(pf.top(i).idx)) if i < 10])
    print()

'## Pattern Ranked 1 for column: Entity Type'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)]
10
[   'foreign professional corporation',
    'domestic limited partnership',
    'foreign nonprofit corporation',
    'assumed business name',
    'domestic business trust',
    'domestic professional corporation',
    'domestic nonprofit corporation',
    'foreign limited partnership',
    'foreign business corporation',
    'domestic business corporation']

'## Pattern Ranked 2 for column: Entity Type'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)]
2
[   'domestic registered limited liability partnership',
    'foreign registered limited liability partnership']

'## Pattern Ranked 3 for column: Entity Type'
[PatternElement(ALPHA), PatternEle

### Basic Outliers

In [8]:
# create a new PatternFinder object with the data series
column = 'Zip Code'
pf = PatternFinder(
    series = df[column]
)
patterns = pf.find()

In [9]:
# looking at the patterns discovered in the zipcode column, we see there are 36 values 
# that deviate from the rest of the column 
for i in range(1,len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(len(pf.top(i).idx))
    pp.pprint([pf.series[c] for i, c in enumerate(list(pf.top(i).idx)) if i < 10])
    print()

'## Pattern Ranked 1 for column: Zip Code'
[PatternElement(DIGIT)]
1148
[   '32256',
    '10022',
    '80905',
    '33411',
    '92865',
    '97294',
    '49008',
    '77433',
    '97282',
    '33716']

'## Pattern Ranked 2 for column: Zip Code'
[PatternElement(ALPHANUM), PatternElement(SPACE_REP), PatternElement(ALPHANUM)]
9
[   'm5e 1k3',
    'v6z 0c8',
    't6j 7j8',
    'm4m 3g3',
    'v6c 1c7',
    'v4k 0a5',
    'h9r 6b1',
    'v5m 4t5',
    'v4n 5k8']



In [10]:
# they are canadian zipcodes
df[df['Zip Code'].str.lower() == 'v5m 4t5']

Unnamed: 0,Registry Number,Business Name,Entity Type,Registry Date,Associated Name Type,First Name,Middle Name,Last Name,Suffix,Not of Record Entity,Entity of Record Reg Number,Entity of Record Name,Address,Address Continued,City,State,Zip Code
16351,169564193,QUARTECH CORRECTIONS LLC,FOREIGN LIMITED LIABILITY COMPANY,07/17/2020,MAILING ADDRESS,,,,,,,,2889 12TH AVENUE E,SUITE 650,VANCOUVER,BRITISH COLUMBIA,V5M 4T5
16352,169564193,QUARTECH CORRECTIONS LLC,FOREIGN LIMITED LIABILITY COMPANY,07/17/2020,MEMBER,DAVID,,MARSHALL,,,,,2889 12TH AVENUE E,SUITE 650,VANCOUVER,BRITISH COLUMBIA,V5M 4T5
16353,169564193,QUARTECH CORRECTIONS LLC,FOREIGN LIMITED LIABILITY COMPANY,07/17/2020,PRINCIPAL PLACE OF BUSINESS,,,,,,,,2889 12TH AVENUE E,SUITE 650,VANCOUVER,BRITISH COLUMBIA,V5M 4T5


### Non-Basic Outliers

In [11]:
# Businesses

In [12]:
from openclean_pattern.tokenize.regex import RegexTokenizer
from openclean_pattern.datatypes.resolver import BusinessEntityResolver, DefaultTypeResolver

# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by a BusinessEntityResolver (identifies company suffixes)
dtr = DefaultTypeResolver(interceptors=BusinessEntityResolver())

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters except dots(.) because they're abbreviation characters
rt = RegexTokenizer(type_resolver=dtr, abbreviations=True)

# create a new PatternFinder object with the data series and the TypeResolvers to use as part of the Tokenizer
column = 'Business Name'
pf = PatternFinder(
    series = df[column],
    tokenizer = rt
)

In [13]:
patterns = pf.find()

In [14]:
#  Ranked # 3 and 7 dont have any business suffixes
# these businesses dont have a suffix in their names, these could be potential errors
for i in range(1, len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    print()

'## Pattern Ranked 1 for column: Business Name'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(BE)]
1174

'## Pattern Ranked 2 for column: Business Name'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(BE)]
904

'## Pattern Ranked 3 for column: Business Name'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA)]
641

'## Pattern Ranked 4 for column: Business Name'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(PUNCTUATION), PatternElement(SPACE_REP), PatternElement(BE)]
469

'## Pattern Ranked 5 for column: Business Name'
[PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(PUNCTUATION), PatternElement(SPACE_REP), PatternElement(BE)]
394

'## Pattern Ranked

In [15]:
#  Address

In [16]:
from openclean_pattern.datatypes.resolver import AddressDesignatorResolver, GeoSpatialResolver

In [17]:
# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by an Address and Geospatial Resolver (identify addresses and countries)
dtr = DefaultTypeResolver(interceptors=[AddressDesignatorResolver()])

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters
rt = RegexTokenizer(type_resolver=dtr)

# create a new PatternFinder object with the data series and the TypeResolvers to use as part of the Tokenizer
column = 'Address '
pf = PatternFinder(
    series = df[column],
    tokenizer = rt
)

In [18]:
patterns = pf.find()

In [19]:
# address patterns ranked 10+ seem like possible outliers
for i in range(1, len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    print()

'## Pattern Ranked 1 for column: Address '
[PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(STREET)]
2551

'## Pattern Ranked 2 for column: Address '
[PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(STREET)]
1715

'## Pattern Ranked 3 for column: Address '
[PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(STREET), PatternElement(SPACE_REP), PatternElement(SUD), PatternElement(SPACE_REP), PatternElement(DIGIT)]
387

'## Pattern Ranked 4 for column: Address '
[PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(STREET), PatternElement(SPACE_REP), PatternElement(SUD), PatternElement(SPACE_REP), PatternElement(DIGIT)]
191

'## Patter

In [20]:
for i in range(10,16):
    pp.pprint(df.loc[pf.top(i).idx, column].unique())

array(['42340 NW DEPOT ST', '61775 HARMONY LN', '2037 FREMONT ST',
       '250 PRINCETON AVE STE 104', '26331 S HARMS RD'], dtype=object)
array(['5305 RIVER RD N STE B', '3223 N CRATER LANE',
       '12270 SW GINGHAM LANE', '1120 112TH AVE NE'], dtype=object)
array(['5115 SE 111TH AVE', '12265 SW HALL BLVD #7'], dtype=object)
array(['75 SE YAMHILL ST SUITE 202'], dtype=object)
array(['4605 NE FREMONT ST'], dtype=object)
array(['165 S MARBLE DRIVE'], dtype=object)


In [21]:
# Geospatial (without using the PatternFinder class)

In [22]:
# Combined Geo
geo = df[['Address ', 'Address Continued' ,'City' ,'State', 'Zip Code']]
geo.loc[:, 'Full Address'] = df['Address '] + ',\n' + df['Address Continued'].fillna('') + '\n' + df['City'] + ', '+df['State']+', '  + df['Zip Code']
pp.pprint(geo['Full Address'].iloc[0])

'413 N MISSION ST,\n\nWENATCHEE, WA, 98801'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [23]:
# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by an Address and Geospatial Resolver (identify addresses and countries)
dtr = DefaultTypeResolver(interceptors=[AddressDesignatorResolver(), GeoSpatialResolver()])

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters
rt = RegexTokenizer(type_resolver=dtr)

# encode unique values using the TypeResolvers in the Tokenizer
series = geo['Full Address'].unique().tolist()
encoded = rt.encode(series)

In [24]:
pp.pprint(encoded[0:5])

[   (   _'NUMERIC'_(3,'413'),
        _'\\S'_(1,' '),
        _'ALPHA'_(1,'n'),
        _'\\S'_(1,' '),
        _'STREET'_(7,'mission'),
        _'\\S'_(1,' '),
        _'STREET'_(2,'st'),
        _'PUNC'_(1,','),
        _'\\S'_(1,'\n'),
        _'\\S'_(1,'\n'),
        _'ALPHA'_(9,'wenatchee'),
        _'PUNC'_(1,','),
        _'\\S'_(1,' '),
        _'ADMIN_2'_(2,'wa'),
        _'PUNC'_(1,','),
        _'\\S'_(1,' '),
        _'NUMERIC'_(5,'98801')),
    (   _'NUMERIC'_(4,'5305'),
        _'\\S'_(1,' '),
        _'STREET'_(5,'river'),
        _'\\S'_(1,' '),
        _'STREET'_(2,'rd'),
        _'\\S'_(1,' '),
        _'ALPHA'_(1,'n'),
        _'PUNC'_(1,','),
        _'\\S'_(1,'\n'),
        _'SUD'_(3,'ste'),
        _'\\S'_(1,' '),
        _'ALPHA'_(1,'b'),
        _'\\S'_(1,'\n'),
        _'ALPHA'_(6,'keizer'),
        _'PUNC'_(1,','),
        _'\\S'_(1,' '),
        _'ALPHA'_(2,'or'),
        _'PUNC'_(1,','),
        _'\\S'_(1,' '),
        _'NUMERIC'_(5,'97303')),
    (   _'ALPH

In [25]:
# align the column
from openclean_pattern.align.group import GroupAligner

ga = GroupAligner()
aligned = ga.align(encoded)

In [26]:
# compile the pattern
from openclean_pattern.regex.compiler import DefaultRegexCompiler

rws = DefaultRegexCompiler()
compiled = rws.compile(encoded, aligned)

In [27]:
# and the anomalies
anomalies = rws.anomalies(encoded, aligned)

In [28]:
# lets look at a pattern and it's anomaly
compiled[20]

[PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(SPACE_REP), PatternElement(STREET), PatternElement(PUNCTUATION), PatternElement(SPACE_REP), PatternElement(SUD), PatternElement(SPACE_REP), PatternElement(DIGIT), PatternElement(SPACE_REP), PatternElement(ADMIN_LEVEL_1), PatternElement(PUNCTUATION), PatternElement(SPACE_REP), PatternElement(ALPHA), PatternElement(PUNCTUATION), PatternElement(SPACE_REP), PatternElement(DIGIT)]

In [29]:
[series[i] for i in anomalies[20]]
# e.g. the pattern for #1 is actually:
# '5305 RIVER RD N,\nSTE B\nKEIZER, OR, 97303'
# DIGIT \S ALPHA \S STREET \s ALPHA ... 

['5305 RIVER RD N,\nSTE B\nKEIZER, OR, 97303',
 '5305 RIVER RD N,\nSUITE B\nKEIZER, OR, 97303',
 '5305 RIVER ROAD N,\nSTE B\nKEIZER, OR, 97303',
 '740 NE 3RD STREET,\nSUITE 3\nBEND, OR, 97702',
 '1244 NE 2ND ST,\nUNIT 1\nBEND, OR, 97701',
 '1302 NE 3RD ST,\nSTE 1\nBEND, OR, 97701',
 '8191 N LOMBARD ST,\nUNIT 107\nPORTLAND, OR, 97203',
 '4150 N WILLIAMS AVE,\nAPT 418\nPORTLAND, OR, 97217',
 '4150 N WILLIAMS AVE,\nAPT 312\nPORTLAND, OR, 97217',
 '4357 N WILLIAMS AVE,\nAPT 312\nPORTLAND, OR, 97217',
 '2121 SE BELMONT ST,\nAPT 125\nPORTLAND, OR, 97214',
 '3600 N WILLIAMS AVENUE,\nAPT 216\nPORTLAND, OR, 97227',
 '1926 W BURNSIDE ST,\nUNIT 317\nPORTLAND, OR, 97209',
 '15597 NW ATHENS DRIVE,\nAPT 243\nPORTLAND, OR, 97229',
 '18625 EAST BURNSIDE STREET,\nUNIT 21\nPORTLAND, OR, 97233',
 '1259 DEBRICK RD,\n1259 DEBRICK RD\nEUGENE, OR, 97401',
 '2501 CAPITAL AVE,\n2501 CAPITAL AVE\nMEDFORD, OR, 97504',
 '1120 112TH AVE NE,\nSUITE 600\nBELLEVUE, WA, 98004',
 '1909 NE SHEPARD RD,\nSUITE E\nBEND, OR