## Pattern Exercises

Goals:
1. generate patterns
2. basic anomalies
3. advanced anomalies

In [38]:
import pandas as pd
import pprint

pp = pprint.PrettyPrinter(indent=4)

In [39]:
# load data
df = pd.read_csv('../resources/dev/urban.csv')
pp.pprint(df.shape)
df.head()

(31450, 17)


Unnamed: 0,Registry Number,Business Name,Entity Type,Registry Date,Associated Name Type,First Name,Middle Name,Last Name,Suffix,Not of Record Entity,Entity of Record Reg Number,Entity of Record Name,Address,Address Continued,City,State,Zip Code
0,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,MAILING ADDRESS,,,,,,,,413 N MISSION ST,,WENATCHEE,WA,98801
1,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,PRINCIPAL PLACE OF BUSINESS,,,,,,,,413 N MISSION ST,,WENATCHEE,WA,98801
2,167799296,"ANOVAWORKS, PLLC",FOREIGN LIMITED LIABILITY COMPANY,07/01/2020,REGISTERED AGENT,,,,,,51329093.0,NORTHWEST REGISTERED AGENT LLC,5305 RIVER RD N,STE B,KEIZER,OR,97303
3,168957992,BLACK CROWS CORP.,FOREIGN BUSINESS CORPORATION,07/01/2020,MAILING ADDRESS,,,,,,,,C/O PRAMEX INTERNATIONAL,"1251 AVENUE OF THE AMERICAS, FL3",NEW YORK,NY,10020
4,168957992,BLACK CROWS CORP.,FOREIGN BUSINESS CORPORATION,07/01/2020,PRESIDENT,ERIC,,BASCLE,,,,,C/O PRAMEX INTERNATIONAL,"1251 AVENUE OF THE AMERICAS, FL3",NEW YORK,NY,10020


### Generate Patterns

In [40]:
# using the default patternfinder to quickly understand column patterns
from openclean_pattern.patternfinder import PatternFinder
column = 'Entity Type'

In [41]:
# create a new PatternFinder object with the data series
pf = PatternFinder(
    series = df[column]
)

In [42]:
# find patterns
patterns = pf.find()

In [43]:
pp.pprint(patterns)

{   5: PatternRow(ALPHA(7-8) (1-1)ALPHA(7-12) (1-1)ALPHA(4-11)),
    7: PatternRow(XoXeXXXX(7-8) (1-1)limited(7-7) (1-1)liability(9-9) (1-1)company(7-7)),
    9: PatternRow(XoXeXXXX(7-8) (1-1)registered(10-10) (1-1)limited(7-7) (1-1)liability(9-9) (1-1)partnership(11-11))}


In [44]:
# analyze all identified patterns from the column 
for i in range(1,len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    pp.pprint(df.loc[list(pf.top(i).idx), column].unique())
    print()

'## Pattern Ranked 1 for column: Entity Type'
PatternRow(XoXeXXXX(7-8) (1-1)limited(7-7) (1-1)liability(9-9) (1-1)company(7-7))
21020
array(['FOREIGN LIMITED LIABILITY COMPANY',
       'DOMESTIC LIMITED LIABILITY COMPANY'], dtype=object)

'## Pattern Ranked 2 for column: Entity Type'
PatternRow(ALPHA(7-8) (1-1)ALPHA(7-12) (1-1)ALPHA(4-11))
10413
array(['FOREIGN BUSINESS CORPORATION', 'ASSUMED BUSINESS NAME',
       'DOMESTIC PROFESSIONAL CORPORATION',
       'DOMESTIC BUSINESS CORPORATION', 'DOMESTIC NONPROFIT CORPORATION',
       'FOREIGN LIMITED PARTNERSHIP', 'FOREIGN NONPROFIT CORPORATION',
       'FOREIGN PROFESSIONAL CORPORATION', 'DOMESTIC BUSINESS TRUST',
       'DOMESTIC LIMITED PARTNERSHIP'], dtype=object)

'## Pattern Ranked 3 for column: Entity Type'
PatternRow(XoXeXXXX(7-8) (1-1)registered(10-10) (1-1)limited(7-7) (1-1)liability(9-9) (1-1)partnership(11-11))
17
array(['FOREIGN REGISTERED LIMITED LIABILITY PARTNERSHIP',
       'DOMESTIC REGISTERED LIMITED LIABILITY PARTNERSH

### Basic Outliers

In [45]:
# create a new PatternFinder object with the data series
column = 'Zip Code'
pf = PatternFinder(
    series = df[column]
)
patterns = pf.find()

In [46]:
# looking at the patterns discovered in the zipcode column, we see there are 36 values 
# that deviate from the rest of the column 
for i in range(1,len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    pp.pprint(df.loc[list(pf.top(i).idx), column].unique())
    print()

'## Pattern Ranked 1 for column: Zip Code'
PatternRow(NUMERIC(5-5))
31414
array(['98801', '97303', '10020', ..., '95441', '57702', '27103'],
      dtype=object)

'## Pattern Ranked 2 for column: Zip Code'
PatternRow(ALPHANUM(3-3) (1-1)ALPHANUM(3-3))
36
array(['H9R 6B1', 'T6J 7J8', 'M5E 1K3', 'V4K 0A5', 'V6Z 0C8', 'M4M 3G3',
       'V4N 5K8', 'V5M 4T5', 'V6C 1C7'], dtype=object)



In [50]:
# they are canadian zipcodes
df.loc[list(pf.top(i).idx)]

Unnamed: 0,Registry Number,Business Name,Entity Type,Registry Date,Associated Name Type,First Name,Middle Name,Last Name,Suffix,Not of Record Entity,Entity of Record Reg Number,Entity of Record Name,Address,Address Continued,City,State,Zip Code
21377,169759496,NOBLE FOODS NUTRITION USA INC.,FOREIGN BUSINESS CORPORATION,07/22/2020,SECRETARY,DOMENIC,,MANCINI,,,,,250 AV AVRO,,POINTE-CLAIRE,QUEBEC,H9R 6B1
22915,169824894,CONNECTION HUB LLC,DOMESTIC LIMITED LIABILITY COMPANY,07/24/2020,INDIVIDUAL WITH DIRECT KNOWLEDGE,SAYED,SAEED,AHMAD,,,,,418-11511 27 AVE NW,,EDMONTON,ALBERTA,T6J 7J8
22916,169824894,CONNECTION HUB LLC,DOMESTIC LIMITED LIABILITY COMPANY,07/24/2020,MAILING ADDRESS,,,,,,,,418-11511 27 AVE NW,,EDMONTON,ALBERTA,T6J 7J8
12421,169404796,"MKII SERVICE, INC.",FOREIGN BUSINESS CORPORATION,07/14/2020,PRESIDENT,CLIVE,,KINROSS,,,,,69 YONGE ST SUITE 600,,TORONTO,ONTARIO,M5E 1K3
12422,169404796,"MKII SERVICE, INC.",FOREIGN BUSINESS CORPORATION,07/14/2020,PRINCIPAL PLACE OF BUSINESS,,,,,,,,69 YONGE ST SUITE 600,,TORONTO,ONTARIO,M5E 1K3
22917,169824894,CONNECTION HUB LLC,DOMESTIC LIMITED LIABILITY COMPANY,07/24/2020,MANAGER,SAYED,SAEED,AHMAD,,,,,418-11511 27 AVE NW,,EDMONTON,ALBERTA,T6J 7J8
12424,169404796,"MKII SERVICE, INC.",FOREIGN BUSINESS CORPORATION,07/14/2020,SECRETARY,CLIVE,,KINROSS,,,,,69 YONGE ST SUITE 600,,TORONTO,ONTARIO,M5E 1K3
15631,169469294,TASMAN AIR SERVICES LLC,DOMESTIC LIMITED LIABILITY COMPANY,07/16/2020,MANAGER,MARK,,KROEKER,,,,,#5 4340 KING ST,,DELTA,BRITISH COLUMBIA,V4K 0A5
21399,169765295,NOBLE FOODS NUTRITION USA HOLDINGS INC.,FOREIGN BUSINESS CORPORATION,07/22/2020,MAILING ADDRESS,,,,,,,,250 AV AVRO,,POINTE-CLAIRE,QUEBEC,H9R 6B1
21400,169765295,NOBLE FOODS NUTRITION USA HOLDINGS INC.,FOREIGN BUSINESS CORPORATION,07/22/2020,PRESIDENT,DOMENIC,,MANCINI,,,,,250 AV AVRO,,POINTE-CLAIRE,QUEBEC,H9R 6B1


### Non-Basic Outliers

In [115]:
# Businesses

In [58]:
from openclean_pattern.tokenize.regex import RegexTokenizer
from openclean_pattern.datatypes.resolver import BusinessEntityResolver, DefaultTypeResolver

# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by a BusinessEntityResolver (identifies company suffixes)
dtr = DefaultTypeResolver(interceptors=BusinessEntityResolver())

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters except dots(.) because they're abbreviation characters
rt = RegexTokenizer(type_resolver=dtr, abbreviations=True)

# create a new PatternFinder object with the data series and the TypeResolvers to use as part of the Tokenizer
column = 'Business Name'
pf = PatternFinder(
    series = df[column],
    tokenizer = rt
)

In [59]:
patterns = pf.find()

In [80]:
#  Ranked # 5 and 9 dont have any business suffixes
for i in range(1, len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    print()

'## Pattern Ranked 1 for column: Business Name'
PatternRow(ALPHA(1-14) (1-1)ALPHA(1-14) (1-1)BUSINESS(2-25))
6037

'## Pattern Ranked 2 for column: Business Name'
PatternRow(ALPHA(1-12) (1-1)ALPHA(1-13) (1-1)ALPHA(1-14) (1-1)BUSINESS(2-25))
4646

'## Pattern Ranked 3 for column: Business Name'
PatternRow(ALPHA(1-14) (1-1)ALPHA(1-15)PUNC(,!) (1-1)BUSINESS(2-4))
2488

'## Pattern Ranked 4 for column: Business Name'
PatternRow(ALPHA(1-11) (1-1)ALPHA(1-14) (1-1)ALPHA(1-14)PUNC(,) (1-1)BUSINESS(2-9))
2034

'## Pattern Ranked 5 for column: Business Name'
PatternRow(ALPHA(1-13) (1-1)ALPHA(1-14))
2027

'## Pattern Ranked 6 for column: Business Name'
PatternRow(ALPHA(1-11) (1-1)ALPHA(1-14) (1-1)ALPHA(1-12) (1-1)ALPHA(2-14) (1-1)BUSINESS(3-25))
1317

'## Pattern Ranked 7 for column: Business Name'
PatternRow(ALPHA(3-18)PUNC(,) (1-1)BUSINESS(3-4))
674

'## Pattern Ranked 8 for column: Business Name'
PatternRow(ALPHA(1-11) (1-1)ALPHA(1-12) (1-1)ALPHA(1-14) (1-1)ALPHA(2-12)PUNC(,) (1-1)BUSINESS(2-3

In [82]:
# these businesses dont have a suffix in their names, these could be potential errors
df.loc[pf.top(9).idx, column].unique()

array(['SOULPRIMA', 'PROPHECYTHREE', 'THELOTTER', 'PACKWITHMETRAVEL',
       'OHT', 'OPENART', 'VIDIVY', 'OIRR', 'FIGURE', 'ONYXTELE',
       'LAWRENCEMORRELL.COM', 'WHIMSY', 'WAKEPORT', 'NOBLEKNOTS.PNW',
       'SPEAKUPWITHUS.ORG', 'CREECHARTS', 'DABUDEASE', 'OUELLETTE',
       'LUCIANOFIT', 'BOOPS', 'POPPLE', 'SAGEANDGRAE', 'SHOPHEYDONAE',
       'VISIONARYDREAMSFILMS', 'MTNSERVICES', 'ROBOSUSHI', 'SINGCOOKIES',
       'RHOJOS', 'LOCARE', 'LOWD', 'GUSTO', 'GREATNONPROFITS',
       'TCBADGERSONELOVE', 'SAFEWAY.COM', 'TINE', 'FERNWOOD',
       'REDCAKERECYCLE', 'LBHC', 'PERFEITO', 'SHAPED', 'SOUNDNSIGHT',
       'SQUAWKMEET.COM', 'ESERENITIES', 'TRANSTAT', 'SYNCQUILITY',
       'OKAASAN', 'DIGITIZED', 'FOUNDED', 'TAGIDI', 'DANU',
       'RECESSIONSAFETYNET.COM', 'MSARTISTRY', 'AAVANTGARDE', 'ALOE.BLUE',
       'NORTHSIDEFRIED', 'ALLOW', 'FULAMINGO', 'LUXREDUX', 'NURSETAP',
       'ANNIESCAPES', 'SPADIX', 'SHOPSHAR', 'SURAYA', "ZELDA'S",
       'YEONDAE', 'MINUTECLINIC', 'EXESTHETICS', 

In [117]:
#  Address

In [118]:
from openclean_pattern.datatypes.resolver import AddressDesignatorResolver, GeoSpatialResolver

In [119]:
# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by an Address and Geospatial Resolver (identify addresses and countries)
dtr = DefaultTypeResolver(interceptors=[AddressDesignatorResolver()])

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters
rt = RegexTokenizer(type_resolver=dtr)

# create a new PatternFinder object with the data series and the TypeResolvers to use as part of the Tokenizer
column = 'Address '
pf = PatternFinder(
    series = df[column],
    tokenizer = rt
)

In [120]:
patterns = pf.find()

In [121]:
# address patterns ranked 10+ seem like possible outliers
for i in range(1, len(patterns)+1):
    pp.pprint("## Pattern Ranked {} for column: {}".format(i, column))
    pp.pprint(pf.top(i))
    pp.pprint(pf.top(i).freq)
    print()

'## Pattern Ranked 1 for column: Address '
PatternRow(NUMERIC(1-6) (1-1)ALPHA(1-12) (1-1)ALPHA(1-13) (1-1)STREET(2-9))
8353

'## Pattern Ranked 2 for column: Address '
PatternRow(NUMERIC(1-6) (1-1)ALPHA(1-14) (1-1)STREET(2-9))
5400

'## Pattern Ranked 3 for column: Address '
PatternRow(NUMERIC(1-5) (1-1)ALPHA(1-10) (1-1)ALPHA(2-12) (1-1)STREET(2-9) (1-1)SUD(3-5) (1-1)NUMERIC(1-5))
1474

'## Pattern Ranked 4 for column: Address '
PatternRow(NUMERIC(3-5) (1-1)ALPHA(1-9) (1-1)ALPHA(1-10) (1-1)STREET(3-8) (1-1)STREET(2-7))
682

'## Pattern Ranked 5 for column: Address '
PatternRow(NUMERIC(2-5) (1-1)ALPHA(1-5) (1-1)ALPHA(1-10) (1-1)STREET(2-6) (1-1)PUNC(#)NUMERIC(1-5))
155

'## Pattern Ranked 6 for column: Address '
PatternRow(NUMERIC(1-5) (1-1)ALPHA(5-14))
81

'## Pattern Ranked 7 for column: Address '
PatternRow(NUMERIC(3-5) (1-1)ALPHA(1-11) (1-1)STREET(2-7) (1-1)PUNC(#)NUMERIC(1-4))
80

'## Pattern Ranked 8 for column: Address '
PatternRow(NUMERIC(3-5) (1-1)ALPHA(1-2) (1-1)ALPHA(3-10) (1

In [122]:
for i in range(10,16):
    pp.pprint(df.loc[pf.top(i).idx, column].unique())

array(['57333', '41', '3518', '30636', '16101'], dtype=object)
array(['% IPM 1800 SW 1ST AVE STE 220', '% IPM 1800 SW 1ST AVE STE 200'],
      dtype=object)
array(['#2', '#162', '#7', '#226'], dtype=object)
array(['2738 SE 82ND AVE APT 101 #201A'], dtype=object)
array(['185TH AVE #158'], dtype=object)
array(['5305 RIVER RD N STE B KEIZER OR 97303'], dtype=object)


In [None]:
# Geospatial

In [155]:
# Combined Geo
geo = df[['Address ', 'Address Continued' ,'City' ,'State', 'Zip Code']]
geo.loc[:, 'combined'] = df['City'] + ' ' + df['Zip Code']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [156]:
geo['combined']

0          WENATCHEE 98801
1          WENATCHEE 98801
2             KEIZER 97303
3           NEW YORK 10020
4           NEW YORK 10020
               ...        
31445    OREGON CITY 97045
31446    OREGON CITY 97045
31447    OREGON CITY 97045
31448    OREGON CITY 97045
31449    OREGON CITY 97045
Name: combined, Length: 31450, dtype: object

In [157]:
# create a new DefaultTypeResolver object (identifies basic types)
# intercepted by an Address and Geospatial Resolver (identify addresses and countries)
dtr = DefaultTypeResolver(interceptors=[GeoSpatialResolver(levels=[])])

# create a new RegexTokenizer that'll tokenize the remaining values not identified by the type resolvers
# on all delimiters
rt = RegexTokenizer(type_resolver=dtr)

# create a new PatternFinder object with the data series and the TypeResolvers to use as part of the Tokenizer
column = 'combined'
pf = PatternFinder(
    series = geo[column],
    tokenizer = rt
)

In [158]:
pf.find()

{3: PatternRow(ALPHA(3-13) (1-1)NUMERIC(5-5)),
 5: PatternRow(ALPHA(2-9) (1-1)STREET(3-8) (1-1)NUMERIC(5-5)),
 7: PatternRow(salt(4-4) (1-1)lake(4-4) (1-1)city(4-4) (1-1)841XX(5-5))}