## Identifying patterns in a date column

In [1]:
import pandas as pd

In [2]:
# Load the one-column file of raw date strings as a Series (named 0, since
# the file has no header row).
# The `squeeze=True` argument of read_csv was deprecated in pandas 1.4 and
# removed in 2.0; squeezing the single-column frame afterwards is the
# documented replacement and yields the same Series.
dates = pd.read_csv('../resources/dev/dates.txt', header=None).squeeze('columns')
# Fixed seed so the displayed sample is reproducible
dates.sample(5, random_state=42)

0       2015-16
5       2012-13
46       FY2019
31    2009-2010
13      2013/14
Name: 0, dtype: object

In [3]:
# Tokenize the column: every value becomes a tuple of string tokens,
# e.g. '2015-16' -> ('2015', '-', '16').
# NOTE(review): the last line displays the token tuples for the whole column.
from openclean_pattern.tokenize.factory import TokenizerFactory
tokenizer = TokenizerFactory.create_tokenizer('default')
tokenizer.tokenize(dates)

[('2015', '-', '16'),
 ('2016', '-', '17'),
 ('2017', '-', '18'),
 ('2014', '-', '15'),
 ('2013', '-', '14'),
 ('2012', '-', '13'),
 ('2015', '/', '16'),
 ('2016', '/', '17'),
 ('2011', '-', '12'),
 ('2014', '/', '15'),
 ('2017', '/', '18'),
 ('2010', '-', '11'),
 ('2012', '/', '13'),
 ('2013', '/', '14'),
 ('2018', '-', '19'),
 ('2013', '-', '2014'),
 ('2015', '-', '2016'),
 ('2011', '-', '2013'),
 ('2013', '-', '2015'),
 ('2015', '-', '2017'),
 ('2009', '-', '2011'),
 ('2012', '-', '2014'),
 ('2014', '-', '2016'),
 ('2016', '-', '2017'),
 ('2010', '-', '2012'),
 ('2011', '-', '2012'),
 ('2014', '-', '2015'),
 ('2010', '-', '2011'),
 ('2011', '/', '12'),
 ('2012', '-', '2013'),
 ('2007', '-', '2008'),
 ('2009', '-', '2010'),
 ('2010', '/', '11'),
 ('2017', '-', '2018'),
 ('2008', '-', '2009'),
 ('2009', '/', '10'),
 ('2011', '–', '12'),
 ('2006', '-', '2007'),
 ('2007', '-', '2009'),
 ('2008', '-', '2010'),
 ('2013', '–', '14'),
 ('2014', '–', '15'),
 ('2015', '–', '16'),
 ('2018', '/

In [4]:
# Tokenize and convert column into internal token representations.
# Each token is shown with a type tag (NUMERIC, PUNC, ALPHA, ...), a number
# that appears to be the token length, and the token text.
# `enc` is reused below by the clustering and pattern-compilation cells.
enc = tokenizer.encode(dates)
enc

[(_'NUMERIC'_(4,'2015'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'16')),
 (_'NUMERIC'_(4,'2016'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'17')),
 (_'NUMERIC'_(4,'2017'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'18')),
 (_'NUMERIC'_(4,'2014'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'15')),
 (_'NUMERIC'_(4,'2013'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'14')),
 (_'NUMERIC'_(4,'2012'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'13')),
 (_'NUMERIC'_(4,'2015'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'16')),
 (_'NUMERIC'_(4,'2016'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'17')),
 (_'NUMERIC'_(4,'2011'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'12')),
 (_'NUMERIC'_(4,'2014'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'15')),
 (_'NUMERIC'_(4,'2017'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'18')),
 (_'NUMERIC'_(4,'2010'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'11')),
 (_'NUMERIC'_(4,'2012'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'13')),
 (_'NUMERIC'_(4,'2013'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'14')),
 (_'NUMERIC'_(4,'2018'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'19')),
 (_'NUMERIC'_(4,'2013'), _'PUNC'_(1,'-')

In [5]:
# Collect similar rows together: group the encoded rows into clusters

# Alternative way to build the collector through the factory (left disabled):
# from openclean_pattern.align.factory import CollectorFactory
# collector = CollectorFactory.create_collector('cluster')

# min_samples=1 so that no row ends up under the noise label (-1) of DBSCAN
from openclean_pattern.align.cluster import Cluster
collector = Cluster(min_samples=1)

# `col` maps each cluster label to the indices of the rows of `enc` in that cluster
col = collector.collect(enc)

In [6]:
# Encoded rows clustered together, shown as {cluster label: [row indices into enc]}.
# -1 is the noise label, but it doesn't exist here because we set min_samples=1 in DBSCAN.
col

defaultdict(None,
            {0: [0,
              1,
              2,
              3,
              4,
              5,
              6,
              7,
              8,
              9,
              10,
              11,
              12,
              13,
              14,
              15,
              16,
              17,
              18,
              19,
              20,
              21,
              22,
              23,
              24,
              25,
              26,
              27,
              28,
              29,
              30,
              31,
              32,
              33,
              34,
              35,
              36,
              37,
              38,
              39,
              40,
              41,
              42,
              43,
              47,
              48,
              49,
              50,
              51,
              52,
              53,
              54,
              55,
              56,
              57,

In [7]:
# Samples of encoded rows in each cluster: print the cluster label, then the
# first seven encoded rows that belong to it, then a blank separator line.
for cluster_id, row_ids in col.items():
    print(cluster_id)
    for row_id in row_ids[:7]:
        print(enc[row_id])
    print()

0
(_'NUMERIC'_(4,'2015'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'16'))
(_'NUMERIC'_(4,'2016'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'17'))
(_'NUMERIC'_(4,'2017'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'18'))
(_'NUMERIC'_(4,'2014'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'15'))
(_'NUMERIC'_(4,'2013'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'14'))
(_'NUMERIC'_(4,'2012'), _'PUNC'_(1,'-'), _'NUMERIC'_(2,'13'))
(_'NUMERIC'_(4,'2015'), _'PUNC'_(1,'/'), _'NUMERIC'_(2,'16'))

1
(_'ALPHANUM'_(6,'fy2016'),)
(_'ALPHANUM'_(6,'fy2017'),)
(_'ALPHANUM'_(6,'fy2019'),)
(_'ALPHANUM'_(6,'fy2018'),)

2
(_'ALPHA'_(2,'fy'), _'\\S'_(1,' '), _'NUMERIC'_(4,'2015'))



In [8]:
# Compile patterns in each cluster: the compiler receives the encoded rows
# together with the cluster -> row-index mapping built above.
from openclean_pattern.regex.compiler import DefaultRegexCompiler

compiler = DefaultRegexCompiler()
# `compiled` is keyed by cluster label (one RowPatterns object per cluster)
compiled = compiler.compile(enc, col)

In [9]:
# Display the compiled patterns, keyed by cluster label
compiled

{0: RowPatterns(DIGIT PUNCTUATION DIGIT),
 1: RowPatterns(ALPHANUM),
 2: RowPatterns(ALPHA SPACE_REP DIGIT)}

In [10]:
# Coverage of patterns in each cluster: print every cluster label next to
# the statistics reported by its compiled patterns (pattern -> float value).
for cluster_id, row_patterns in compiled.items():
    print(cluster_id, row_patterns.stats())

0 defaultdict(<class 'float'>, {'DIGIT PUNCTUATION DIGIT': 1.0})
1 defaultdict(<class 'float'>, {'ALPHANUM': 1.0})
2 defaultdict(<class 'float'>, {'ALPHA SPACE_REP DIGIT': 1.0})
