# Identify Patterns for Date Columns from Socrata Datasets

Identify patterns for values in a single dominant cluster of terms from columns labeled as **calendar date** in Socrata datasets.

In [1]:
# All example column files are in the directory 'resources/data/socrata/dates',
# which is referenced below relative to this notebook's location.
# Files are gzip-compressed and tab-delimited with two columns: term and frequency count.

import os

# Directory that holds the example date-column files; the path is relative to
# the working directory the notebook is run from.
inputdir = '../resources/data/socrata/dates'
# Names (not full paths) of all entries in the directory. Later cells join
# each name with `inputdir` before opening it.
files = os.listdir(inputdir)

# Display the file names as the cell result.
files

['EMPLOYMENT_END_DATE.11.g9vh-zeiw.txt.gz',
 'DATA_FI_PRORROGA.23.hb6v-jcbf.txt.gz',
 'AM_Count.32.w76s-c5u4.txt.gz',
 'COISSUEDDATE.26.94uh-66xv.txt.gz']

In [2]:
# The pattern discovery function takes a data stream over a column's terms
# (plus an optional sample size and coverage threshold). It first draws a
# sample of terms from the stream. We then look for a cluster that covers a
# large fraction of the sampled terms and derive a pattern for that cluster
# as the function result.

from openclean.pipeline import stream
from openclean_pattern.collect.group import Group
from openclean_pattern.regex.compiler import DefaultRegexCompiler
from openclean_pattern.tokenize.factory import TokenizerFactory


# Shared pipeline components, listed in the order find_pattern() applies them.
# Step 1: split each term into tokens (the 'default' tokenizer of the factory).
tokenizer = TokenizerFactory.create_tokenizer('default')
# Step 2: group the tokenized terms into clusters.
collector = Group()
# Step 3: compile a cluster into a pattern.
# NOTE(review): method='col' presumably selects column-wise compilation -
# confirm against the DefaultRegexCompiler documentation.
compiler = DefaultRegexCompiler(method='col')


def find_pattern(ds, sample_size=1000, threshold=0.9):
    """Derive a pattern for the dominant cluster of terms in a column.

    A sample of terms is drawn from the stream, tokenized, and grouped into
    clusters. The largest cluster is compiled into a pattern if it covers at
    least ``threshold`` of the sampled terms.

    Parameters
    ----------
    ds:
        Data stream over the column; it must provide a 'term' column.
    sample_size: int, default=1000
        Number of terms to sample from the stream.
    threshold: float, default=0.9
        Minimum fraction of the sampled terms that the largest cluster has
        to cover for a pattern to be returned.

    Returns
    -------
    The compiled patterns for the largest cluster, or None if the sample is
    empty or no cluster satisfies the threshold.
    """
    # Get a sample of terms from the column. The random state is fixed so
    # that repeated runs draw the same sample.
    terms = list(ds.sample(sample_size, random_state=42).to_df()['term'])
    if not terms:
        # Nothing to tokenize; this also avoids a division by zero below.
        return None
    # Tokenize and convert tokens into representation.
    tokenized_terms = tokenizer.encode(terms)
    # Group tokenized terms by number of tokens.
    clusters = collector.collect(tokenized_terms)
    # Select the largest cluster explicitly. Returning the first cluster
    # that meets the threshold is only equivalent for thresholds above 0.5,
    # where at most one cluster can qualify.
    term_ids = max(clusters.values(), key=len, default=None)
    if term_ids is None or len(term_ids) / len(terms) < threshold:
        # Even the largest cluster is too small (or there are no clusters).
        return None
    # Compile the cluster as the single group with key 0 and return the
    # patterns generated for that group.
    return compiler.compile(tokenized_terms, {0: term_ids})[0]

In [3]:
from openclean.function.eval.base import Eval


for filename in files:
    print('processing {}'.format(filename))
    # Open the compressed, tab-delimited file as a data stream and keep
    # only the column with the terms.
    path = os.path.join(inputdir, filename)
    ds = stream(path, header=['term', 'freq'], delim='\t', compressed=True).select('term')
    # Try to derive a pattern from a sample of 100 terms.
    patterns = find_pattern(ds, sample_size=100)
    if not patterns:
        # No pattern was found for this column; move on to the next file.
        print('\n\tno pattern.\n')
        continue
    print('\n\tPattern: {}\n'.format(patterns))
    # Turn the top pattern into a negated matching function and use it to
    # collect the distinct terms in the column that do not match the pattern.
    pattern = patterns.top(n=1, pattern=True)
    func = pattern.compile(negate=True, tokenizer=tokenizer)
    outliers = ds.filter(Eval(columns='term', func=func)).distinct()
    if not outliers:
        print('\tno outliers')
    else:
        for key in outliers:
            print('\t{}'.format(key))
    print('\n')

processing EMPLOYMENT_END_DATE.11.g9vh-zeiw.txt.gz

	Pattern: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) 20XX(4-4))

	no outliers


processing DATA_FI_PRORROGA.23.hb6v-jcbf.txt.gz

	Pattern: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) 20XX(4-4))

	no outliers


processing AM_Count.32.w76s-c5u4.txt.gz

	Pattern: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) NUMERIC(4-4))

	no outliers


processing COISSUEDDATE.26.94uh-66xv.txt.gz

	Pattern: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) 20XX(4-4) \S() 12(2-2) PUNC(:) 00(2-2) PUNC(:) 00(2-2) \S() am(2-2))

	no outliers




## Using the PatternFinder

In [4]:
from openclean_pattern.opencleanpatternfinder import OpencleanPatternFinder as PF

In [5]:
# Pattern finder that reuses the tokenizer, collector and compiler
# instances created for find_pattern() above.
# NOTE(review): frac=1 presumably means "use all values, do not sample" and
# distinct=True "consider each distinct value once" - confirm against the
# OpencleanPatternFinder documentation.
opf = PF(
    frac=1,
    distinct=True,
    tokenizer=tokenizer,
    collector=collector,
    compiler=compiler,
)

In [6]:
for filename in files:
    print('processing {}'.format(filename))
    # Read the term column of the compressed, tab-delimited file via a data
    # frame; `ds` is the resulting 'term' column (a pandas Series).
    ds = stream(os.path.join(inputdir, filename), header=['term', 'freq'], delim='\t', compressed=True)\
        .select('term').to_df()['term']
    # Run the pattern finder over all terms in the column.
    patterns = opf.find(ds.to_list())
    if patterns:
        print('\n\tPattern: {}\n'.format(patterns))
        # Select the terms that the pattern finder flagged as outliers.
        outliers = opf.outliers
        outlier_terms = ds[outliers]
        if outlier_terms.empty:
            print('\tno outliers')
        else:
            # Convert the terms to a list *before* formatting. Calling
            # .tolist() on the formatted string raises an AttributeError
            # as soon as a column contains any outlier.
            print('\toutliers: {}'.format(outlier_terms.tolist()))
        print('\n')
    else:
        print('\n\tno pattern.\n')

processing EMPLOYMENT_END_DATE.11.g9vh-zeiw.txt.gz

	Pattern: {5: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) 20XX(4-4))}

	no outliers


processing DATA_FI_PRORROGA.23.hb6v-jcbf.txt.gz

	Pattern: {5: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) NUMERIC(4-4))}

	no outliers


processing AM_Count.32.w76s-c5u4.txt.gz

	Pattern: {5: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) NUMERIC(4-4))}

	no outliers


processing COISSUEDDATE.26.94uh-66xv.txt.gz

	Pattern: {13: RowPatterns(NUMERIC(2-2) PUNC(/) NUMERIC(2-2) PUNC(/) 20XX(4-4) \S() 12(2-2) PUNC(:) 00(2-2) PUNC(:) 00(2-2) \S() am(2-2))}

	no outliers


