# Fingerprinting
Entsprechend der OpenRefine-Implementierung basierend auf der Implementierung in https://gist.github.com/cjdd3b/0386f139bb953f046c6e

In [1]:
# -*- coding: utf-8 -*-

import re, string
#from unidecode import unidecode

PUNCTUATION = re.compile('[ %s]' % re.escape(string.punctuation))

class Fingerprinter(object):
    '''
    Python implementation of Google Refine fingerprinting algorithm described here:
    https://github.com/OpenRefine/OpenRefine/wiki/Clustering-In-Depth
    Requires the unidecode module: https://github.com/iki/unidecode
    '''
    def __init__(self, string):
        self.string = self._preprocess(string)

    def _preprocess(self, string):
        '''
        Strip leading and trailing whitespace, lowercase the string, remove all punctuation,
        in that order.
        '''
        return PUNCTUATION.sub('', string.strip().lower())

    def _latinize(self, string):
        '''
        Replaces unicode characters with closest Latin equivalent. For example,
        Alejandro González Iñárritu becomes Alejando Gonzalez Inarritu.
        '''
        return string #.decode('utf-8')

    def _unique_preserving_order(self, seq):
        '''
        Returns unique tokens in a list, preserving order. Fastest version found in this
        exercise: http://www.peterbe.com/plog/uniqifiers-benchmark
        '''
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]
        
    def get_fingerprint(self):
        '''
        Gets conventional fingerpint.
        '''
        return self._latinize(''.join(
            self._unique_preserving_order(
                sorted(self.string.split())
            )
        ))

    def get_ngram_fingerprint(self, n=1):
        '''
        Gets ngram fingerpint based on n-length shingles of the string.
        Default is 1.
        '''
        return self._latinize(''.join(
            self._unique_preserving_order(
                sorted([self.string[i:i + n] for i in range(len(self.string) - n + 1)])
            )
        ))

In [4]:
def n_gram_fingerprinting (s, n=2):
    PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))
    s = ''.join(sorted(s.split(' ')))
    s = PUNCTUATION.sub('', s.strip().lower())
    sorted_ngrams = sorted([s[i:i + n] for i in range(len(s) - n + 1)])
    seen = set()
    seen_add = seen.add
    return ''.join([x for x in sorted_ngrams if not (x in seen or seen_add(x))])

In [33]:
f = Fingerprinter('Tom Cruise')
print(f.get_fingerprint())
print(f.get_ngram_fingerprint(n=2))

tomcruise
crismcomrusetoui


In [34]:
f = Fingerprinter('Cruise, Tom')
print(f.get_fingerprint())
print(f.get_ngram_fingerprint(n=2))

cruisetom
cretisomrusetoui


In [14]:
f = Fingerprinter('Pflege Basis-Kurs')
print(f.get_ngram_fingerprint(n=2))
print(n_gram_fingerprinting('Pflege Basis Kurs',n=1))

asbaebegflgeiskulepfrssiskur
abefgiklprsu


In [17]:
f = Fingerprinter('Pflege-Basiskurs')
print(f.get_ngram_fingerprint(n=2))
print(n_gram_fingerprinting('Pflege-Basis-Kurs',n=1))

asbaebegflgeiskulepfrssiskur
abefgiklprsu


In [19]:
f = Fingerprinter('Pflegebasiskurs')
print(f.get_ngram_fingerprint(n=2))
print(n_gram_fingerprinting('Kulturauschuss',n=2))

asbaebegflgeiskulepfrssiskur
auchhukultrascsstuulurus


In [60]:
f = Fingerprinter('Basis Kurs "Pflege"')
print(f.get_fingerprint())
print(f.get_ngram_fingerprint(n=2))

basiskurspflege
asbaegflgeiskulepfrssiskspur


In [25]:
print(n_gram_fingerprinting('flache Flasche',n=2))
print(n_gram_fingerprinting('Sauerstoffflasche',n=2))

acaschefflhelasc
asaucherffflhelaofrssascsttoue


In [10]:
print(n_gram_fingerprinting('Delikatesssenf',n=2))
print(n_gram_fingerprinting('Delikatessensenf',n=2))

atdeelenesikkalinfsesste
atdeelenesikkalinfnssesste
