In [1]:
import re

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from unidecode import unidecode

def splitName(s):
    '''
    Split an author's name into a list of normalised tokens.

    The string is tokenised on runs of word characters, so separators
    such as spaces, dots and commas are dropped. Tokens found in NLTK's
    Italian stopword list, and tokens that are a single character long
    (e.g. the initial in "J. Smith"), are discarded. The remaining tokens
    are lower-cased and transliterated to ASCII with unidecode.

    :param s: author string, e.g. "M. Sañudo".
    :return: list of normalised tokens, in their original order.
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    # Membership is tested once per token, so use a set instead of
    # scanning the stopword list linearly each time.
    stop = set(stopwords.words('italian'))
    # NOTE(review): the stopword test runs on the token as written, before
    # lower-casing. Assuming the stopword list is lower-case, capitalised
    # forms such as "Di" are kept -- confirm this is intended.
    return [unidecode(tok.lower())
            for tok in tokenizer.tokenize(s)
            if len(tok) > 1 and tok not in stop]

In [2]:
def getYear(yearStr, minYear=1001, maxYear=1999):
    '''
    Return the list of years found in a string.

    Every maximal run of ASCII digits in yearStr is considered. A run is
    accepted as a year when it is exactly four digits long and its value
    lies in the inclusive range [minYear, maxYear].

    :param yearStr: free text that may contain years, e.g. "1847.".
    :param minYear: smallest value accepted as a year (inclusive).
    :param maxYear: largest value accepted as a year (inclusive). The
        default keeps the historical behaviour of rejecting 2000 and
        later; pass a larger value to accept more recent years.
    :return: list of int years, in order of appearance (duplicates kept).
    '''
    years = []
    for digits in re.findall(r'[0-9]+', yearStr):
        if len(digits) != 4:
            continue
        value = int(digits)  # convert once instead of once per comparison
        if minYear <= value <= maxYear:
            years.append(value)
    return years

In [3]:
def blurYear(yearStr):
    '''
    Expand each year found in yearStr to the year and its two neighbours.

    For every year Y returned by getYear(yearStr), the result contains
    Y, Y - 1 and Y + 1, in that order. Duplicates are not removed.
    '''
    return [blurred
            for y in getYear(yearStr)
            for blurred in (y, y - 1, y + 1)]

In [4]:
class HashGenerator():
    '''
    Base class for citation hash generators.

    Subclasses override generate() to produce the hashes of a citation.
    '''
    def generate(self, citation):
        '''
        Produce the hashes for a citation; subclasses must override this.

        :param citation: the citation to hash.
        :raises NotImplementedError: always, in this base class.
        '''
        raise NotImplementedError

In [5]:
class CitationAuthorHashGenerator(HashGenerator):
    '''
    Hash generator that uses the author only.
    '''
    def generate(self, citation):
        '''
        Yield the first four name tokens of citation['author'], one
        hash per token.
        '''
        # Only the leading four tokens of the author are used as hashes.
        yield from splitName(citation['author'])[:4]

In [6]:
class CitationAuthorYearHashGenerator(HashGenerator):
    '''
    Hash generator that combines the author with the year.
    '''
    def generate(self, citation):
        '''
        Yield "<token>#<year>" for each of the first four author tokens
        paired with each year extracted from citation['year'].
        '''
        tokens = splitName(citation['author'])[:4]  # first four tokens only
        years = getYear(citation['year'])

        for token in tokens:
            for y in years:
                yield "#".join((token, str(y)))

In [7]:
class CitationAuthorBlurYearHashGenerator(HashGenerator):
    '''
    Hash generator that combines the author with the blurred year.
    '''
    def generate(self, citation):
        '''
        Yield "<token>#<year>" for each of the first four author tokens
        paired with each value returned by blurYear(citation['year']).
        '''
        blurred = blurYear(citation['year'])
        names = splitName(citation['author'])

        # Only the leading four author tokens contribute hashes.
        for name in names[:4]:
            for y in blurred:
                yield "{}#{}".format(name, y)

In [None]:
class CitationAuthorYearNumHashGenerator(HashGenerator):
    '''
    TODO: Hash Generator with Author and Year and all other Numbers
    '''
    def generate(self, citation):
        '''
        Yield "<token>#<year>#<n1>#<n2>..." for each of the first four
        author tokens paired with each blurred year, followed by every
        element of citation['number'].
        '''
        author = splitName(citation['author'])
        # The helper defined in this file is blurYear; the previous name
        # "yearBlur" does not exist and raised NameError.
        # NOTE(review): the class name suggests plain years (getYear) may
        # have been intended instead -- confirm.
        year = blurYear(citation['year'])
        # TODO: decide which numbers to use.
        # NOTE(review): presumably a list of numbers; a plain string would
        # be split into single characters here -- confirm the expected type.
        num = citation['number']
        # The number suffix is the same for every hash, so build it once.
        suffix = "".join("#" + str(n) for n in num)

        for a in author[:4]:  # Take only the first four in author as hash
            for y in year:
                yield a + "#" + str(y) + suffix

Test

In [8]:
# Sample citation used to try out the hash generators.
# Open questions: how should a "full" citation be defined, and what
# should happen for articles that contain no such full citation?
citation = {
    "author": "M. Sañudo",
    "title": "Itinerario per la terra di Venezia nel 1483,",
    "place": "Padova,",
    "year": "1847.",
}

In [9]:
# One instance of each implemented generator: author only, author + year,
# and author + blurred year.
g1, g2, g3 = (
    CitationAuthorHashGenerator(),
    CitationAuthorYearHashGenerator(),
    CitationAuthorBlurYearHashGenerator(),
)

In [10]:
# Print the full list of hashes each generator produces for the sample citation.
for generator in (g1, g2, g3):
    hashes = list(generator.generate(citation))
    print(hashes)

['sanudo']
['sanudo#1847']
['sanudo#1847', 'sanudo#1846', 'sanudo#1848']
