In [1]:
import re

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

def splitName(s):
    '''
    Split an author's name into a list of lower-case word tokens.

    Punctuation is discarded (tokens are maximal runs of ``\\w`` characters)
    and Italian stopwords are removed.

    The stopword check is done on the lower-cased token, so the result does
    not depend on how the name was capitalised ("Da Ponte" and "da Ponte"
    give the same tokens). A side effect is that an initial whose lower-case
    form is itself listed as a stopword is dropped regardless of its case.

    Parameters
    ----------
    s : str
        The raw author string.

    Returns
    -------
    list of str
        Lower-cased tokens in their original order, stopwords excluded.
    '''
    tokenizer = RegexpTokenizer(r'\w+')
    # A set gives O(1) membership tests instead of scanning the list per token.
    stop = set(stopwords.words('italian'))
    # Lower-case first: the stopword list is compared against the normalised
    # form, otherwise capitalised stopwords would slip through the filter.
    lowered = (token.lower() for token in tokenizer.tokenize(s))
    return [token for token in lowered if token not in stop]

In [2]:
def yearBlur(yearStr, minYear=1000, maxYear=2000):
    '''
    Extract the years found in a string and blur each one by +/- 1.

    A "year" is a run of exactly four ASCII digits whose integer value lies
    strictly between ``minYear`` and ``maxYear``.

    Parameters
    ----------
    yearStr : str, int or None
        Text that may contain one or more years (e.g. ``"1847."``).
        ``None`` yields an empty list; other values are converted with
        ``str()`` before scanning.
    minYear : int, optional
        Exclusive lower bound for a number to count as a year (default 1000).
    maxYear : int, optional
        Exclusive upper bound for a number to count as a year (default 2000).

    Returns
    -------
    list of int
        For every year ``Y`` in order of appearance: ``Y``, ``Y - 1``,
        ``Y + 1``. A value that has already been emitted is not repeated,
        so adjacent or duplicated years do not produce duplicate entries.
    '''
    if yearStr is None:
        return []

    blurList = []
    for digits in re.findall(r'[0-9]+', str(yearStr)):
        # Only four-digit runs can be years; this also rejects longer numbers
        # such as page ranges or identifiers.
        if len(digits) != 4:
            continue
        y = int(digits)
        if not (minYear < y < maxYear):
            continue
        # Exact year first, then its two neighbours.
        for candidate in (y, y - 1, y + 1):
            if candidate not in blurList:
                blurList.append(candidate)
    return blurList

In [3]:
class HashGenerator():
    '''
    Base class for citation hash generators.

    Subclasses override ``generate`` to produce the hash keys for one
    citation; the subclasses in this notebook implement it as a generator
    that yields strings.
    '''
    def generate(self, citation):
        '''
        Produce the hash keys for ``citation``.

        Parameters
        ----------
        citation : dict
            Citation record; the subclasses in this notebook read keys such
            as ``'author'`` and ``'year'`` from it.

        Raises
        ------
        NotImplementedError
            Always, in the base class; subclasses must override this method.
        '''
        raise NotImplementedError

In [4]:
class CitationAuthorHashGenerator(HashGenerator):
    '''
    Hash generator that uses the author field only.
    '''
    def generate(self, citation):
        '''
        Yield up to four hash keys, one per token of ``citation['author']``
        as returned by ``splitName``.
        '''
        # Open question: how should abbreviated names (initials) be handled?
        #   1. Ignore them, e.g. keep only tokens longer than one character.
        #   2. Build extra combined keys,
        #      e.g. John Smith -> [JSmith, SJohn, John, Smith]
        # Only the first four tokens are used as hashes.
        yield from splitName(citation['author'])[:4]

In [5]:
class CitationAuthorYearHashGenerator(HashGenerator):
    '''
    Hash generator that combines the author field with the (blurred) year.
    '''
    def generate(self, citation):
        '''
        Yield ``"<token>#<year>"`` for each of the first four author tokens
        (from ``splitName``) paired with each value returned by ``yearBlur``
        for ``citation['year']``. Tokens vary slowest, years fastest.
        '''
        # Open question: how should abbreviated names (initials) be handled?
        #   1. Ignore them, e.g. keep only tokens longer than one character.
        #   2. Build extra combined keys,
        #      e.g. John Smith -> [JSmith, SJohn, John, Smith]
        names = splitName(citation['author'])[:4]  # first four tokens only
        years = yearBlur(citation['year'])
        yield from ("#".join((name, str(y))) for name in names for y in years)

In [None]:
class CitationAuthorYearNumHashGenerator(HashGenerator):
    '''
    TODO: hash generator combining author, year and all other numbers.

    Unfinished: which numbers should take part in the hash is still open.
    '''
    def generate(self, citation):
        '''
        Yield ``"<token>#<year>#<n1>#<n2>..."`` for each of the first four
        author tokens paired with each blurred year, followed by every
        element obtained by iterating ``citation['number']``.
        '''
        # Open question: how should abbreviated names (initials) be handled?
        #   1. Ignore them, e.g. keep only tokens longer than one character.
        #   2. Build extra combined keys,
        #      e.g. John Smith -> [JSmith, SJohn, John, Smith]
        names = splitName(citation['author'])[:4]  # first four tokens only
        years = yearBlur(citation['year'])
        # TODO: decide which numbers to use.
        numbers = citation['number']

        for name in names:
            for y in years:
                # All numbers are appended to every author/year pair.
                yield "#".join([name, str(y), *(str(n) for n in numbers)])

In [6]:
# Sample citation record used to try out the hash generators.
# The values are raw strings that still carry trailing punctuation
# ("Padova,", "1847."); splitName and yearBlur tokenize them with regexes.
# Open question: how to define a full citation?
# Open question: what if some articles contain no such full citation?
citation = {
    "author" :  "M. Sañudo",
    "title"  :  "Itinerario per la terra di Venezia nel 1483,",
    "place"  :  "Padova,",
    "year"   :  "1847."
}

In [7]:
# One instance of each finished generator: g1 hashes on author tokens only,
# g2 on author tokens combined with the blurred year.
g1 = CitationAuthorHashGenerator()
g2 = CitationAuthorYearHashGenerator()

In [8]:
# generate() is a generator; wrap it in list() to display the author-only hashes.
list(g1.generate(citation))

['m', 'sañudo']

In [9]:
# Each author token is paired with the year, the year before and the year after.
list(g2.generate(citation))

['m#1847', 'm#1846', 'm#1848', 'sañudo#1847', 'sañudo#1846', 'sañudo#1848']