In [1]:
from nltk import ngrams
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from unidecode import unidecode
from collections import Counter
import math

In [29]:
def _stopword_set():
    """
    Return the NLTK stopword list as a frozenset, built once and reused.

    ``stopwords.words()`` is called without arguments, exactly as the
    original code did, so the set of filtered words is unchanged.
    NOTE(review): with no language argument this presumably spans every
    installed stopword language -- confirm that this is intended.
    """
    cached = getattr(_stopword_set, "_cache", None)
    if cached is None:
        # Loading the corpus is the expensive part; do it on first use only.
        # A frozenset gives constant-time membership tests instead of a
        # linear scan of a list for every token.
        cached = frozenset(stopwords.words())
        _stopword_set._cache = cached
    return cached


def tokenize(s):
    """
    Tokenize a string into a list of normalized tokens.

    Steps, in order:
      * split on runs of word characters (this drops punctuation);
      * drop tokens whose lower-cased form is a stopword;
      * drop single-character tokens, e.g. "J. Smith" -> ["smith"];
      * lower-case the survivors and strip accents with ``unidecode``.

    The stopword check is done on the lower-cased token *before* accents
    are removed.

    Parameters
    ----------
    s : str
        The text to tokenize.

    Returns
    -------
    list of str
        The normalized tokens, in their original order.
    """
    stop = _stopword_set()
    tokenizer = RegexpTokenizer(r'\w+')

    kept = []
    for token in tokenizer.tokenize(s):
        lowered = token.lower()
        if len(token) > 1 and lowered not in stop:
            kept.append(unidecode(lowered))
    return kept

In [30]:
def getNgrams(s, n):
    """
    Build the list of n-grams of the tokens of a string.

    The string is first passed through ``tokenize``; the resulting tokens
    are then grouped into n-grams of size ``n``.
    """
    tokens = tokenize(s)
    grams = ngrams(tokens, n)
    return list(grams)

In [38]:
# Ported from the Scala code

def interSimilarity(a, b):
    """
    Calculate the intersection similarity of two collections.

    Both inputs are counted as multisets. The score is twice the size of
    their multiset intersection divided by the sum of their sizes.
    Returns 0.0 when both inputs are empty.
    """
    counts_a = Counter(a)
    counts_b = Counter(b)

    total = sum(counts_a.values()) + sum(counts_b.values())
    if not total:
        return 0.0

    # Counter & Counter keeps, per element, the smaller of the two counts.
    shared = sum((counts_a & counts_b).values())
    return 2 * float(shared) / total

In [39]:
def trigramSimilarity(s1, s2):
    """
    Calculate the intersection similarity of the token trigrams
    of two strings.
    """
    trigrams1 = getNgrams(s1, 3)
    trigrams2 = getNgrams(s2, 3)
    return interSimilarity(trigrams1, trigrams2)

In [40]:
def tokenSimilarity(s1, s2):
    """
    Calculate the intersection similarity of the tokens of two strings.
    """
    tokens1 = tokenize(s1)
    tokens2 = tokenize(s2)
    return interSimilarity(tokens1, tokens2)

In [34]:
# Source: https://rosettacode.org/wiki/Longest_common_subsequence
# Modified: the return type was changed so that the LCS is returned as a list

def lcs(a, b):
    """
    Return the longest common subsequence of two sequences as a list.

    Parameters
    ----------
    a, b : sequence
        Indexable sequences with a length (e.g. lists of tokens); their
        elements are compared with ``==``.

    Returns
    -------
    list
        The elements of one longest common subsequence, in order.
        Empty when either input is empty or nothing is shared.
    """
    # lengths[i][j] is the LCS length of a[:i] and b[:j];
    # row 0 and column 0 stay 0 (empty prefix).
    lengths = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
    for i, x in enumerate(a):
        for j, y in enumerate(b):
            if x == y:
                lengths[i+1][j+1] = lengths[i][j] + 1
            else:
                lengths[i+1][j+1] = max(lengths[i+1][j], lengths[i][j+1])

    # Walk back from the bottom-right corner. Matches are found last to
    # first, so collect them with append and reverse once at the end
    # rather than paying for insert(0, ...) on every match.
    reversed_result = []
    x, y = len(a), len(b)
    while x != 0 and y != 0:
        if lengths[x][y] == lengths[x-1][y]:
            x -= 1
        elif lengths[x][y] == lengths[x][y-1]:
            y -= 1
        else:
            # The cell grew on the diagonal, which only happens
            # when a[x-1] == b[y-1].
            reversed_result.append(a[x-1])
            x -= 1
            y -= 1
    reversed_result.reverse()
    return reversed_result

In [35]:
def lcsSimilarity(s1, s2):
    """
    Calculate the LCS similarity of two strings.

    Both strings are tokenized; the score is the length of the longest
    common subsequence of the token lists divided by the length of the
    shorter list. Returns 0.0 when either string yields no tokens.
    """
    tokens1 = tokenize(s1)
    tokens2 = tokenize(s2)

    shortest = min(len(tokens1), len(tokens2))
    if not shortest:
        return 0.0

    common = lcs(tokens1, tokens2)
    return len(common) / shortest

In [60]:
def matchSimilarity(s1, s2):
    """
    Calculate the match similarity: 1.0 when the two values compare
    equal, 0.0 otherwise.
    """
    return 1.0 if s1 == s2 else 0.0

In [59]:
def featureVectorBuilder(ref, seed_ref):
    """
    Build the similarity feature vector of a reference against a seed
    reference.

    Both arguments are indexed with the keys 'author', 'title',
    'publisher' and 'year'. The result is a dict with one similarity
    score per feature: trigram and token similarity for author and
    title, LCS similarity for publisher, and exact match for year.
    """
    return {
        'authorTrigram': trigramSimilarity(ref['author'], seed_ref['author']),
        'authorToken': tokenSimilarity(ref['author'], seed_ref['author']),
        'titleTrigram': trigramSimilarity(ref['title'], seed_ref['title']),
        'titleToken': tokenSimilarity(ref['title'], seed_ref['title']),
        'publisherLCS': lcsSimilarity(ref['publisher'], seed_ref['publisher']),
        'yearMatch': matchSimilarity(ref['year'], seed_ref['year']),
    }

---

In [52]:
# Quick check of the three string similarities on a longer and a shorter
# string; the scores are printed in the order trigram, token, LCS.
# NOTE(review): `b` has at most two tokens, so it presumably yields no
# trigrams, which would explain a trigram score of 0.0 -- confirm.
a = "Some random title and publisher"
b = "title publisher"

print(trigramSimilarity(a, b), tokenSimilarity(a, b), lcsSimilarity(a, b))

0.0 0.8 1.0
