In [18]:
import sys
import re, math, collections

def tokenize(_str, stopwords=None):
    """Count the word tokens of a text, ignoring stop words and 1-character tokens.

    The text is split with the regex ``\\w+`` and every token is lower-cased
    before it is filtered and counted.

    Args:
        _str: The text to tokenize.
        stopwords: Optional iterable of strings to ignore. Entries are compared
            against the *lower-cased* tokens, so they should be lower-case
            themselves. When omitted, a small built-in English list is used.

    Returns:
        A ``collections.defaultdict(int)`` mapping each kept token to the
        number of times it occurs. Because it is a defaultdict, reading a
        missing key with ``tokens[key]`` inserts that key with count 0; use
        ``key in tokens`` or ``tokens.get(key, 0)`` for lookups.
    """
    if stopwords is None:
        stopwords = (
            'and', 'for', 'if', 'too', 'as', 'the', 'then', 'be',
            'is', 'are', 'will', 'in', 'it', 'to', 'that',
        )
    # A set gives constant-time membership tests inside the loop.
    ignored = frozenset(stopwords)

    tokens = collections.defaultdict(int)
    for match in re.finditer(r"\w+", _str):
        word = match.group(0).lower()
        # The length test is applied to the lower-cased form, as before.
        if len(word) < 2 or word in ignored:
            continue
        tokens[word] += 1
    return tokens
#end of tokenize

def kldiv(_s, _t):
    """Score how far the term distribution of ``_s`` is from that of ``_t``.

    Both arguments map terms to occurrence counts. Counts are normalised to
    probabilities, and the result is the sum, over the terms of ``_s`` only, of
    ``(p - q) * log(p / q)``. Here ``p`` is the term's probability in ``_s``
    and ``q`` its smoothed probability in ``_t``. Note that the per-term
    expression is ``(p - q) * log(p / q)``, not the plain ``p * log(p / q)``.

    Smoothing (back-off):
        * a term of ``_s`` that is absent from ``_t`` gets ``q = epsilon``,
          where epsilon is 0.001 times the smallest term probability found in
          either distribution;
        * a term present in ``_t`` gets ``q = gamma * P(term | _t)`` with
          ``gamma = 1 - (number of absent terms) * epsilon``.

    The score is not symmetric: ``kldiv(a, b)`` generally differs from
    ``kldiv(b, a)``.

    Entries whose count is not positive are ignored. They carry no probability
    mass, and would otherwise force epsilon to 0 and end in a division by zero
    or a ``math.log`` domain error. Such entries appear easily when the counts
    come from a ``defaultdict(int)``, where reading a missing key inserts 0.

    Side effects:
        Prints the vocabulary sizes, count totals, number of absent terms,
        epsilon and gamma to stdout.

    Args:
        _s: Mapping term -> count for the source distribution.
        _t: Mapping term -> count for the target distribution.

    Returns:
        The score as a float, or the sentinel ``1e33`` when either mapping has
        no term with a positive count.

    Raises:
        SystemExit: With code 2 if either normalised distribution does not sum
            to 1 within a tolerance of 9e-6.
    """
    src = {term: count for term, count in _s.items() if count > 0}
    tgt = {term: count for term, count in _t.items() if count > 0}
    if not src or not tgt:
        return 1e33

    ssum = float(sum(src.values()))
    tsum = float(sum(tgt.values()))
    # Terms of the source that the target never saw; each one is backed off to epsilon.
    lenvocabdiff = len(src.keys() - tgt.keys())

    print(f"_s: {len(src)}, sum: {ssum}")
    print(f"_t: {len(tgt)}, sum: {tsum}")
    print("%s" % lenvocabdiff)

    # epsilon: back-off probability, far below the rarest observed term.
    epsilon = min(min(src.values()) / ssum, min(tgt.values()) / tsum) * 0.001
    print("epsilon: %s" % epsilon)

    # gamma: scales the observed target probabilities to make room for the epsilon mass.
    gamma = 1 - lenvocabdiff * epsilon
    print("gamma: %s" % gamma)

    # Sanity check: each normalised distribution must sum to 1 (within tolerance).
    tolerance = 9e-6
    sc = sum(count / ssum for count in src.values())
    st = sum(count / tsum for count in tgt.values())

    if abs(sc - 1.0) > tolerance:
        print("Sum P: %e, Sum Q: %e" % (sc, st))
        print("*** ERROR: sc does not sum up to 1. Bailing out ..")
        sys.exit(2)
    if abs(st - 1.0) > tolerance:
        print("Sum P: %e, Sum Q: %e" % (sc, st))
        print("*** ERROR: st does not sum up to 1. Bailing out ..")
        sys.exit(2)

    div = 0.
    for term, count in src.items():
        pts = count / ssum
        if term in tgt:
            ptt = gamma * (tgt[term] / tsum)
        else:
            ptt = epsilon
        div += (pts - ptt) * math.log(pts / ptt)
    return div

#end of kldiv

# Sample passages. d1 is about BibTeX, d2 is about \left/\right delimiters,
# and d3 joins the opening of d1 to the closing sentences of d2.
d1 = r"""Many research publications want you to use BibTeX, which better
organizes the whole process. Suppose for concreteness your source
file is x.tex. Basically, you create a file x.bib containing the
bibliography, and run bibtex on that file."""

d2 = r"""In this case you must supply both a \left and a \right because the
delimiter height are made to match whatever is contained between the two commands.
But, the \left doesn't have to be an actual 'left
delimiter', that is you can use '\left)' if there were some reason
to do it."""

d3 = r"""Many research publications want you to use BibTeX, which better
organizes the whole process. Suppose for concreteness your source
file is x.tex.But, the \left doesn't have to be an actual 'left
delimiter', that is you can use '\left)' if there were some reason
to do it."""


# Score each (source, target) pair in order. kldiv prints its own diagnostics
# first, so every labelled result line appears after them.
for label, source_doc, target_doc in (
    ("KL-divergence between d1 and d2:", d1, d2),
    ("KL-divergence between d2 and d1:", d2, d1),
    ("KL-divergence between d1 and d3:", d1, d3),
    ("KL-divergence between d2 and d3:", d2, d3),
):
    score = kldiv(tokenize(source_doc), tokenize(target_doc))
    print(label, score)

_s: 25, sum: 29.0
_t: 30, sum: 35.0
23
epsilon: 2.857142857142857e-05
gamma: 0.9993428571428571
KL-divergence between d1 and d2: 6.52185430963571
_s: 30, sum: 35.0
_t: 25, sum: 29.0
28
epsilon: 2.857142857142857e-05
gamma: 0.9992
KL-divergence between d2 and d1: 6.511423630945803
_s: 25, sum: 29.0
_t: 31, sum: 35.0
7
epsilon: 2.857142857142857e-05
gamma: 0.9998
KL-divergence between d1 and d3: 1.872591587570143
_s: 30, sum: 35.0
_t: 31, sum: 35.0
15
epsilon: 2.857142857142857e-05
gamma: 0.9995714285714286
KL-divergence between d2 and d3: 3.005348407337924


In [19]:
# Short sample sentences. d1 and d2 follow the same sentence pattern with
# several words swapped; d3 is an unrelated sentence.
d1 = r"john fell down harry fell as-well down by the stream the sun shone before it went down mary was fine"
d2 = r"bill fell down jeff fell too down by the river the sun shone until it sunk down belinda was ill"
d3 = r"Clyde/Gucci (my cat) climbed the old oak tree, its green eyes sparkling in the sunlight as he was fearless of the height"

# Score each (source, target) pair in order. kldiv prints its own diagnostics
# first, so every labelled result line appears after them.
for label, source_doc, target_doc in (
    ("KL-divergence between d1 and d2:", d1, d2),
    ("KL-divergence between d2 and d1:", d2, d1),
    ("KL-divergence between d1 and d3:", d1, d3),
    ("KL-divergence between d2 and d3:", d2, d3),
):
    score = kldiv(tokenize(source_doc), tokenize(target_doc))
    print(label, score)

_s: 14, sum: 17.0
_t: 13, sum: 16.0
8
epsilon: 5.882352941176471e-05
gamma: 0.9995294117647059
KL-divergence between d1 and d2: 3.2494321222681566
_s: 13, sum: 16.0
_t: 14, sum: 17.0
7
epsilon: 5.882352941176471e-05
gamma: 0.9995882352941177
KL-divergence between d2 and d1: 3.0478297683519773
_s: 14, sum: 17.0
_t: 18, sum: 18.0
13
epsilon: 5.555555555555555e-05
gamma: 0.9992777777777778
KL-divergence between d1 and d3: 6.825694089312702
_s: 13, sum: 16.0
_t: 18, sum: 18.0
12
epsilon: 5.555555555555555e-05
gamma: 0.9993333333333333
KL-divergence between d2 and d3: 6.875119093221768
