In [1]:
"""Calculate information-theoretic measures of distributional
similarity based on word frequencies in two texts
"""

import collections
import math



def get_counts(word_list):
    """Tally how many times each token occurs in `word_list`.

    Returns a `collections.Counter` mapping each distinct token to its
    number of occurrences.
    """
    tally = collections.Counter()
    tally.update(word_list)
    return tally


def create_prob_dist(count_dict):
    """Normalise a mapping of counts into relative frequencies.

    Every count is divided by the sum of all counts, producing a new
    dictionary with the same keys whose values sum to 1.
    """
    denominator = sum(count_dict.values())
    probs = {}
    for token, count in count_dict.items():
        probs[token] = count / denominator
    return probs


def count_smoothing(freq_dist, vocabulary, alpha=1):
    """Apply additive (add-`alpha`) smoothing to a table of counts.

    For every word in `vocabulary` the result holds the observed count
    from `freq_dist` (0 if the word was never observed) plus `alpha`, so
    that Count(x) > 0 for all vocabulary words when `alpha` > 0.  Words
    present in `freq_dist` but absent from `vocabulary` are not included
    in the result.
    """
    smoothed = {}
    for word in vocabulary:
        smoothed[word] = freq_dist.get(word, 0) + alpha
    return smoothed


def entropy(p):
    """Calculate entropy H(p), in nats, for a probability distribution
    represented as a mapping (dictionary) from word tokens to probabilities.

    H(p) = -sum_x p(x) * ln p(x)

    Entries with probability exactly 0 contribute nothing to the sum
    (the usual convention 0 * ln 0 = 0), so unsmoothed distributions
    containing zeros are accepted.  Returns 0 for an empty mapping.
    """
    h = 0

    for prob in p.values():
        # math.log(0) raises ValueError; the limit of x*ln(x) as x -> 0+ is 0,
        # so zero-probability events are skipped.
        if prob == 0:
            continue
        h -= prob * math.log(prob)

    return h


def cross_entropy(p1, p2):
    """Calculate cross-entropy H(p1, p2), in nats, for two probability
    distributions represented as mappings (dictionaries) from word tokens
    to probabilities.

    H(p1, p2) = -sum_x p1(x) * ln p2(x)

    Events with p1(x) == 0 contribute nothing to the sum.  If p2 assigns
    zero probability to (or has no entry for) an event with p1(x) != 0,
    the cross-entropy is infinite and `math.inf` is returned.
    """
    xh = 0

    for x, prob in p1.items():
        # Zero-mass events add nothing, regardless of what p2 says about them.
        if prob == 0:
            continue
        q = p2.get(x, 0)
        # p1 supports an event that p2 rules out: -ln(0) diverges.
        if q == 0:
            return math.inf
        xh -= prob * math.log(q)

    return xh


def kl_divergence(p1, p2):
    """Calculate Kullback-Leibler divergence D_{KL}(p1||p2), in nats, for
    two probability distributions represented as mappings (dictionaries)
    from word tokens to probabilities.

    Computed through the identity D_{KL}(p1||p2) = H(p1, p2) - H(p1).
    """
    return cross_entropy(p1, p2) - entropy(p1)




In [2]:
# Read cc-tokens.txt, lower-case its contents and split into one entry per line.
with open('cc-tokens.txt') as token_file:
    cc_tokens = token_file.read().lower().splitlines()

In [2]:
# Read biology-tokens.txt, lower-case its contents and split into one entry per line.
with open('biology-tokens.txt') as token_file:
    biology_tokens = token_file.read().lower().splitlines()

In [3]:
# Sanity check: show the first ten entries of the biology token list.
print(biology_tokens[:10])

['in', 'spite', 'of', 'the', 'morphological', 'and', 'developmental', 'differences', 'between', 'vertebrate']


In [4]:
# biology
# Raw token counts for cc_tokens (the text every subject is compared against)
# and for the biology text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(biology_tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

1.3953291406940274
1.3917604586536854


In [4]:
# chemistry
# Load into `tokens` rather than `biology_tokens`: reusing that name would
# silently replace the biology token list with chemistry data, so any later
# re-run of the biology comparison would use the wrong text.
with open('chemistry-tokens.txt') as f:
    tokens = f.read().lower().splitlines()

# Raw token counts for cc_tokens and for the chemistry text.
ct_a = get_counts(cc_tokens)
ct_b = get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a.keys()) | set(ct_b.keys())
ct_a = count_smoothing(ct_a, vocab)
ct_b = count_smoothing(ct_b, vocab)

p_a = create_prob_dist(ct_a)
p_b = create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

1.4892076798817788
1.370622024644197


In [5]:
# computer science
with open('computer-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the computer science text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

1.100465612965392
1.026490990467484


In [3]:
# economics
with open('economics-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the economics text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.7843515970190342
0.6954866376363178


In [3]:
# engineering
with open('engineering-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the engineering text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.9704102935975527
0.8744780093122486


In [4]:
# philosophy
with open('philosophy-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the philosophy text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.8798484125993156
1.009452987475873


In [5]:
# psychology
with open('psychology-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the psychology text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.8602824444201698
0.8758825158417558


In [6]:
# sociology
with open('sociology-tokens.txt') as token_file:
    tokens = token_file.read().lower().splitlines()

# Raw token counts for cc_tokens and for the sociology text.
ct_a, ct_b = get_counts(cc_tokens), get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a) | set(ct_b)
ct_a, ct_b = count_smoothing(ct_a, vocab), count_smoothing(ct_b, vocab)

p_a, p_b = create_prob_dist(ct_a), create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.5554565950380557
0.5205863947668457


In [3]:
# art
with open('art-tokens.txt') as f:
    tokens = f.read().lower().splitlines()

# Raw token counts for cc_tokens and for the art text.
ct_a = get_counts(cc_tokens)
ct_b = get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a.keys()) | set(ct_b.keys())
ct_a = count_smoothing(ct_a, vocab)
ct_b = count_smoothing(ct_b, vocab)

p_a = create_prob_dist(ct_a)
p_b = create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

0.7645831018494826
1.0654439636740385


In [4]:
# physics
with open('physics-tokens.txt') as f:
    tokens = f.read().lower().splitlines()

# Raw token counts for cc_tokens and for the physics text.
ct_a = get_counts(cc_tokens)
ct_b = get_counts(tokens)

# Smooth both count tables over the union vocabulary so that every word has
# a non-zero count in both texts.
vocab = set(ct_a.keys()) | set(ct_b.keys())
ct_a = count_smoothing(ct_a, vocab)
ct_b = count_smoothing(ct_b, vocab)

p_a = create_prob_dist(ct_a)
p_b = create_prob_dist(ct_b)

# KL divergence is not symmetric, so both directions are printed.
kl_ab = kl_divergence(p_a, p_b)
print(kl_ab)

kl_ba = kl_divergence(p_b, p_a)
print(kl_ba)

1.5620148386710095
1.39688622264058
