In [5]:
import nltk
from nltk.corpus import brown

# Gather the id of every file in the Brown corpus and request the sentences
# of all of them in one call.
fieldids = brown.fileids()
training_corpus = brown.sents(fileids=fieldids)


In [12]:
# Padding tokens added around every sentence before n-grams are counted.
START_SYMBOL = '<s>'
STOP_SYMBOL = '</s>'


def calc_probabilities(training_corpus):
    """Compute unsmoothed relative-frequency log2-probabilities of 1/2/3-grams.

    Args:
        training_corpus: iterable of sentences, each sentence being a sequence
            of tokens. It may be a one-shot iterator; it is traversed exactly
            once. A final '.' token is dropped from a sentence before
            counting; any other final token is kept.

    Returns:
        A tuple ``(unigram_p, bigram_p, trigram_p)`` of plain dicts mapping an
        n-gram to its base-2 log probability:

        * ``unigram_p`` is keyed by the token itself (not a 1-tuple) and holds
          log2(C(w) / total tokens), where STOP_SYMBOL counts as a token and
          START_SYMBOL does not.
        * ``bigram_p`` is keyed by ``(w1, w2)`` and holds
          log2(C(w1, w2) / C(w1)).
        * ``trigram_p`` is keyed by ``(w1, w2, w3)`` and holds
          log2(C(w1, w2, w3) / C(w1, w2)).

        An empty corpus yields three empty dicts.
    """
    import collections
    import math

    unigram_c = collections.Counter()
    bigram_c = collections.Counter()
    trigram_c = collections.Counter()
    # Counted while iterating so that generators (no len()) work and lazy
    # corpus views are not traversed again just to learn their length.
    sentence_count = 0

    for sentence in training_corpus:
        sentence_count += 1
        tokens = list(sentence)
        # Drop only a genuine sentence-final period; a trailing '?', '!' or
        # quote is a real token and must stay in the counts.
        if tokens and tokens[-1] == '.':
            tokens.pop()
        tokens.append(STOP_SYMBOL)

        # Unigrams: the sentence tokens plus the stop symbol (no start symbol).
        unigram_c.update(tokens)

        # Two start symbols give the first real token a full trigram history.
        padded = [START_SYMBOL, START_SYMBOL] + tokens
        # Bigrams use a single start symbol, hence the offset by one.
        bigram_c.update(zip(padded[1:], padded[2:]))
        trigram_c.update(zip(padded, padded[1:], padded[2:]))

    unigrams_len = sum(unigram_c.values())
    unigram_p = {k: math.log(float(v) / unigrams_len, 2) for k, v in unigram_c.items()}

    # P(w2 | w1) = C(w1, w2) / C(w1). The start symbol is never counted as a
    # unigram, so its history count is the number of sentences.
    unigram_c[START_SYMBOL] = sentence_count
    bigram_p = {k: math.log(float(v) / unigram_c[k[0]], 2) for k, v in bigram_c.items()}

    # P(w3 | w1, w2) = C(w1, w2, w3) / C(w1, w2). The (start, start) history
    # occurs once per sentence but is never produced as a bigram above.
    bigram_c[(START_SYMBOL, START_SYMBOL)] = sentence_count
    trigram_p = {k: math.log(float(v) / bigram_c[k[:2]], 2) for k, v in trigram_c.items()}
    return unigram_p, bigram_p, trigram_p


In [24]:
from itertools import islice
# Preview: the first ten (bigram, log2-probability) pairs. Index 1 of the
# returned tuple is the bigram table.
n_items = [
    entry
    for entry in islice(calc_probabilities(training_corpus)[1].items(), 10)
]
print(n_items)

[(('<s>', 'The'), -3.1315177287659792), (('The', 'Fulton'), -12.82515755470108), (('Fulton', 'County'), -1.5025003405291832), (('County', 'Grand'), -6.409390936137703), (('Grand', 'Jury'), -3.1699250014423126), (('Jury', 'said'), -2.0), (('said', 'Friday'), -8.923327485419193), (('Friday', 'an'), -5.906890595608519), (('an', 'investigation'), -8.98299357469431), (('investigation', 'of'), -1.5193741590935794)]
