N-Gram Models and their probabilities

A first step in making use of n-gram models is using them to estimate the probabilities of certain events.

In order to make use of these models we will first want to count the words in a document.

In [None]:
import nltk.data
from os import listdir
from os.path import isfile, join
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
# Pre-trained Punkt model, used below to split raw text into sentences.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Treebank tokenizer, used below to split each sentence into word tokens.
treebank_tokenizer = TreebankWordTokenizer()


# Directory whose files make up the corpus.
# NOTE(review): hard-coded absolute path from one machine -- adjust locally.
dir_base = "/Users/teacher/repos/f20_ds_nlp/classes/week_2/data"




def read_file(filename):
    """Return the full contents of ``filename`` decoded as UTF-8.

    The file is opened inside a ``with`` block so the handle is closed
    as soon as reading finishes, even if ``read()`` raises.
    """
    with open(filename, encoding='utf-8') as input_file:
        return input_file.read()

    
def read_directory_files(directory):
    """Return the concatenated text of every regular file in ``directory``.

    Only entries directly inside ``directory`` are considered, and anything
    that is not a regular file (e.g. a sub-directory) is skipped. Files are
    read with ``read_file`` in the order ``listdir`` reports them.

    NOTE(review): the texts are joined with no separator, so the end of one
    file runs straight into the start of the next.
    """
    # Build each full path once instead of joining it for both the
    # isfile() check and the read.
    paths = (join(directory, name) for name in listdir(directory))
    # Join the pieces in one pass rather than growing a string with +=.
    return "".join(read_file(path) for path in paths if isfile(path))
    
# Read every file under dir_base into one string and print it for inspection.
text_corpus = read_directory_files(dir_base)
print(text_corpus)

So now we have a single text variable that represents all the text in a corpus.

Can we get counts and other statistics from this?

In [None]:
# nltk.bigrams returns a one-shot generator. Materialize it as a list so that
# building the FreqDist does not exhaust it and `bigrams` can be iterated again.
# NOTE: text_corpus is a plain string, so each bigram is a pair of characters.
bigrams = list(nltk.bigrams(text_corpus))
freq_bi = nltk.FreqDist(bigrams)

In [None]:
# Print each bigram on its own line.
# NOTE: if `bigrams` is a generator that has already been consumed (for
# example by passing it to FreqDist), this loop prints nothing.
for i in bigrams:
    print(i)

In [None]:
# Plot the counts of the 10 most frequent bigrams.
freq_bi.plot(10)

What's the problem with this data?

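The bigrams are pairs of characters only: `text_corpus` is a single string, so `nltk.bigrams` pairs up adjacent characters rather than adjacent words. We need to tokenize the text into words first.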

In [None]:
# Split the corpus into sentences, then split every sentence into word tokens.
punkt_sentences = sentence_tokenizer.tokenize(text_corpus)
sentences_words = list(map(treebank_tokenizer.tokenize, punkt_sentences))

# Flatten the per-sentence token lists into one corpus-wide token list.
all_tokens = [token for tokens in sentences_words for token in tokens]

# Count word-level bigrams over the flattened token list.
bigrams = nltk.bigrams(all_tokens)
freq_bi = nltk.FreqDist(bigrams)

In [None]:
# Show the 20 most frequent bigrams with their counts, then plot the top 10.
print(freq_bi.most_common(20))
freq_bi.plot(10)

In [None]:
# Use a set so each membership test below is a hash lookup instead of a
# linear scan of the stop-word list for every token.
stop_words = set(nltk.corpus.stopwords.words('english'))
# Keep only the tokens whose lowercase form is not an English stop word.
content = [w for w in all_tokens if w.lower() not in stop_words]
# Re-count bigrams over the filtered tokens.
bigrams = nltk.bigrams(content)
freq_bi = nltk.FreqDist(bigrams)

In [None]:
# Show the 20 most frequent bigrams with their counts, then plot the top 10.
print(freq_bi.most_common(20))
freq_bi.plot(10)

What other things might we remove?

Numbers? Punctuation?

In [None]:
# Bigrams that occur exactly once.
print(freq_bi.hapaxes())
# Total number of bigram occurrences counted.
print(freq_bi.N())
# Relative frequency (count divided by N) of one specific bigram.
print(freq_bi.freq(('financial','condition')))

In [None]:
# Maximum-likelihood estimate: a bigram's probability is its relative frequency.
MLE_Dist = nltk.MLEProbDist(freq_bi)
# Show the most probable bigram. A bare expression is only displayed when it
# is the last line of a notebook cell, so print it explicitly.
print(MLE_Dist.max())
# Probability and base-2 log probability of a specific bigram.
print(MLE_Dist.prob(('Air', 'Force')))
print(MLE_Dist.logprob(('Air', 'Force')) )

But what if we use words that we've never seen before?

In [None]:
# Query a bigram that presumably never occurs in the corpus. Under MLE an
# unseen sample has probability 0, so its log probability is -inf.
print(MLE_Dist.prob(('Chair', 'Force')))
print(MLE_Dist.logprob(('Chair', 'Force')) )

In [None]:
# Laplace (add-one) smoothing: every count is incremented by one, so a bigram
# with a zero count should still receive a small non-zero probability.
Smoothed_dist = nltk.LaplaceProbDist(freq_bi)
print(Smoothed_dist.prob(('Chair', 'Force')))
print(Smoothed_dist.logprob(('Chair', 'Force')) )

In [None]:
# Draw one bigram at random according to the smoothed distribution.
print(Smoothed_dist.generate())

In [None]:
# Two sentences to score against the smoothed bigram model.
# NOTE(review): the first presumably resembles the corpus domain and the
# second does not -- confirm against the data.
test_sentence_1 = "The quantity decrease of the production units and the removal of funds occurred because the decided to use procurement funds to develop a higher priority air superiority program."
test_sentence_2 = "It is like totally rad that we were able to go out and see a bus with all the new doors on it."

def get_sentence_bigrams(sentence):
    """Tokenize ``sentence`` and pair up its adjacent tokens.

    Returns a tuple ``(bigrams, word_count)``: ``bigrams`` is the iterator
    produced by ``nltk.bigrams`` over the Treebank tokens of ``sentence``,
    and ``word_count`` is the number of those tokens.
    """
    tokens = treebank_tokenizer.tokenize(sentence)
    return nltk.bigrams(tokens), len(tokens)
    
def estimate_sentence_probability(bigram_sentence, word_length):
    """Return the average log probability per word for a sentence.

    Adds up ``Smoothed_dist.logprob`` for every bigram in
    ``bigram_sentence`` and divides the total by ``word_length``.
    """
    total_logprob = 0
    for pair in bigram_sentence:
        total_logprob = total_logprob + Smoothed_dist.logprob(pair)

    per_word = total_logprob / word_length
    return per_word


# Score the first test sentence: average log probability per token.
bigram_sentence,word_count = get_sentence_bigrams(test_sentence_1)
estimate_probability = estimate_sentence_probability(bigram_sentence, word_count)
print(estimate_probability)

# Score the second test sentence the same way (word_count is reused).
bigram_sentence_2,word_count = get_sentence_bigrams(test_sentence_2)
estimate_probability_2 = estimate_sentence_probability(bigram_sentence_2, word_count)
print(estimate_probability_2)



Do these numbers look right?

In [None]:
def get_better_sentence_bigrams(sentence):
    """Tokenize ``sentence``, drop English stop words, and pair up the rest.

    Returns a tuple ``(bigrams, word_count)``: ``bigrams`` is the iterator
    produced by ``nltk.bigrams`` over the remaining tokens, and
    ``word_count`` is the number of remaining tokens.
    """
    sentence_words = treebank_tokenizer.tokenize(sentence)
    # A set makes each stop-word check a hash lookup rather than a list scan.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    content = [w for w in sentence_words if w.lower() not in stop_words]
    # Count the tokens the bigrams are actually built from. Counting the
    # unfiltered tokens would divide the summed log probability by words
    # that contribute no bigrams.
    word_count = len(content)
    return nltk.bigrams(content), word_count
    

# Re-score both test sentences using bigrams built after stop-word removal.
bigram_sentence, word_count = get_better_sentence_bigrams(test_sentence_1)
estimate_probability = estimate_sentence_probability(bigram_sentence, word_count)
print(estimate_probability)

# Same variables are reused for the second sentence.
bigram_sentence, word_count = get_better_sentence_bigrams(test_sentence_2)
estimate_probability = estimate_sentence_probability(bigram_sentence, word_count)
print(estimate_probability)