In [1]:
import Utils.utils as ut
import Utils.models as lm

In [2]:
# Load the vocabulary from "vocab.txt". Later cells hand `vocab` to the
# language-model constructors and to ut.preprocess, and iterate over it
# word by word when writing the ARPA file.
vocab = ut.loadVocab("vocab.txt")

In [3]:
# Load the training data from "corpus.txt".
# NOTE(review): despite its name, `dataDir` appears to hold a file name rather
# than a directory -- confirm against ut.loadData.
dataDir = "corpus.txt"
train_data = ut.loadData(dataDir)

In [4]:
# Preprocess the training data against the vocabulary. The exact transformation
# lives in ut.preprocess (not visible here); the result is what all three
# language models are trained on.
preprocessed_train_data = ut.preprocess(train_data, vocab)

In [5]:
# Instantiate one language model per n-gram order (1, 2 and 3); all three
# share the same vocabulary.
unigramLM, bigramLM, trigramLM = (
    lm.UnigramLM(vocab),
    lm.BigramLM(vocab),
    lm.TrigramLM(vocab),
)

In [6]:
# Fit every model on the same preprocessed corpus, lowest order first.
for ngram_model in (unigramLM, bigramLM, trigramLM):
    ngram_model.train(preprocessed_train_data)

In [11]:
# Export the three language models to "LM.3bo" in ARPA back-off format (same
# layout as the corpus.3bo file): a \data\ header with the n-gram counts, then
# the \1-grams:, \2-grams: and \3-grams: sections, then \end\.
#
# The sections are generated from the highest order down, because each pass
# discovers which lower-order contexts need a back-off weight on their line.
# Every entry line has the form:  <probability> TAB <n-gram> [TAB <back-off weight>]

# --- 3-grams -----------------------------------------------------------------
# `bigrams` collects every (word1, word2) context for which at least one
# trigram has to back off. It is a set because the same context is hit once per
# backed-off word3 and is only ever used for membership tests below.
bigrams = set()
trigram_lines = []
for word1 in vocab:
    for word2 in vocab:
        for word3 in vocab:
            if trigramLM.checkBackOff(word1, word2, word3):
                bigrams.add((word1, word2))
            else:
                trigram_lines.append(
                    str(trigramLM.calcProbability(word1, word2, word3))
                    + "\t" + word1 + " " + word2 + " " + word3 + "\n"
                )
tri_grams_number = len(trigram_lines)
trigram_section = "\\3-grams:\n" + "".join(trigram_lines)

# --- 2-grams -----------------------------------------------------------------
# `unigrams` collects every word1 for which at least one bigram has to back
# off; those unigram lines get a back-off weight below.
unigrams = set()
bigram_lines = []
for word1 in vocab:
    for word2 in vocab:
        if bigramLM.checkBackOff(word1, word2):
            unigrams.add(word1)
            continue
        line = str(bigramLM.calcProbability(word1, word2)) + "\t" + word1 + " " + word2
        if (word1, word2) in bigrams:
            # This bigram is the context of a backed-off trigram: append its weight.
            line += "\t" + str(ut.backOffWeight_bi_context((word1, word2), bigramLM, trigramLM))
        bigram_lines.append(line + "\n")
bi_grams_number = len(bigram_lines)
bigram_section = "\\2-grams:\n" + "".join(bigram_lines)

# --- 1-grams -----------------------------------------------------------------
unigram_lines = []
for word in vocab:
    if unigramLM.checkBackOff(word):
        continue
    # The entry must name `word` itself. The previous version wrote `word1`,
    # the stale loop variable left over from the bigram loop, so every line
    # carried the last vocabulary word.
    line = str(unigramLM.calcProbability(word)) + "\t" + word
    if word in unigrams:
        # NOTE(review): the models are passed as (bigramLM, unigramLM) here but
        # as (bigramLM, trigramLM) for the bigram helper -- confirm the argument
        # order expected by ut.backOffWeight_uni_context.
        line += "\t" + str(ut.backOffWeight_uni_context(word, bigramLM, unigramLM))
    unigram_lines.append(line + "\n")
uni_grams_number = len(unigram_lines)
unigram_section = "\\1-grams:\n" + "".join(unigram_lines)

# --- \data\ header and \end\ marker ------------------------------------------
data_section = (
    "\\data\\\n"
    + "ngram 1=" + str(uni_grams_number) + "\n"
    + "ngram 2=" + str(bi_grams_number) + "\n"
    + "ngram 3=" + str(tri_grams_number) + "\n"
)
end_section = "\\end\\"

# --- write the file: sections separated by one blank line --------------------
with open("LM.3bo", 'w') as f:
    f.write(
        "\n" + data_section
        + "\n" + unigram_section
        + "\n" + bigram_section
        + "\n" + trigram_section
        + "\n" + end_section
    )


