In [33]:
DATASETS = ['brown', 'gutenberg']


In [41]:
import random
import nltk
from nltk.corpus import brown

fieldids = nltk.corpus.brown.fileids()
#print(fieldids)
corpus = list(brown.sents(fieldids))
print((corpus)[:100])
random.shuffle(corpus)

training_corpus = corpus[:60]
dev_corpus = corpus[:20]
test_corpus = corpus[:20]



In [42]:
START = '<s>'
STOP = '</s>'
DEV = False

def get_model(training_corpus):
    '''returns a tuple of dict objects (unigrams, bigrams, trigrams) that map from n-grams to counts'''
    import collections
    unigram_c = collections.defaultdict(int)
    bigram_c = collections.defaultdict(int)
    trigram_c = collections.defaultdict(int)


    for sentence in training_corpus:
        tokens0 = sentence[:-1]             #removing last '.'
        tokens1 = tokens0 + [STOP]
        tokens2 = [START] + tokens0 + [STOP]
        tokens3 = [START] + [START] + tokens0 + [STOP]
        # unigrams
        for unigram in tokens1:
            unigram_c[unigram] += 1  #unigram_c is unigram count

        # bigrams
        for bigram in nltk.bigrams(tokens2):
            bigram_c[bigram] += 1

        # trigrams
        for trigram in nltk.trigrams(tokens3):
            trigram_c[trigram] += 1
    
    unigram_c[START] = len(training_corpus)
    bigram_c[(START, START)] = len(training_corpus)
    model = (unigram_c,bigram_c,trigram_c)
    return model 
    



In [43]:
def eval_model(test_corpus, model, log_prob_func):
    '''Returns the perplexity of the model on a specified test set.'''

    log_prob_sum = 0
    word_count = 0

    for sentence in test_corpus:
        prob = eval_sentence(sentence, model, log_prob_func)
        log_prob_sum += prob
        word_count += len(sentence)
        
    average_log_prob = log_prob_sum / word_count
    perplexity = 2**(-average_log_prob)
    return perplexity


def eval_sentence(sentence, model, log_prob_func):
    '''Returns log probability of a sentence and how many tokens were in the sentence.'''

    tokens0 = sentence[:-1]             #removing last '.'
    tokens1 = tokens0 + [STOP]
    tokens2 = [START] + tokens0 + [STOP]
    tokens3 = [START] + [START] + tokens0 + [STOP]
    
    log_prob_sum = 0
    # trigrams
    for n_gram in nltk.trigrams(tokens3):
        next_prob = log_prob_func(n_gram, model)
        log_prob_sum += next_prob

    return log_prob_sum


In [44]:
def main():
    
    # returns the log probability of a specified n-gram
    def get_log_prob(n_gram, model):
        import math
         # tri-gram part
        tri_numer = model[2][n_gram]
        tri_denom = model[1][n_gram[1:]]
        trigram_part = 0
        if tri_denom != 0:
            trigram_part = LAMBDA_1 * tri_numer / tri_denom

     # bi-gram part
        bi_numer = model[1][n_gram[1:]]
        bi_denom = model[0][n_gram[2:]]
        bigram_part = 0
        if bi_denom != 0:
            bigram_part = LAMBDA_2 * bi_numer / bi_denom
            
       # uni-gram part
        uni_numer = model[0][n_gram[2:]]
        uni_denom = sum(model[0].values()) - model[0][(START)]
        unigram_part = 0
        if uni_denom != 0:
            unigram_part = LAMBDA_3 * uni_numer / uni_denom
            
        prob = trigram_part + bigram_part + unigram_part
        log_prob = math.log(prob, 2)
        return log_prob 
    
    
    model= get_model(training_corpus)  
    LAMBDA_1s = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
    LAMBDA_2s = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]

    for i in LAMBDA_1s:
        for j in LAMBDA_2s:
            LAMBDA_1 = i
            LAMBDA_2 = j
            if LAMBDA_1 + LAMBDA_2 < 0.9:
                LAMBDA_3 = 1 - LAMBDA_1 - LAMBDA_2
                print('LAMBDA1: ' + str(LAMBDA_1) + ' LAMBDA2: ' + str(LAMBDA_2) + ' LAMBDA3: ' + str(LAMBDA_3))
                

                if DEV:
                    perplexity = eval_model(dev_corpus, model, get_log_prob)
                else:
                    perplexity = eval_model(test_corpus, model, get_log_prob)

                print("perplexity is = ", perplexity,"\n\n")
    
    
    
    
    
    
   
    '''
    model= get_model(training_corpus) 
    perplexity = eval_model(corpus, model, get_log_prob)
    print("perplexity on training_corpus is = ", perplexity)
    '''
    
    
    
    
    
    
#calling function main
main()


LAMBDA1: 0.1 LAMBDA2: 0.1 LAMBDA3: 0.8
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.2 LAMBDA3: 0.7
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.3 LAMBDA3: 0.6000000000000001
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.4 LAMBDA3: 0.5
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.5 LAMBDA3: 0.4
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.6 LAMBDA3: 0.30000000000000004
perplexity is =  11.030849326288966 


LAMBDA1: 0.1 LAMBDA2: 0.7 LAMBDA3: 0.20000000000000007
perplexity is =  11.030849326288966 


LAMBDA1: 0.2 LAMBDA2: 0.1 LAMBDA3: 0.7000000000000001
perplexity is =  5.515424663144477 


LAMBDA1: 0.2 LAMBDA2: 0.2 LAMBDA3: 0.6000000000000001
perplexity is =  5.515424663144477 


LAMBDA1: 0.2 LAMBDA2: 0.3 LAMBDA3: 0.5
perplexity is =  5.515424663144477 


LAMBDA1: 0.2 LAMBDA2: 0.4 LAMBDA3: 0.4
perplexity is =  5.515424663144477 


LAMBDA1: 0.2 LAMBDA2: 0.5 LAMBDA3: 0.30000000000000004
perplexity is 