In [54]:
import sys, os
_snlp_book_dir = "../../../../"
sys.path.append(_snlp_book_dir) 
import statnlpbook.lm as lm
import statnlpbook.ohhla as ohhla
import math
import collections

In [55]:
_snlp_train_dir = _snlp_book_dir + "/data/ohhla/train"
_snlp_dev_dir = _snlp_book_dir + "/data/ohhla/dev"
_snlp_train_song_words = ohhla.words(ohhla.load_all_songs(_snlp_train_dir))
_snlp_dev_song_words = ohhla.words(ohhla.load_all_songs(_snlp_dev_dir))
assert(len(_snlp_train_song_words)==1041496)

Could not load ../../../..//data/ohhla/train/www.ohhla.com/anonymous/nas/distant/tribal.nas.txt.html


In [56]:
class new_NGramLM(lm.CountLM):
    def __init__(self, train, order):
        """
        Create an NGram language model.
        Args:
            train: list of training tokens.
            order: order of the LM.
        """
        super().__init__(set(train), order)
        self._counts = collections.defaultdict(float)
        self._norm = collections.defaultdict(float)
        self._counts_history = collections.defaultdict(float)
        self._counts_word = collections.defaultdict(float)
        for i in range(self.order, len(train)):
            history = tuple(train[i - self.order + 1 : i])
            word = train[i]
            self._counts[(word,) + history] += 1.0
            self._norm[history] += 1.0
        for j in list(self._counts.keys()):
            
            self._counts_word[list(j)[0]] += 1.0
        for k in list(self._counts.keys()):
            self._counts_history[list(k)[1]] += 1.0
    def counts(self, word_and_history):
        #print(self._counts)
        #print(word_and_history, self._counts[word_and_history])
        return self._counts[word_and_history]
    def norm(self, history):
        #print(history, self._norm[history])
        return self._norm[history]
    def counts_history(self, history):
        return self._counts_history[history]
    def counts_word(self, word):
        return self._counts_word[word]
    def counts_all(self):
        return len(list(self._counts.keys()))

In [57]:
class KneseNeyLM(lm.LanguageModel):
    def __init__(self, main, backoff, discount):
        super().__init__(main.vocab, main.order)
        self.main = main
        self.backoff = backoff
        self.discount = discount
    def probability(self, word, *history):
        sub_history = tuple(history[-1:])
        word_and_history = (word,) + sub_history
        #print(word_and_history)
        history = "".join(sub_history)
        
        #if self.main.counts_history(history) == 0.0:
        if self.backoff.counts((history,)) == 0:
            
            return self.backoff.probability(word)

        else:
            
            p1 = max((self.main.counts(word_and_history) - self.discount),0)/ self.backoff.counts((history,))
            lmb = self.discount / self.backoff.counts((history,)) * self.main.counts_history(history)
            pc = self.main.counts_word(word) / self.main.counts_all()

            #print(p1+lmb*pc)

            return p1 + lmb * pc

In [76]:
oov_train = lm.inject_OOVs(_snlp_train_song_words + _snlp_dev_song_words)
bigram = new_NGramLM(oov_train, 2)
unigram = lm.NGramLM(oov_train,1)
my_lm =KneseNeyLM(bigram, unigram, 75)
oov_vocab = set(oov_train)

Create your model

In [77]:
def create_lm(vocab):
    """
    Return an instance of `lm.LanguageModel` defined over the given vocabulary.
    Args:
        vocab: the vocabulary the LM should be defined over. It is the union of the training and test words.
    Returns:
        a language model, instance of `lm.LanguageModel`.
    """
    
    
    return lm.OOVAwareLM(my_lm, vocab - oov_vocab)

In [78]:
#! SETUP 3
_snlp_test_dir = _snlp_book_dir + "/data/ohhla/dev"

In [79]:
#! SETUP 4
_snlp_test_song_words = ohhla.words(ohhla.load_all_songs(_snlp_test_dir))
_snlp_test_vocab = set(_snlp_test_song_words)
_snlp_dev_vocab = set(_snlp_dev_song_words)
_snlp_train_vocab = set(_snlp_train_song_words)
_snlp_vocab = _snlp_test_vocab | _snlp_train_vocab | _snlp_dev_vocab
_snlp_lm = create_lm(_snlp_vocab)

In [81]:
#! ASSESSMENT 1
_snlp_test_token_indices = [100, 1000, 10000]
_eps = 0.000001
for i in _snlp_test_token_indices:
    print(i)
    result = sum([_snlp_lm.probability(word, *_snlp_test_song_words[i-_snlp_lm.order+1:i]) for word in _snlp_vocab])
    print("Sum: {sum}, ~1: {approx_1}, <=1: {leq_1}".format(sum=result, 
                                                            approx_1=abs(result - 1.0) < _eps, 
                                                            leq_1=result - _eps <= 1.0))

100
Sum: 0.9999675691209431, ~1: False, <=1: True
1000
Sum: 11.937828439739745, ~1: False, <=1: False
10000
Sum: 25.809477620261173, ~1: False, <=1: False


In [80]:
lm.perplexity(_snlp_lm, _snlp_test_song_words)

33.32932582901255