<a href="https://colab.research.google.com/github/TasnubaS/Random-Solutions/blob/master/language_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Statistical Language Modeling

In [None]:
import urllib.request
import zipfile
from os.path import isfile

# Fetch and unpack the corpus only when it is not already on disk, so that
# re-running this cell does not download it again.
if not isfile("lang-model.txt"):
    url = "https://yangfengji.net/uva-nlp-course/data/lang-model.txt.zip"
    print("Downloading ...")
    filename, headers = urllib.request.urlretrieve(url, filename="lang-model.txt.zip")

    print("Decompressing the file ...")
    # Use the standard library instead of the IPython-only `!unzip` shell
    # escape: this is plain Python and does not need an `unzip` binary.
    with zipfile.ZipFile(filename) as archive:
        archive.extractall()

# Read the corpus inside a `with` block so the file handle is closed promptly.
# Note: splitting on "\n" yields a trailing empty string when the file ends
# with a newline, and that entry is included in the count printed below.
with open("lang-model.txt") as corpus_file:
    sents = corpus_file.read().split("\n")
print("Read {} sentences".format(len(sents)))

In [None]:
from collections import defaultdict
from math import log2, pow
from numpy.random import choice

In [None]:
class BigramLM(object):
    """ Unsmoothed bigram language model estimated by relative frequency.

    Typical use: create an instance, call ``build`` once on a corpus file,
    then call ``eval`` to score text or ``generate`` to sample from the model.
    """

    def __init__(self):
        # Token -> integer id. The two sentence-boundary markers are always present.
        self.vocab = {"<start>":0, "<end>":1}
        # prev_tok -> {curr_tok: P(curr_tok | prev_tok)} once ``build`` has run.
        # Each inner dict also stores, under ``self.tok_counter``, how many
        # times prev_tok was observed as a context.
        self.model = {}
        # Reserved key for the per-context total inside ``self.model``; a corpus
        # token spelled exactly like this would collide with it.
        self.tok_counter = '__total__'

    def build(self, fname):
        """ Build a Bigram LM from a text file.

        Every line is split on whitespace and each adjacent token pair is
        counted; the counts are then normalized in place into conditional
        probabilities. Because normalization overwrites the counts, this is
        meant to be called once on a fresh instance.

        Args:
            fname: path of the training file, one whitespace-tokenized
                sentence per line.
        """
        # `with` guarantees the corpus file is closed, even if parsing fails.
        with open(fname) as fin:
            for line in fin:
                tokens = line.strip().split()
                # Register every token, including the first one of the line,
                # so that a token seen in training is never scored as unknown.
                for tok in tokens:
                    if tok not in self.vocab:
                        self.vocab[tok] = len(self.vocab)
                for prev_tok, curr_tok in zip(tokens, tokens[1:]):
                    if prev_tok not in self.model:
                        self.model[prev_tok] = defaultdict(float)
                    successors = self.model[prev_tok]
                    successors[curr_tok] += 1.0
                    successors[self.tok_counter] += 1.0
        # Normalization: turn counts into P(curr_tok | prev_tok).
        for successors in self.model.values():
            total = successors[self.tok_counter]
            for tok in successors:
                if tok != self.tok_counter: # to avoid normalizing the counter token
                    successors[tok] /= total
        print("Done with modeling building\nVocab size = {}".format(len(self.vocab)))

    def eval(self, text):
        """ Evaluate a given text and return its perplexity.

        Tokens that are not in the vocabulary are mapped to 'UNK'. A bigram
        with no probability mass in the model (including one whose context was
        never observed) is printed and contributes a fixed log2-probability of
        -100 instead of minus infinity. The model is not modified.

        Args:
            text: whitespace-tokenized string to score.

        Returns:
            The perplexity 2 ** (-logprob / number_of_bigrams) as a float.

        Raises:
            ValueError: if ``text`` has fewer than two tokens, since there is
                then no bigram to score.
        """
        tokens = text.strip().split()
        L = len(tokens)
        if L < 2:
            raise ValueError(
                "Need at least two tokens to compute a bigram perplexity, got {}".format(L))
        logprob = 0.0
        for prev_tok, curr_tok in zip(tokens, tokens[1:]):
            if prev_tok not in self.vocab:
                prev_tok = 'UNK'
            if curr_tok not in self.vocab:
                curr_tok = 'UNK'
            # Use .get() at both levels: indexing would raise KeyError for an
            # unseen context and would insert a 0.0 entry into the defaultdict
            # for an unseen successor.
            prob = self.model.get(prev_tok, {}).get(curr_tok, 0.0)
            if prob > 0.0:
                logprob += log2(prob)
            else:
                print("{} -> {}".format(prev_tok, curr_tok))
                logprob += -100 # A large number, technically this should be infty
        # Compute PPLx
        pplx = pow(2, -1*logprob/(L-1))
        return pplx

    def generate(self, method="random", length=20):
        """ Randomly sample words from this model for generation.

        Sampling starts from "<start>" and stops when "<end>" is drawn, when
        the text holds more than ``length`` tokens ("<start>" included), or
        when the current token has no recorded successor.

        Args:
            method: accepted for interface compatibility; it is currently
                unused and sampling is always random.
            length: bound on the generated length (see the stop rule above).

        Returns:
            The list of generated tokens, beginning with "<start>".
        """
        text = []
        prev_tok = "<start>"
        text.append(prev_tok)
        while (prev_tok != "<end>") and (len(text) <= length):
            successors = self.model.get(prev_tok)
            if not successors:
                # Dead end: this token was never seen as a context.
                break
            tokens, probs = [], []
            # The following for loop is time-consuming
            # For large-scale text generation, a pre-processing may be necessary
            for (tok, prob) in successors.items():
                if tok != self.tok_counter:
                    tokens.append(tok)
                    probs.append(prob)
            if not tokens:
                break
            widx = choice(len(probs), 1, p=probs)[0]
            prev_tok = tokens[widx]
            text.append(prev_tok)
        return text

In [None]:
# Estimate the bigram model from the "lang-model.txt" corpus.
bigram = BigramLM()
bigram.build(fname="lang-model.txt")

# Score one hand-written sentence and report its perplexity to four decimals.
text = "<start> MY notes on deep learning for nlp <end>"
pplx = bigram.eval(text=text)
print("Text = {}\nPPLx = {:.4f}".format(text, pplx))

In [None]:
# Draw one random sample from the trained model (the arguments spell out the
# method's defaults) and print it as a space-separated string.
text = bigram.generate(method="random", length=20)
print("Generated text = {}".format(" ".join(text)))