<h1>Natural Language Processing: Assignment 1</h1>

In [1]:
import random
from itertools import chain, accumulate, tee
from functools import partial
import numpy as np
import nltk
from nltk.corpus.reader.util import ConcatenatedCorpusView, StreamBackedCorpusView
from nltk.corpus.europarl_raw import english as europarl_en

random.seed(93)

<h2>Sentence Loading</h2>

In [2]:
def custom_read_sent_block(stream):
    
    sentence = stream.readline().rstrip()
    if len(sentence):
        return [[word.lower() for word in sentence.split() if word.isalnum()]]
    
    return []

if hasattr(europarl_en, "_LazyCorpusLoader__load()"):
    europarl_en._LazyCorpusLoader__load()
europarl_en._read_sent_block = custom_read_sent_block

In [3]:
corpus_sents = list(europarl_en.sents())
random.shuffle(corpus_sents)

<h2>Corpus Splitting</h2>

In [4]:
def split_corpus_sents_(corpus_sents, factors = (0.08, 0.1, 0.1)):
    
    start_idx   = 0
    split_sents = []
    
    for end_idx in map(lambda x: int(x * len(corpus_sents)), accumulate((1.0 - sum(factors),) + factors)):
        split_sents.append(corpus_sents[start_idx:end_idx])
        start_idx = end_idx
        
    return split_sents

In [5]:
split_factors = (0.08, 0.1, 0.1)

training_sents, validation_sents, development_sents, test_sents = split_corpus_sents_(corpus_sents, split_factors)

num_training_sents = len(training_sents)

<h2>Text Normalization</h2>

In [6]:
def create_vocab_(sents, hapaxes_offset = 1):
    
    fdist = nltk.FreqDist(nltk.ngrams(chain.from_iterable(sents), n=1))
    
    hapaxes = set(token for token, frequency in fdist.items() if frequency < hapaxes_offset)
    num_unk = 0
    for token in hapaxes:
        num_unk += fdist.pop(token)
    fdist["*unk*"] = num_unk
    
    return fdist

In [7]:
hapaxes_offset = 10

fdists = [create_vocab_(training_sents, hapaxes_offset)]

<h2>N-Gram Calculation</h2>

In [8]:
def sent_ngrams_(sent, vocab=None, n=1):
    
    return nltk.ngrams(chain(
        ["*start*"] * (n - 1),
         sent if vocab is None else map(lambda token: token if (token,) in vocab else "*unk*", sent),
        ["*end*"]
    ), n)

In [9]:
ngram_dimensions = 3

fdists.extend(map(
    lambda sents, n: nltk.FreqDist(chain.from_iterable(map(partial(sent_ngrams_, vocab=fdists[0], n=n), sents))),
    tee(training_sents, ngram_dimensions - 1),
    range(2, ngram_dimensions + 1)
))

<h2>Language Model Training</h2>

In [10]:
def make_ngram_estimates_(sents, fdists, vocab, ngram_dimension=2):
    
    num_vocab = len(vocab)
    sent_ngrams_fn_ = partial(sent_ngrams_, vocab=vocab, n=ngram_dimension)
    estimates = {}
    
    for sent in sents:
        
        sent_ngrams = sent_ngrams_fn_(sent)
        cumulative_nom_log = cumulative_denom_log = 0
        
        for ngram in sent_ngrams:
            cumulative_nom_log   += np.log2(fdists[ngram_dimension - 1][ngram] + 1)
            cumulative_denom_log += np.log2(fdists[ngram_dimension - 2][ngram[:-1]] + num_vocab)
            
        estimates[" ".join(sent)] = cumulative_nom_log - cumulative_denom_log
        
    return estimates

In [11]:
def make_random_sents_(real_sents, vocab):
    
    vocab_tokens = [token[0] for token in vocab.keys()]
    random_sents = [random.choices(vocab_tokens, k=len(sent)) for sent in real_sents]

    return random_sents

In [28]:
def evaluate_model_(sents, fdists, vocab, estimator_fn, ngram_dimension=2, model_name="Model"):

    true_estimates = estimator_fn(sents)
    false_estimates = estimator_fn(make_random_sents_(sents, vocab))
    
    np_true_estimates  = np.fromiter(true_estimates.values(), dtype=np.float)
    np_false_estimates = np.fromiter(false_estimates.values(), dtype=np.float)
    
    crossentropy = -np_true_estimates.sum() / (sum(map(len, sents)) + len(sents) * (ngram_dimension - 1))
    
    print("Language Model: {} | Sentences Average LogProb(STD): {}({}) | Random Sentences Average LogProb(STD): {}({}) | Crossentropy: {} | Perplexity: {}".format(
        model_name, np_true_estimates.mean(), np_true_estimates.std(), np_false_estimates.mean(), np_false_estimates.std(), crossentropy, 2 ** crossentropy
    ), end="\n\n")

In [13]:
for fdist, n in zip(fdists, range(1, ngram_dimensions)):
    fdist[("*start*",) * n] = num_training_sents

ngram_estimator_fns = {n: partial(make_ngram_estimates_, fdists=fdists, vocab=fdists[0], ngram_dimension=n) for n in range(2, ngram_dimensions + 1)}
    
for n in range(2, ngram_dimensions + 1):
    evaluate_model_(training_sents, fdists, vocab=fdists[0], estimator_fn=ngram_estimator_fns[n], ngram_dimension=n, model_name="{}-Gram".format(n))

Language Model: 2-Gram | Sentences Average LogProb(STD): -198.0404684299962(127.69854734026177) | Random Sentences Average LogProb(STD): -300.2155406600317(196.27538379748566) | Crossentropy: 7.575632716676362 | Perplexity: 190.7623587795095
Language Model: 3-Gram | Sentences Average LogProb(STD): -241.1462377217555(154.62337506466056) | Random Sentences Average LogProb(STD): -300.9694897752996(195.72701371308523) | Crossentropy: 8.882389782851902 | Perplexity: 471.917147216093


<h2>Language Model Hyper-Parameterization</h2>

In [14]:
for n in range(2, ngram_dimensions + 1):
    evaluate_model_(validation_sents, fdists, estimator_fn=ngram_estimator_fns[n], vocab=fdists[0], ngram_dimension=n, model_name="{}-Gram".format(n))

Language Model: 2-Gram | Sentences Average LogProb(STD): -208.1427119957932(131.2019715604889) | Random Sentences Average LogProb(STD): -308.37189942107835(188.3389820182272) | Crossentropy: 7.79290707950603 | Perplexity: 221.7679512417197
Language Model: 3-Gram | Sentences Average LogProb(STD): -262.68157182482884(165.54647879279685) | Random Sentences Average LogProb(STD): -308.98458638330595(187.91481652326283) | Crossentropy: 9.479277306330548 | Perplexity: 713.7511260547658


<h2>Language Model Testing</h2>

In [15]:
for n in range(2, ngram_dimensions + 1):
    evaluate_model_(development_sents, fdists, estimator_fn=ngram_estimator_fns[n], vocab=fdists[0], ngram_dimension=n, model_name="{}-Gram".format(n))

Language Model: 2-Gram | Sentences Average LogProb(STD): -202.6171279498383(118.89002182250977) | Random Sentences Average LogProb(STD): -299.6441424922481(172.12018526962413) | Crossentropy: 7.803999303926824 | Perplexity: 223.47959550552298
Language Model: 3-Gram | Sentences Average LogProb(STD): -255.44767958017556(150.08991616031932) | Random Sentences Average LogProb(STD): -300.36950480624574(171.55141861822946) | Crossentropy: 9.473215163412483 | Perplexity: 710.7582666576358


<h2>Interpolation Model Evaluation</h2>

In [22]:
def make_interpolation_estimates_(sents, fdists, vocab, l=(1 / 3,) * 2):
    
    l = (1 - sum(l),) + l
    assert l[0] >= 0
    num_vocab = len(vocab)
    sent_ngrams_fn_ = partial(sent_ngrams_, vocab=vocab, n=len(fdists))
    estimates = {}
    
    for sent in sents:
        
        sent_ngrams = sent_ngrams_fn_(sent)
        sent_prop_log = 0
        
        for ngram in sent_ngrams:
            interpol_sum = l[0] * fdists[0].freq(ngram[:-1])
            for n in range(2, len(fdists) + 1):
                interpol_sum  += l[n - 1] * (fdists[n - 1][ngram[-n:]] + 1) / (fdists[n - 2][ngram[-n:-1]] + num_vocab)
            sent_prop_log += np.log2(interpol_sum)
            
        estimates[" ".join(sent)] = sent_prop_log
        
    return estimates

In [29]:
interpolation_params = (3 / 4,  1 / 5)
interpolation_estimator_fn = partial(make_interpolation_estimates_, fdists=fdists, vocab=fdists[0], l=interpolation_params)

print("Training Set Evaluation:", end="\n\n")
evaluate_model_(training_sents, fdists, vocab=fdists[0], estimator_fn=interpolation_estimator_fn, ngram_dimension=len(fdists), model_name="Interpolated {}-Gram".format(len(fdists)))
print("-" * 100)

print("Test Set Evaluation:", end="\n\n")
evaluate_model_(development_sents, fdists, vocab=fdists[0], estimator_fn=interpolation_estimator_fn, ngram_dimension=len(fdists), model_name="Interpolated {}-Gram".format(len(fdists)))

Training Set Evaluation:

Language Model: Interpolated 3-Gram | Sentences Average LogProb(STD): -203.12852404611402(130.76272578584627) | Random Sentences Average LogProb(STD): -302.106310429056(197.3296582135732) | Crossentropy: 7.482043857034284 | Perplexity: 178.78028688303735

----------------------------------------------------------------------------------------------------
Test Set Evaluation:

Language Model: Interpolated 3-Gram | Sentences Average LogProb(STD): -208.37304546918386(122.09237477754071) | Random Sentences Average LogProb(STD): -301.5565779204233(173.04727390691872) | Crossentropy: 7.727463789177062 | Perplexity: 211.93290491870502

