# Part 01 - Exercise 3 (n-gram language models)

## Organizing our Data

At first, we downloaded via *nltk* package a valuable corpus such as 'reuters'. Moreover, we downloaded the method tokenization 'punkt' via nltk package.
We splitted our data into training, development and test set and we transformed any rare word (freq<=10) or out-of-vocabulary word to the special token 'UNK'.
As we can see from the printed console, a lot of words transformed into the special token 'UNK' in order to be able to handle the unknown words better.

In [1]:
# If you running for first time uncomment the following 3 lines iot download the corpus
# import nltk
# nltk.download()
# nltk.download('punkt')
from nltk.corpus import reuters
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.util import ngrams
import math
from more_itertools import windowed


# Load the 'reuters' corpus
sentences = reuters.sents()

# Splitting data into Training, Development and Test set
train_sents, test_sents = train_test_split(reuters.sents(), test_size=0.3, random_state=42)
dev_sents, test_sents = train_test_split(test_sents, test_size=0.5, random_state=42)


# Transform the train sentences into words
train_words = [word for sentence in train_sents for word in sentence]
freq_dist_train = FreqDist(train_words)

# Replace rare words in train set
cleaned_train_sentences = []
for sentence in train_sents:
    cleaned_train_sentence = [word if freq_dist_train[word] > 10 else 'UNK' for word in sentence]
    cleaned_train_sentences.append(cleaned_train_sentence)

print(cleaned_train_sentences[:3])


# Transform the development sentences into words
dev_words = [word for sentence in dev_sents for word in sentence]
freq_dist_dev = FreqDist(dev_words)

# Replace rare words or Out-of-Vocabulary words in dev set
cleaned_dev_sentences = []
for sentence in dev_sents:
    cleaned_dev_sentence = ['UNK' if freq_dist_dev[word] <= 10 or word not in train_words else word for word in sentence]
    cleaned_dev_sentences.append(cleaned_dev_sentence)

print(cleaned_dev_sentences[:3])

# Transform the test sentences into words
test_words = [word for sentence in test_sents for word in sentence]
freq_dist_test = FreqDist(test_words)

# Replace rare words or Out-of-Vocabulary words in test set
cleaned_test_sentences = []
for sentence in test_sents:
    cleaned_test_sentence = ['UNK' if freq_dist_test[word] <= 10 or word not in train_words else word for word in sentence]
    cleaned_test_sentences.append(cleaned_test_sentence)
    
print(cleaned_test_sentences[:3])

[['The', 'holders', ',', 'UNK', 'of', 'whom', 'was', 'willing', 'to', 'be', 'identified', ',', 'said', 'although', 'Harcourt', 'has', 'urged', 'that', 'they', 'convert', 'their', 'shares', 'to', 'common', 'stock', 'by', 'the', 'June', 'eight', 'record', 'date', 'for', 'a', 'special', 'dividend', ',', 'they', 'were', 'unable', 'to', 'determine', 'if', 'it', 'might', 'be', 'better', 'for', 'them', 'to', 'continue', 'holding', 'the', 'debentures', '.'], ['UNK', 'UNK', '&', 'lt', ';', 'UNK', '>', 'TO', 'UNK', 'UNK', 'OFFER', 'UNK', 'UNK', 'Corp', 'said', 'it', 'plans', 'to', 'respond', 'to', 'an', 'unsolicited', 'recapitalization', 'plan', 'proposed', 'by', 'Gabelli', 'and', 'Co', 'Inc', 'after', 'the', 'company', ',', 'its', 'board', 'and', 'its', 'investment', 'bankers', 'evaluate', 'the', 'proposal', '.'], ['The', 'main', 'reason', 'for', 'the', 'expected', 'increase', 'in', 'beet', 'UNK', 'is', 'that', 'returns', 'from', 'competing', 'crops', 'such', 'as', 'soybeans', 'and', 'grains', 

## i) Build our unigram, bigram & trigram model

In [2]:
# Build unigram, bigram and trigram counters for our training set
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in cleaned_train_sentences:

    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                    left_pad_symbol='<s>', right_pad_symbol='<e>')])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                       left_pad_symbol='<s>', right_pad_symbol='<e>')])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                        left_pad_symbol='<s>', right_pad_symbol='<e>')])
print(unigram_counter.most_common(10))
print(bigram_counter.most_common(10))
print(trigram_counter.most_common(10))

[(('UNK',), 73623), (('.',), 66403), ((',',), 50713), (('the',), 41011), (('of',), 25252), (('to',), 23930), (('in',), 18595), (('and',), 17693), (('said',), 17659), (('a',), 16401)]
[(('.', '<e>'), 34142), (('<s>', 'UNK'), 8218), (('UNK', 'UNK'), 7971), ((',', '000'), 7220), (("'", 's'), 6427), (('<s>', 'The'), 6167), (('lt', ';'), 6057), (('&', 'lt'), 6055), (('said', '.'), 5581), (('UNK', ','), 5060)]
[(('.', '<e>', '<e>'), 34142), (('<s>', '<s>', 'UNK'), 8218), (('<s>', '<s>', 'The'), 6167), (('&', 'lt', ';'), 6054), (('said', '.', '<e>'), 5580), (('lt', ';', 'UNK'), 4843), (('U', '.', 'S'), 3977), (('.', 'S', '.'), 3726), ((';', 'UNK', '>'), 3027), (('<s>', '<s>', '"'), 2528)]


## Calculation of bigram and trigram probabilities via Laplace smoothing

In the following block of code we constructed a function which is responsible for calculating the probability of a ngram (bigram or trigram) model. 
We used Laplace smoothing for this purpose. We also added the special tokens in order to include them in the size of the vocabulary.

In [3]:
# Define the hyperparameter alpha. Fine-tuning on the development set
alpha = 0.1

# Sum the tokens for the whole corpus (training, dev & test sets)
tokens = [token for sent in sentences for token in sent]
# Calculate vocabulary size (including any special tokens)
special_tokens = ['<s>', '<e>', 'UNK']
vocab_size = len(set(tokens + special_tokens))
print(f'The whole vocabulary size by train, development and test sets is: {vocab_size}')


def calc_ngram_proba(ngram_counter, ngram_minus_one_counter, ngram, alpha, vocab_size):
    """
    Calculate ngram probability with Laplace smoothing
    :param bigram_counter: Counter which the key is a tuple of ngram and value its frequency
    :param gram_counter: Counter which the key is a tuple of n-1gram and value its frequency
    :param ngram: tuple
    :param alpha: float hyperparameter for Laplace smoothing
    :param vocab_size: int value which defines the whole size of the corpus
    :return: float probability of the ngram inside the corpus
    """
    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]
    ngram_minus_one_count = ngram_minus_one_counter[context]
    if ngram_count>ngram_minus_one_count:
        print(f'The following ngram occurs an error in the counter: {ngram}')
    ngram_prob = (ngram_count + alpha) / (ngram_minus_one_count + (alpha * vocab_size))
    # if ngram_prob>0.6:
    #     print(f'ngram: {ngram}, ngram_count: {ngram_count}, ngram_minus_one_count: {ngram_minus_one_count}')
    return ngram_prob


The whole vocabulary size by train, development and test sets is: 41602


## ii) Calculation of probabilities, Cross-Entropy and Perplexity for our bigram model (Laplace smoothing)

In [4]:
# Calculate bigram probability and Cross-Entropy of sentences in the test set
total_log_proba_bigram = 0.0
for sent in cleaned_test_sentences:
    # Pad the sentence with '<s>' and '<e>' tokens
    padded_sent = ['<s>'] + sent + ['<e>']

    # Iterate over the bigrams of the sentence
    for first_token, second_token in windowed(padded_sent, 2):
        if first_token == '<s>': # Avoid calculating that because unigram counter does not does not have counts for <s>
            pass
        else:
            bigram = (first_token, second_token)
            bigram_prob = calc_ngram_proba(bigram_counter, unigram_counter, bigram, alpha, vocab_size)
            total_log_proba_bigram += math.log2(bigram_prob)

# Calculation of total tokens for test set, including only 'end' token for each sentence
num_tokens = sum(len(sent) + 1 for sent in cleaned_test_sentences)

cross_entropy_bigram = - total_log_proba_bigram / num_tokens
print(f"The total Cross-Entropy of bigram model for our Test set is: {cross_entropy_bigram:.3f}")

# Calculation of the perplexity of bigram model for the test set

bigram_perplexity = 2 ** (cross_entropy_bigram)
print(f"Perplexity of bigram model for Test Set: {bigram_perplexity:.3f}")

The total Cross-Entropy of bigram model for our Test set is: 7.107
Perplexity of bigram model for Test Set: 137.878


## Calculation of probabilities, Cross-Entropy and Perplexity for our trigram model

In [5]:
# Calculate trigram probability and Cross-Entropy of sentences in the test set
total_log_proba_trigram = 0.0
for sent in cleaned_test_sentences:
    # Pad the sentence with '<s>' and '<e>' tokens
    padded_sent = ['<s>'] + ['<s>'] + sent + ['<e>']

    # Iterate over the bigrams of the sentence
    for first_token, second_token, third_token in windowed(padded_sent, 3):
        if first_token == '<s>' and second_token == '<s>': # Avoid calculating that because bigram counter does not have counts for <s>, <s>
            pass
        else:
            trigram = (first_token, second_token, third_token)
            trigram_prob = calc_ngram_proba(trigram_counter, bigram_counter, trigram, alpha, vocab_size)
            total_log_proba_trigram += math.log2(trigram_prob)

cross_entropy_trigram = - total_log_proba_trigram / num_tokens 
print(f"The total Cross-Entropy of trigram model for our Test set is: {cross_entropy_trigram: .3f}")

# Calculation of the perplexity of bigram model for the test set
trigram_perplexity = 2 ** (cross_entropy_trigram)
print(f"Perplexity of trigram model for Test Set: {trigram_perplexity:.3f}")

The total Cross-Entropy of trigram model for our Test set is:  9.843
Perplexity of trigram model for Test Set: 918.252


## Calculation of bigram and trigram probabilities via Improved Kneser-Ney smoothing

In the following block of code we constructed a function which is responsible for calculating the probability of a ngram (bigram or trigram) model. 
In the following block of code we used Kneser-Ney smoothing which is more challenging and efficient. We generalized the purpose of our function in order to calculate either for bigram or trigram models.

In [None]:
def calc_kneser_ney_proba(ngram_counter, ngram_minus_one_counter, continuation_counts, total_continuations, ngram, delta):
    """
    Calculate ngram probability with simplified Kneser-Ney smoothing for bigrams or trigrams
    :param ngram_counter: Counter for ngrams (bigrams or trigrams)
    :param ngram_minus_one_counter: Counter for n-1 grams
    :param continuation_counts: Counter for continuation counts
    :param total_continuations: Total number of unique continuations
    :param ngram: tuple representing the ngram (bigram or trigram)
    :param delta: discount value
    :return: float probability of the ngram
    """
    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]
    ngram_minus_one_count = ngram_minus_one_counter[context]

    adjusted_count = max(ngram_count - delta, 0)
    epsilon = 1e-10

    # For bigrams, use the second token for continuation, for trigrams use the third token
    continuation_token = ngram[-1]

    # Retrieve or calculate the set for the current context
    if context not in context_set_cache:
        context_set_cache[context] = set(ng for ng in ngram_counter if ng[:-1] == context)

    # Use the counter for the unique elements
    context_set_counters[context] = Counter(context_set_cache[context])

    # Calculate our interpolation weight
    continuation_prob = continuation_counts[continuation_token] / total_continuations
    alpha_weight = (delta * len(context_set_counters[context]) + epsilon)/ (ngram_minus_one_count + epsilon)

    # Kneser-Ney probability
    kn_probability = (adjusted_count + epsilon) / (ngram_minus_one_count + epsilon) + alpha_weight * continuation_prob

    return kn_probability


## Calculation of probabilities, Cross-Entropy and Perplexity for our bigram model (Kneser-Ney smoothing)

In [31]:
from tqdm import tqdm

# Calculate continuation counts
continuation_counts = Counter([bigram[1] for bigram in bigram_counter])
total_continuations = len(continuation_counts)


# Initialize caches
context_set_cache = {}
context_set_counters = {}

total_log_proba_bigram_kn = 0.0
delta = 0.5
with tqdm(total=256000) as pbar:  # Check our time and iters remaining!
    for sent in cleaned_test_sentences:
        padded_sent = ['<s>'] + sent + ['<e>']
    
        for first_token, second_token in windowed(padded_sent, 2):
            if first_token == '<s>': # Avoid calculating that because unigram counter does not does not have counts for <s>
                pass
            else:
                bigram = (first_token, second_token)
                bigram_prob = calc_kneser_ney_proba(bigram_counter, unigram_counter, continuation_counts, total_continuations, bigram, delta)
                total_log_proba_bigram_kn += math.log2(bigram_prob)
                if bigram_prob > 1:
                    print(bigram_prob)
                pbar.update(1)  # Update the progress bar

cross_entropy_bigram_kn = - total_log_proba_bigram_kn / num_tokens 
print(f"The total Cross-Entropy of bigram model via Kneser-Ney smoothing for our Test set is: {cross_entropy_bigram_kn: .3f}")

# Calculation of the perplexity of bigram model for the test set via Kneser-Ney smoothing
bigram_perplexity_kn = 2 ** (cross_entropy_bigram_kn)
print(f"Perplexity of bigram model for Test Set: {bigram_perplexity_kn:.3f}")

256664it [03:13, 1323.16it/s]                                                                                                                                                    

The total Cross-Entropy of bigram model via Kneser-Ney smoothing for our Test set is:  4.841
Perplexity of bigram model for Test Set: 28.654





## Calculation of probabilities, Cross-Entropy and Perplexity for our trigram model (Kneser-Ney smoothing)

In [33]:
# Calculate continuation counts
continuation_counts_tri = Counter([trigram[2] for trigram in trigram_counter])
total_continuations_tri = len(continuation_counts_tri)

# Initialize caches again
context_set_cache = {}
context_set_counters = {}

total_log_proba_trigram_kn = 0.0
delta = 0.75

with tqdm(total=256000) as pbar:  # Check our time and iters remaining!
    for sent in cleaned_test_sentences:
        padded_sent = ['<s>'] + ['<s>'] + sent + ['<e>']
    
        for first_token, second_token, third_token in windowed(padded_sent, 3):
            if first_token == '<s>' and second_token == '<s>': # Avoid calculating that because bigram counter does not have counts for <s>, <s>
                pass
            else:
                trigram = (first_token, second_token, third_token)
                trigram_prob = calc_kneser_ney_proba(trigram_counter, bigram_counter, continuation_counts_tri, total_continuations_tri, 
                                                     trigram, delta)
                if trigram_prob == 0:
                    print(trigram_prob)
                total_log_proba_trigram_kn += math.log2(trigram_prob)
                pbar.update(1)  # Update the progress bar

cross_entropy_trigram_kn = - total_log_proba_trigram_kn / num_tokens 
print(f"The total Cross-Entropy of trigram model for our Test set is: {cross_entropy_trigram_kn: .3f}")

# Calculation of the perplexity of bigram model for the test set
trigram_perplexity_kn = 2 ** (cross_entropy_trigram_kn)
print(f"Perplexity of trigram model for Test Set: {trigram_perplexity_kn:.3f}")

256664it [1:01:36, 69.44it/s]                                                                                                                                                    

The total Cross-Entropy of trigram model for our Test set is:  2.428
Perplexity of trigram model for Test Set: 5.380





## The initial Kneser-Ney smoothing technique

In [None]:
def calc_kneser_ney_proba1(ngram_counter, ngram_minus_one_counter, continuation_counts, total_continuations, ngram, delta):
    """
    Calculate ngram probability with simplified Kneser-Ney smoothing for bigrams or trigrams
    :param ngram_counter: Counter for ngrams (bigrams or trigrams)
    :param ngram_minus_one_counter: Counter for n-1 grams
    :param continuation_counts: Counter for continuation counts
    :param total_continuations: Total number of unique continuations
    :param ngram: tuple representing the ngram (bigram or trigram)
    :param delta: discount value
    :return: float probability of the ngram
    """
    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]
    ngram_minus_one_count = ngram_minus_one_counter[context]

    adjusted_count = max(ngram_count - delta, 0)
    epsilon = 1e-10

    # For bigrams, use the second token for continuation, for trigrams use the third token
    continuation_token = ngram[-1]

    # Calculate our interpolation weight
    continuation_prob = continuation_counts[continuation_token] / total_continuations
    alpha_weight = delta * len(set([ng for ng in ngram_counter if ng[:-1] == context])) / (ngram_minus_one_count + epsilon)

    # Kneser-Ney probability
    kn_probability = adjusted_count / (ngram_minus_one_count + epsilon) + alpha_weight * continuation_prob

    return kn_probability