## The initial Kneser-Ney smoothing technique

In [None]:
def calc_kneser_ney_proba(ngram_counter, ngram_minus_one_counter, continuation_counts, total_continuations, ngram, delta):
    """
    Estimate the probability of the last token of ``ngram`` given its context,
    using a simplified interpolated Kneser-Ney scheme (bigrams or trigrams).

    The result is ``max(count - delta, 0) / context_count`` plus an interpolation
    weight multiplied by the continuation probability of the last token.

    Note: for the sentence-start contexts ``('<s>',)`` and ``('<s>', '<s>')`` the
    context count is read from the module-level ``cleaned_train_sentences``
    instead of ``ngram_minus_one_counter``.

    :param ngram_counter: Counter for ngrams (bigrams or trigrams); it is scanned
        in full on every call to count the ngrams that share the context
    :param ngram_minus_one_counter: Counter for n-1 grams, keyed by context tuple
    :param continuation_counts: Counter of continuation counts, keyed by token
    :param total_continuations: denominator of the continuation probability
    :param ngram: tuple representing the ngram (bigram or trigram)
    :param delta: discount value subtracted from the ngram count
    :return: float probability of the ngram
    """
    # Guards both divisions below against a zero context count
    epsilon = 1e-10
    start_contexts = (('<s>', '<s>'), ('<s>',))

    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]

    # Sentence-start contexts occur once per training sentence
    if context in start_contexts:
        ngram_minus_one_count = len(cleaned_train_sentences)
    else:
        ngram_minus_one_count = ngram_minus_one_counter[context]

    # Absolute discounting, floored at zero for unseen / rare ngrams
    adjusted_count = max(ngram_count - delta, 0)

    # The continuation probability always concerns the last token of the ngram
    continuation_token = ngram[-1]
    continuation_prob = continuation_counts[continuation_token] / total_continuations

    # Number of distinct ngrams that begin with this context
    followers = len({ng for ng in ngram_counter if ng[:-1] == context})

    # With an unseen context both counts are 0, so the weight becomes
    # epsilon / epsilon == 1 and the result is the continuation probability.
    alpha_weight = (delta * followers + epsilon) / (ngram_minus_one_count + epsilon)

    discounted_term = adjusted_count / (ngram_minus_one_count + epsilon)
    kn_probability = discounted_term + alpha_weight * continuation_prob

    if kn_probability == 0 or kn_probability >= 1:
        print(f'Error occured with ngram: {ngram}. Probability more than 1 or 0.')
    return kn_probability

## Improved Kneser-Ney smoothing implementation by using context_set_cache

In [None]:
def calc_kneser_ney_proba(ngram_counter, ngram_minus_one_counter, continuation_counts, total_continuations, ngram, delta):
    """
    Calculate ngram probability with simplified Kneser-Ney smoothing for bigrams or trigrams.

    The set of ngrams sharing a context is memoised in the module-level
    ``context_set_cache`` (and mirrored as a Counter in ``context_set_counters``),
    so ``ngram_counter`` is scanned only the first time a context is seen.
    Cache entries are keyed by context alone: clear both caches before calling
    this function with a different ``ngram_counter`` of the same order.

    For the sentence-start contexts ``('<s>',)`` and ``('<s>', '<s>')`` the
    context count is read from the module-level ``cleaned_train_sentences``.

    :param ngram_counter: Counter for ngrams (bigrams or trigrams)
    :param ngram_minus_one_counter: Counter for n-1 grams
    :param continuation_counts: Counter for continuation counts
    :param total_continuations: Total number of unique continuations
    :param ngram: tuple representing the ngram (bigram or trigram)
    :param delta: discount value
    :return: float probability of the ngram
    """
    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]
    if context == ('<s>', '<s>',) or context == ('<s>',):
        # Sentence-start contexts occur once per training sentence
        ngram_minus_one_count = len(cleaned_train_sentences)
    else:
        ngram_minus_one_count = ngram_minus_one_counter[context]

    # Absolute discounting, floored at zero
    adjusted_count = max(ngram_count - delta, 0)
    # Guards the divisions below against a zero context count
    epsilon = 1e-10

    # For bigrams, use the second token for continuation, for trigrams use the third token
    continuation_token = ngram[-1]

    # Retrieve or calculate the set of ngrams for the current context
    if context not in context_set_cache:
        context_set_cache[context] = {ng for ng in ngram_counter if ng[:-1] == context}

    # Build the Counter only on a cache miss; rebuilding it on every call
    # repeated work proportional to the size of the cached set.
    if context not in context_set_counters:
        context_set_counters[context] = Counter(context_set_cache[context])

    # Calculate our interpolation weight.
    # With an unseen context both counts are 0, so the weight becomes
    # epsilon / epsilon == 1 and the result is the continuation probability.
    continuation_prob = continuation_counts[continuation_token] / total_continuations
    alpha_weight = (delta * len(context_set_counters[context]) + epsilon) / (ngram_minus_one_count + epsilon)
    kn_probability = adjusted_count / (ngram_minus_one_count + epsilon) + alpha_weight * continuation_prob
    if kn_probability == 0 or kn_probability >= 1:
        print(f'Error occured with ngram: {ngram}. Probability more than 1 or 0.')
    return kn_probability


# Initialize caches (re-running this resets them)
# context -> set of ngrams in ngram_counter that start with that context
context_set_cache = {}
# context -> Counter built from the cached set; its length is the number of distinct ngrams
context_set_counters = {}

## Best Kneser-Ney smoothing implementation: precompute the prefix counter once, outside the function, for the whole n-gram Counter

In [None]:
# Snapshot every key of trigram_counter as a tuple
# (presumably a Counter keyed by trigram; defined elsewhere in the notebook)
ngram_tuples_tri = list(map(tuple, trigram_counter))

# For each prefix (an n-gram without its last token), count how many entries
# of ngram_tuples_tri start with it. Computed once here so the smoothing
# function can look the value up instead of rescanning the counter.
prefixes_counter_tri = Counter(trigram[:-1] for trigram in ngram_tuples_tri)


def calc_kneser_ney_proba(ngram_counter, ngram_minus_one_counter, continuation_counts, ngram, delta, prefixes_counter):
    """
    Estimate the probability of the last token of ``ngram`` given its context,
    using a simplified interpolated Kneser-Ney scheme (bigrams or trigrams).

    The number of ngrams sharing the context is looked up in the precomputed
    ``prefixes_counter`` rather than derived by scanning ``ngram_counter``. The
    continuation probability is normalised by ``len(ngram_counter)``.

    Note: for the sentence-start contexts ``('<s>',)`` and ``('<s>', '<s>')`` the
    context count is read from the module-level ``cleaned_train_sentences``
    instead of ``ngram_minus_one_counter``.

    :param ngram_counter: Counter for ngrams (bigrams or trigrams)
    :param ngram_minus_one_counter: Counter for n-1 grams, keyed by context tuple
    :param continuation_counts: Counter of continuation counts, keyed by token
    :param ngram: tuple representing the ngram (bigram or trigram)
    :param delta: discount value subtracted from the ngram count
    :param prefixes_counter: Counter mapping each context (ngram prefix) to the
        number of ngrams that start with it
    :return: float probability of the ngram
    """
    # Guards both divisions below against a zero context count
    epsilon = 1e-10
    start_contexts = (('<s>', '<s>'), ('<s>',))

    ngram_count = ngram_counter[ngram]
    context = ngram[:-1]

    # Sentence-start contexts occur once per training sentence
    if context in start_contexts:
        ngram_minus_one_count = len(cleaned_train_sentences)
    else:
        ngram_minus_one_count = ngram_minus_one_counter[context]

    # Absolute discounting, floored at zero for unseen / rare ngrams
    adjusted_count = max(ngram_count - delta, 0)

    # The continuation probability always concerns the last token of the ngram
    continuation_token = ngram[-1]
    continuation_prob = continuation_counts[continuation_token] / len(ngram_counter)
    if continuation_prob > 1:
        print(f'The continuation probability is: {continuation_prob:.3f}')

    # With an unseen context both counts are 0, so the weight becomes
    # epsilon / epsilon == 1 and the result is the continuation probability.
    followers = prefixes_counter[context]
    alpha_weight = (delta * followers + epsilon) / (ngram_minus_one_count + epsilon)

    discounted_term = adjusted_count / (ngram_minus_one_count + epsilon)
    kn_probability = discounted_term + alpha_weight * continuation_prob

    if kn_probability == 0 or kn_probability >= 1:
        print(f'Error occured with ngram: {ngram}. Probability more than 1 or 0.')
    return kn_probability
