<a href="https://colab.research.google.com/github/Nimrat4/laplace-smoothing-nlp/blob/main/laplace_smoothing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from collections import defaultdict

def tokenize_and_generate_bigrams(input_text):
    """Split text into lowercase word tokens and pair up adjacent tokens.

    Args:
        input_text: The text to tokenize.

    Returns:
        A ``(tokens, bigrams)`` tuple. ``tokens`` is the list of lowercase
        word-character runs in the order they appear; ``bigrams`` is the list
        of ``(token, following_token)`` tuples for every adjacent pair, so it
        is empty when fewer than two tokens are found.
    """
    words = re.findall(r'\b\w+\b', input_text.lower())
    # Zipping the list against itself shifted by one yields each adjacent pair.
    adjacent_pairs = list(zip(words, words[1:]))
    return words, adjacent_pairs

def calculate_bigram_probabilities_with_pms(input_text, alpha=0.1):
    """Estimate bigram probabilities using Probability Mass Stealing (PMS).

    Every bigram observed in ``input_text`` gets its maximum-likelihood
    estimate ``count(w1, w2) / count(w1)`` scaled down by ``(1 - alpha)``.
    Every other ordered pair of vocabulary words gets the flat value
    ``alpha / vocab_size``.

    Args:
        input_text: Corpus text; tokenized by ``tokenize_and_generate_bigrams``.
        alpha: Fraction of probability mass taken from seen bigrams.

    Returns:
        A dict mapping ``(w1, w2)`` to the smoothed value for "w2 follows w1".
        Seen bigrams come first (in first-occurrence order), followed by the
        unseen vocabulary pairs. An empty dict is returned when the text
        contains no word tokens.

    Note:
        Values are not renormalised per context, so the entries sharing a
        first word need not sum to 1.
    """
    tokens, bigrams = tokenize_and_generate_bigrams(input_text)

    # Count occurrences of unigrams and bigrams.
    unigram_count = defaultdict(int)
    for token in tokens:
        unigram_count[token] += 1
    bigram_count = defaultdict(int)
    for bigram in bigrams:
        bigram_count[bigram] += 1

    # With an empty vocabulary there is nothing to smooth, and the
    # alpha / vocab_size step below would divide by zero.
    if not unigram_count:
        return {}
    vocab_size = len(unigram_count)

    # Seen bigrams: discounted maximum-likelihood estimate.
    bigram_probabilities_pms = {
        (w1, w2): (1 - alpha) * (count / unigram_count[w1])
        for (w1, w2), count in bigram_count.items()
    }

    # Unseen bigrams: flat share of the stolen mass. setdefault leaves the
    # already-computed seen entries untouched.
    unseen_probability = alpha / vocab_size
    for w1 in unigram_count:
        for w2 in unigram_count:
            bigram_probabilities_pms.setdefault((w1, w2), unseen_probability)

    return bigram_probabilities_pms

# Demo corpus: three short sentences that share the words "learning is".
input_text = "machine learning is amazing. deep learning is powerful. learning is continuous."

# Smooth the bigram estimates with the default alpha.
bigram_probabilities_pms = calculate_bigram_probabilities_with_pms(input_text)

# Show every (context, next word) entry, four decimals each.
print("\nBigram Probabilities (After PMS):")
for (prev_word, next_word), prob in bigram_probabilities_pms.items():
    print(f"P({next_word} | {prev_word}) = {prob:.4f}")

# Look up one bigram of interest; fall back to 0 if it is absent.
target_bigram = ("learning", "is")
target_probability = bigram_probabilities_pms.get(target_bigram, 0)
print(f"\nProbability of 'learning is' after PMS: {target_probability:.4f}")



Bigram Probabilities (After PMS):
P(learning | machine) = 0.9000
P(is | learning) = 0.9000
P(amazing | is) = 0.3000
P(deep | amazing) = 0.9000
P(learning | deep) = 0.9000
P(powerful | is) = 0.3000
P(learning | powerful) = 0.9000
P(continuous | is) = 0.3000
P(machine | machine) = 0.0143
P(is | machine) = 0.0143
P(amazing | machine) = 0.0143
P(deep | machine) = 0.0143
P(powerful | machine) = 0.0143
P(continuous | machine) = 0.0143
P(machine | learning) = 0.0143
P(learning | learning) = 0.0143
P(amazing | learning) = 0.0143
P(deep | learning) = 0.0143
P(powerful | learning) = 0.0143
P(continuous | learning) = 0.0143
P(machine | is) = 0.0143
P(learning | is) = 0.0143
P(is | is) = 0.0143
P(deep | is) = 0.0143
P(machine | amazing) = 0.0143
P(learning | amazing) = 0.0143
P(is | amazing) = 0.0143
P(amazing | amazing) = 0.0143
P(powerful | amazing) = 0.0143
P(continuous | amazing) = 0.0143
P(machine | deep) = 0.0143
P(is | deep) = 0.0143
P(amazing | deep) = 0.0143
P(deep | deep) = 0.0143
P(pow

In [None]:

from collections import defaultdict
import re

def laplace_smoothing_ngram_model(input_text, n_value):
    """Build an add-one (Laplace) smoothed n-gram model from text.

    For every (n-1)-word prefix that starts an n-gram in the text, each
    vocabulary word ``w`` is assigned

        P(w | prefix) = (count(prefix + w) + 1) / (count(prefix) + V)

    where ``V`` is the number of distinct words. Continuations never seen
    after a prefix use the same formula with a count of 0, so every
    per-prefix distribution sums to 1.

    Args:
        input_text: Corpus text. It is lowercased and split into
            word-character runs.
        n_value: N-gram order (2 for bigrams, 3 for trigrams, ...).

    Returns:
        A dict mapping each prefix tuple to a dict of
        ``{next_word: probability}`` covering the whole vocabulary. Empty
        when the text has fewer than ``n_value`` tokens.

    Raises:
        ValueError: If ``n_value`` is less than 1.
    """
    if n_value < 1:
        raise ValueError("n_value must be a positive integer")

    word_tokens = re.findall(r'\b\w+\b', input_text.lower())
    # dict.fromkeys de-duplicates while keeping first-occurrence order, which
    # makes the layout of the result deterministic across runs.
    vocabulary = list(dict.fromkeys(word_tokens))
    vocab_size = len(vocabulary)

    ngram_frequency = defaultdict(int)   # count of each full n-gram
    prefix_frequency = defaultdict(int)  # count of each (n-1)-gram used as a context
    for i in range(len(word_tokens) - n_value + 1):
        current_ngram = tuple(word_tokens[i:i + n_value])
        ngram_frequency[current_ngram] += 1
        prefix_frequency[current_ngram[:-1]] += 1

    ngram_probabilities = {}

    # Seen continuations: add-one smoothed relative frequency.
    for ngram, count in ngram_frequency.items():
        ngram_prefix, next_word = ngram[:-1], ngram[-1]
        smoothed_probability = (count + 1) / (prefix_frequency[ngram_prefix] + vocab_size)
        ngram_probabilities.setdefault(ngram_prefix, {})[next_word] = smoothed_probability

    # Unseen continuations: same formula with a zero n-gram count. Using
    # 1 / V here instead would make the distribution sum to more than 1.
    for ngram_prefix, prefix_count in prefix_frequency.items():
        unseen_probability = 1 / (prefix_count + vocab_size)
        distribution = ngram_probabilities.setdefault(ngram_prefix, {})
        for word in vocabulary:
            distribution.setdefault(word, unseen_probability)

    return ngram_probabilities

# Demo: fit a trigram model (n_value = 3) on a short sentence.
input_text = "I love NLP and I love machine learning"
n_value = 3
ngram_probabilities = laplace_smoothing_ngram_model(input_text, n_value)

# One line per prefix, followed by its next-word probability table.
for prefix, distribution in ngram_probabilities.items():
    distribution_as_dict = dict(distribution)
    print(f"{prefix}: {distribution_as_dict}")
