<a href="https://colab.research.google.com/github/Teja3993/NLP_Lab/blob/main/NLP_Lab_Exercise_4_Bi_Gram_Add_One_Smoothing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Experiment 4: Bigram Model & Add-One Smoothing ---

def get_data():
    # The dataset from the PDF
    data = ['This is a dog', 'This is a cat', 'I love my cat', 'This is my name']

    # Flatten into a single list of words
    tokens = []
    for sentence in data:
        for word in sentence.split():
            tokens.append(word)
    return tokens

def build_models(tokens):
    # Dictionaries to store counts
    bigram_counts = {}
    unigram_counts = {}
    bigram_list = []

    # Vocabulary size (V) for smoothing
    unique_words = set(tokens)
    V = len(unique_words)

    # Logic to create bigrams
    # The PDF logic skips bigrams if the second word is Capitalized
    # (assuming it's the start of a new sentence)
    for i in range(len(tokens) - 1):
        current_word = tokens[i]
        next_word = tokens[i+1]

        # Update Unigram Count
        if current_word in unigram_counts:
            unigram_counts[current_word] += 1
        else:
            unigram_counts[current_word] = 1

        # Check condition to form bigram (heuristics from PDF)
        # We only form a bigram if the next word is lowercase (part of same sentence)
        if next_word.islower():
            bigram = (current_word, next_word)
            bigram_list.append(bigram)

            if bigram in bigram_counts:
                bigram_counts[bigram] += 1
            else:
                bigram_counts[bigram] = 1

    return bigram_list, unigram_counts, bigram_counts, V

def calculate_probabilities(bigram_counts, unigram_counts, V):
    raw_probs = {}
    smoothed_probs = {}

    # Iterate through all found bigrams
    for bigram in bigram_counts:
        word1 = bigram[0]
        word2 = bigram[1]

        count_w1w2 = bigram_counts[bigram]
        count_w1 = unigram_counts[word1]

        # 1. Raw Probability Formula (Matches PDF Output)
        raw_probs[bigram] = count_w1w2 / count_w1

        # 2. Add-One Smoothing Formula (Matches PDF Aim)
        # (Count + 1) / (UnigramCount + VocabularySize)
        smoothed_probs[bigram] = (count_w1w2 + 1) / (count_w1 + V)

    return raw_probs, smoothed_probs

def test_sentence(sentence, raw_probs):
    print(f"\n--- Testing Sentence: \"{sentence}\" ---")
    words = sentence.split()
    total_prob = 1.0
    found_bigrams = []

    for i in range(len(words) - 1):
        bg = (words[i], words[i+1])
        found_bigrams.append(bg)

        if bg in raw_probs:
            prob = raw_probs[bg]
            print(f"P({bg[1]}|{bg[0]}) = {prob}")
            total_prob *= prob
        else:
            print(f"P({bg[1]}|{bg[0]}) = 0.0 (Not found)")
            total_prob *= 0.0

    print(f"Total Probability: {total_prob}")

# --- Execution ---
tokens = get_data()
print("Tokens:", tokens)

# Build Counts
bigram_list, unigram_counts, bigram_counts, V = build_models(tokens)

# Calculate Probs
raw_probs, smoothed_probs = calculate_probabilities(bigram_counts, unigram_counts, V)

# Display Results (Similar to PDF Output)
print("\nUnigram Frequencies:", unigram_counts)
print("Bigram Frequencies:", bigram_counts)

print("\n--- Raw Probabilities (Matches PDF Output) ---")
for bg, prob in raw_probs.items():
    print(f"{bg}: {prob:.4f}")

# Run the Test Case from the PDF
test_sentence("This is my cat", raw_probs)

Tokens: ['This', 'is', 'a', 'dog', 'This', 'is', 'a', 'cat', 'I', 'love', 'my', 'cat', 'This', 'is', 'my', 'name']

Unigram Frequencies: {'This': 3, 'is': 3, 'a': 2, 'dog': 1, 'cat': 2, 'I': 1, 'love': 1, 'my': 2}
Bigram Frequencies: {('This', 'is'): 3, ('is', 'a'): 2, ('a', 'dog'): 1, ('a', 'cat'): 1, ('I', 'love'): 1, ('love', 'my'): 1, ('my', 'cat'): 1, ('is', 'my'): 1, ('my', 'name'): 1}

--- Raw Probabilities (Matches PDF Output) ---
('This', 'is'): 1.0000
('is', 'a'): 0.6667
('a', 'dog'): 0.5000
('a', 'cat'): 0.5000
('I', 'love'): 1.0000
('love', 'my'): 1.0000
('my', 'cat'): 0.5000
('is', 'my'): 0.3333
('my', 'name'): 0.5000

--- Testing Sentence: "This is my cat" ---
P(is|This) = 1.0
P(my|is) = 0.3333333333333333
P(cat|my) = 0.5
Total Probability: 0.16666666666666666
