<a href="https://colab.research.google.com/github/Raka7317/Generative_ai_Lab/blob/main/BIGRAM_TRIGRAM_PART2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import collections
import re
import math

def preprocess_text(text):
    """Lowercase ``text`` and drop every character that is neither a word
    character (letter, digit, underscore) nor whitespace."""
    return re.sub(r'[^\w\s]', '', text.lower())

def tokenize(text):
    """Break ``text`` into a list of whitespace-separated words."""
    words = text.split()
    return words

def generate_ngrams(tokens, n):
    """Return every contiguous window of ``n`` tokens as a tuple, in order.

    The result is an empty list when ``tokens`` holds fewer than ``n`` items,
    because the range of window start positions is then empty.
    """
    window_starts = range(len(tokens) - n + 1)
    return [tuple(tokens[start:start + n]) for start in window_starts]

def ngram_prob_no_smoothing(ngrams_list, prefixes_list):
    """Calculates n-gram probabilities without smoothing.

    Each distinct n-gram receives the maximum-likelihood estimate
    C(ngram) / C(prefix), where the prefix is the n-gram without its
    last token.

    Args:
        ngrams_list: An iterable of n-grams (e.g., [('a', 'b'), ('b', 'c')]).
        prefixes_list: An iterable of (n-1)-gram prefixes
            (e.g., [('a',), ('b',)]).

    Returns:
        A dictionary mapping each distinct n-gram to its probability.
        An n-gram whose prefix never occurs in ``prefixes_list`` maps to 0.0.
    """
    ngram_counts = collections.Counter(ngrams_list)
    prefix_counts = collections.Counter(prefixes_list)

    probabilities = {}
    # Iterate the counter's distinct keys instead of the raw input: the input
    # has already been consumed by Counter (which matters for one-shot
    # iterables), and each repeated n-gram would only recompute the same value.
    for ngram, ngram_count in ngram_counts.items():
        # A Counter returns 0 for a prefix it has never seen.
        prefix_count = prefix_counts[ngram[:-1]]
        probabilities[ngram] = ngram_count / prefix_count if prefix_count else 0.0

    return probabilities

def ngram_prob_laplace(ngrams_list, prefixes_list, vocab_size):
    """Calculates n-gram probabilities with Laplace (add-one) smoothing.

    Each distinct n-gram receives (C(ngram) + 1) / (C(prefix) + V), where the
    prefix is the n-gram without its last token and V is ``vocab_size``.

    Args:
        ngrams_list: An iterable of n-grams.
        prefixes_list: An iterable of (n-1)-gram prefixes.
        vocab_size: The size of the vocabulary.

    Returns:
        A dictionary mapping each distinct n-gram to its smoothed probability.
    """
    ngram_counts = collections.Counter(ngrams_list)
    prefix_counts = collections.Counter(prefixes_list)

    # Iterate the counter's distinct keys instead of the raw input: the input
    # has already been consumed by Counter (which matters for one-shot
    # iterables), and each repeated n-gram would only recompute the same value.
    # A Counter returns 0 for a prefix it has never seen.
    return {
        ngram: (ngram_count + 1) / (prefix_counts[ngram[:-1]] + vocab_size)
        for ngram, ngram_count in ngram_counts.items()
    }


def calculate_perplexity(ngrams, probabilities):
    """Calculates the perplexity of a sequence of n-grams.

    Perplexity is exp(-(1/N) * sum(log P(ngram))) over the N n-grams given.

    Args:
        ngrams: The sequence of n-grams to score (e.g., a test set).
        probabilities: A dictionary mapping n-grams to their probabilities.

    Returns:
        The perplexity as a float. The result is infinity when ``ngrams`` is
        empty or when any n-gram is missing from ``probabilities`` or has a
        probability of zero.
    """
    infinite = float('inf')
    count = len(ngrams)
    if count == 0:
        return infinite

    total_log_prob = 0.0
    for gram in ngrams:
        p = probabilities.get(gram)
        # An unknown or zero-probability n-gram makes the whole sequence
        # impossible under the model, so stop immediately.
        if p is None or p == 0:
            return infinite
        total_log_prob += math.log(p)

    return math.exp(-total_log_prob / count)

# Sample corpus; the same tokens are used to fit and to score the models.
text = "Data science is fun and data science is powerful"

# Normalise the raw text, split it into words, and size the vocabulary.
clean_text = preprocess_text(text)
tokens = tokenize(clean_text)
vocab_size = len(set(tokens))

# ---------------- N-GRAM LISTS ----------------
# Unigrams are wrapped in 1-tuples so they compare equal to bigram prefixes.
unigrams = [(token,) for token in tokens]
bigrams = generate_ngrams(tokens, 2)
bigram_prefixes = generate_ngrams(tokens, 2)
trigrams = generate_ngrams(tokens, 3)

# ---------------- PROBABILITY TABLES ----------------
bigram_probs_no = ngram_prob_no_smoothing(bigrams, unigrams)
bigram_probs_la = ngram_prob_laplace(bigrams, unigrams, vocab_size)
trigram_probs_no = ngram_prob_no_smoothing(trigrams, bigram_prefixes)
trigram_probs_la = ngram_prob_laplace(trigrams, bigram_prefixes, vocab_size)

# ---------------- PERPLEXITIES ----------------
bigram_pp_no = calculate_perplexity(bigrams, bigram_probs_no)
bigram_pp_la = calculate_perplexity(bigrams, bigram_probs_la)
trigram_pp_no = calculate_perplexity(trigrams, trigram_probs_no)
trigram_pp_la = calculate_perplexity(trigrams, trigram_probs_la)

# ---------------- REPORT ----------------
print("Tokens:", tokens)

print("\nBIGRAM PERPLEXITY")
print("Without smoothing:", bigram_pp_no)
print("With Laplace smoothing:", bigram_pp_la)

print("\nTRIGRAM PERPLEXITY")
print("Without smoothing:", trigram_pp_no)
print("With Laplace smoothing:", trigram_pp_la)


NameError: name 'ngram_prob_no_smoothing' is not defined

In [5]:
import re
import math
from collections import Counter

# -----------------------------
# 1. TEXT PREPROCESSING
# -----------------------------
def preprocess_text(text):
    """Lowercase ``text``, keep only the letters a-z and whitespace, then
    collapse each whitespace run to one space and trim both ends."""
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()


# -----------------------------
# 2. TOKENIZATION
# -----------------------------
def tokenize(text):
    """Return the whitespace-separated words of ``text`` as a list."""
    tokens = text.split()
    return tokens


# -----------------------------
# 3. GENERATE N-GRAMS
# -----------------------------
def generate_ngrams(tokens, n):
    """Collect each run of ``n`` consecutive tokens as a tuple, left to right.

    Returns an empty list when there are fewer than ``n`` tokens.
    """
    grams = []
    last_start = len(tokens) - n
    for start in range(last_start + 1):
        grams.append(tuple(tokens[start:start + n]))
    return grams


# -----------------------------
# 4. PROBABILITY WITHOUT SMOOTHING
# -----------------------------
def ngram_prob_no_smoothing(ngrams, lower_ngrams):
    """Estimate unsmoothed n-gram probabilities as C(ngram) / C(prefix).

    Args:
        ngrams: Iterable of n-gram tuples.
        lower_ngrams: Iterable of (n-1)-gram tuples used to count prefixes.

    Returns:
        Dict mapping each distinct n-gram to its probability. An n-gram whose
        prefix (the n-gram minus its last token) never occurs in
        ``lower_ngrams`` maps to 0.0.
    """
    ngram_count = Counter(ngrams)
    lower_count = Counter(lower_ngrams)

    probs = {}
    for ngram, count in ngram_count.items():
        # A Counter returns 0 for an unseen prefix; dividing by that would
        # raise ZeroDivisionError, so such n-grams get probability 0.0.
        prefix_total = lower_count[ngram[:-1]]
        probs[ngram] = count / prefix_total if prefix_total else 0.0
    return probs


# -----------------------------
# 5. PROBABILITY WITH LAPLACE SMOOTHING
# -----------------------------
def ngram_prob_laplace(ngrams, lower_ngrams, vocab_size):
    """Estimate add-one smoothed probabilities: (C(ngram) + 1) / (C(prefix) + V).

    The prefix is the n-gram minus its last token, counted in
    ``lower_ngrams``; V is ``vocab_size``. One entry is returned per distinct
    n-gram.
    """
    seen = Counter(ngrams)
    contexts = Counter(lower_ngrams)
    return {
        gram: (freq + 1) / (contexts[gram[:-1]] + vocab_size)
        for gram, freq in seen.items()
    }


# -----------------------------
# 6. PERPLEXITY CALCULATION
# -----------------------------
def calculate_perplexity(ngrams, probabilities):
    """Compute perplexity exp(-(1/N) * sum(log P(ngram))) over ``ngrams``.

    Args:
        ngrams: Sequence of n-gram tuples to score.
        probabilities: Dict mapping n-grams to their probabilities.

    Returns:
        The perplexity as a float. An empty ``ngrams`` sequence yields
        infinity. An n-gram that is missing from ``probabilities`` or has a
        stored probability of zero is scored with a floor of 1e-10.
    """
    N = len(ngrams)
    if N == 0:
        # No n-grams to average over (e.g. text shorter than n); dividing by
        # N below would raise ZeroDivisionError.
        return float('inf')

    floor = 1e-10  # stand-in probability so that log() never receives 0
    log_sum = 0
    for ngram in ngrams:
        # `or` applies the floor both to a missing key (None) and to a stored
        # probability of 0, either of which would break math.log.
        prob = probabilities.get(ngram) or floor
        log_sum += math.log(prob)

    return math.exp(-log_sum / N)


# -----------------------------
# 7. MAIN EXECUTION
# -----------------------------
# Sample corpus; the same tokens are used to fit and to score the models.
text = "Data science is fun and data science is powerful"

# Clean the text, split it into words, and count the distinct words.
clean_text = preprocess_text(text)
tokens = tokenize(clean_text)
vocab_size = len(set(tokens))

# -------- N-GRAM LISTS --------
# Unigrams are wrapped in 1-tuples so they compare equal to bigram prefixes.
unigrams = [(token,) for token in tokens]
bigrams = generate_ngrams(tokens, 2)
bigram_prefixes = generate_ngrams(tokens, 2)
trigrams = generate_ngrams(tokens, 3)

# -------- UNSMOOTHED MODELS --------
bigram_probs_no = ngram_prob_no_smoothing(bigrams, unigrams)
trigram_probs_no = ngram_prob_no_smoothing(trigrams, bigram_prefixes)
bigram_pp_no = calculate_perplexity(bigrams, bigram_probs_no)
trigram_pp_no = calculate_perplexity(trigrams, trigram_probs_no)

# -------- LAPLACE-SMOOTHED MODELS --------
bigram_probs_la = ngram_prob_laplace(bigrams, unigrams, vocab_size)
trigram_probs_la = ngram_prob_laplace(trigrams, bigram_prefixes, vocab_size)
bigram_pp_la = calculate_perplexity(bigrams, bigram_probs_la)
trigram_pp_la = calculate_perplexity(trigrams, trigram_probs_la)


# -----------------------------
# 8. OUTPUT
# -----------------------------
print("Tokens:", tokens)

print("\nBIGRAM PERPLEXITY")
print("Without smoothing:", bigram_pp_no)
print("With Laplace smoothing:", bigram_pp_la)

print("\nTRIGRAM PERPLEXITY")
print("Without smoothing:", trigram_pp_no)
print("With Laplace smoothing:", trigram_pp_la)


Tokens: ['data', 'science', 'is', 'fun', 'and', 'data', 'science', 'is', 'powerful']

BIGRAM PERPLEXITY
Without smoothing: 1.189207115002721
With Laplace smoothing: 3.158758147025058

TRIGRAM PERPLEXITY
Without smoothing: 1.2190136542044754
With Laplace smoothing: 3.364298418765503
