<a href="https://colab.research.google.com/github/Raka7317/Generative_ai_Lab/blob/main/bi_gram_tri_gram_practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import math
from collections import Counter

# -------------------------------
# 1. TEXT PREPROCESSING
# -------------------------------
def preprocess_text(text):
    """Normalise raw text for n-gram modelling.

    The text is lowercased, every character that is not ``a``-``z`` or
    whitespace is deleted (not replaced by a space, so ``"well-known"``
    becomes ``"wellknown"``), and each run of whitespace is collapsed to a
    single space with leading/trailing whitespace trimmed.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercase string.
    """
    lowered = text.lower()

    # Drop digits, punctuation and any other non-letter, non-space character
    letters_only = re.sub(r'[^a-z\s]', '', lowered)

    # Squeeze whitespace runs down to one space, then trim both ends
    return re.sub(r'\s+', ' ', letters_only).strip()


# -------------------------------
# 2. TOKENIZATION
# -------------------------------
def tokenize(text):
    """Split *text* into a list of whitespace-separated tokens.

    Args:
        text: The string to split.

    Returns:
        A list of tokens; empty if *text* is empty or only whitespace.
    """
    words = text.split()
    return words


# -------------------------------
# 3. N-GRAM GENERATION
# -------------------------------
def generate_ngrams(tokens, n):
    """Return every contiguous window of *n* tokens as a tuple.

    Args:
        tokens: A sliceable sequence of tokens.
        n: The window size (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list of tuples in order of appearance; empty when *tokens* holds
        fewer than *n* items.
    """
    # Number of start positions that still leave room for a full window
    window_count = len(tokens) - n + 1
    return [tuple(tokens[start:start + n]) for start in range(window_count)]


# -------------------------------
# 4. N-GRAM PROBABILITY
# -------------------------------
def calculate_ngram_probabilities(ngrams, lower_ngrams):
    """Estimate conditional n-gram probabilities from raw counts.

    Each distinct n-gram gets ``count(ngram) / count(prefix)``, where the
    prefix is the n-gram without its last token: ``(w1, w2)`` for a trigram
    or ``(w1,)`` for a bigram.

    Args:
        ngrams: Iterable of n-gram tuples.
        lower_ngrams: Iterable of (n-1)-gram tuples used to count prefixes.
            Every prefix of *ngrams* must occur in it; a missing prefix
            counts as 0 and the division raises ``ZeroDivisionError``.

    Returns:
        A dict mapping each distinct n-gram to its estimated probability.
    """
    full_counts = Counter(ngrams)
    prefix_counts = Counter(lower_ngrams)

    return {
        gram: occurrences / prefix_counts[gram[:-1]]
        for gram, occurrences in full_counts.items()
    }


# -------------------------------
# 5. PERPLEXITY CALCULATION
# -------------------------------
def calculate_perplexity(ngrams, probabilities):
    """Compute the perplexity of an n-gram sequence under a model.

    Perplexity is ``exp(-(1/N) * sum(log p(ngram)))`` over the N n-grams.

    Args:
        ngrams: Sequence of n-gram tuples to evaluate.
        probabilities: Dict mapping n-gram tuples to their probabilities.

    Returns:
        The perplexity as a float. An empty *ngrams* sequence (e.g. a text
        with fewer tokens than the n-gram order) yields ``float("inf")``
        rather than a division by zero, since there is nothing to score.
    """
    floor = 1e-10  # stand-in probability so log() never sees 0

    N = len(ngrams)
    if N == 0:
        return float("inf")

    log_prob_sum = 0.0
    for ngram in ngrams:
        prob = probabilities.get(ngram, floor)
        # A stored probability of 0 would bypass the .get() default and
        # make math.log raise, so it gets the same floor as an unseen n-gram.
        if prob <= 0:
            prob = floor
        log_prob_sum += math.log(prob)

    return math.exp(-log_prob_sum / N)


# -------------------------------
# 6. MAIN EXECUTION
# -------------------------------
text = "Data science is fun and data science is powerful"

# Clean the raw sentence, then split it into word tokens
clean_text = preprocess_text(text)
tokens = tokenize(clean_text)

# Build every n-gram list the two models need in one place.
# Unigrams are the bigram model's prefixes; bigrams are the trigram model's.
unigrams = generate_ngrams(tokens, 1)
bigrams = generate_ngrams(tokens, 2)
trigrams = generate_ngrams(tokens, 3)
bigram_prefixes = list(bigrams)

# ---------- BIGRAM MODEL ----------
bigram_probs = calculate_ngram_probabilities(bigrams, unigrams)
bigram_perplexity = calculate_perplexity(bigrams, bigram_probs)

# ---------- TRIGRAM MODEL ----------
trigram_probs = calculate_ngram_probabilities(trigrams, bigram_prefixes)
trigram_perplexity = calculate_perplexity(trigrams, trigram_probs)


# -------------------------------
# 7. OUTPUT
# -------------------------------
# Preprocessing results first, then each model's n-grams and perplexity
print("Clean Text:", clean_text)
print("Tokens:", tokens)

print("\nBigrams:", bigrams)
print("Bigram Perplexity:", bigram_perplexity)

print("\nTrigrams:", trigrams)
print("Trigram Perplexity:", trigram_perplexity)


Clean Text: data science is fun and data science is powerful
Tokens: ['data', 'science', 'is', 'fun', 'and', 'data', 'science', 'is', 'powerful']

Bigrams: [('data', 'science'), ('science', 'is'), ('is', 'fun'), ('fun', 'and'), ('and', 'data'), ('data', 'science'), ('science', 'is'), ('is', 'powerful')]
Bigram Perplexity: 1.189207115002721

Trigrams: [('data', 'science', 'is'), ('science', 'is', 'fun'), ('is', 'fun', 'and'), ('fun', 'and', 'data'), ('and', 'data', 'science'), ('data', 'science', 'is'), ('science', 'is', 'powerful')]
Trigram Perplexity: 1.2190136542044754
