In [5]:
import nltk
from nltk import ngrams
from collections import Counter
import math

nltk.download('punkt')
nltk.download('stopwords')

# Sample text data
text = "the cat sat on the mat and the dog barked"

# Preprocess the text data
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())  # Tokenize and convert to lowercase
    tokens = [token for token in tokens if token.isalnum()]  # Remove punctuation
    tokens = [token for token in tokens if token not in nltk.corpus.stopwords.words('english')]  # Remove stopwords
    return tokens

# Create n-grams from preprocessed text
def create_ngrams(tokens, n):
    return list(ngrams(tokens, n))

# Build the language model (unigram in this example)
def build_language_model(tokens):
    unigram_counts = Counter(tokens)
    total_tokens = len(tokens)
    unigram_probs = {token: count / total_tokens for token, count in unigram_counts.items()}
    return unigram_probs

# Prepare test dataset
test_text = "the cat meowed"

# Calculate perplexity
def calculate_perplexity(test_text, language_model, n):
    test_tokens = preprocess_text(test_text)
    test_ngrams = create_ngrams(test_tokens, n)
    
    perplexity_sum = 0
    vocab_size = len(language_model)
    epsilon = 1e-10  # Small epsilon value for Laplace smoothing
    
    for ngram in test_ngrams:
        probability = (language_model.get(ngram, 0) + epsilon) / (vocab_size + epsilon * vocab_size)
        perplexity_sum += -math.log(probability)
    
    perplexity = math.exp(perplexity_sum / len(test_ngrams))
    return perplexity

# Example usage
tokens = preprocess_text(text)
language_model = build_language_model(tokens)
perplexity = calculate_perplexity(test_text, language_model, n=1)
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 50000000005.00


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
