In [None]:
import nltk
from nltk.util import ngrams
from nltk import word_tokenize
from collections import defaultdict, Counter
import math

# Download the 'punkt' resource through the NLTK downloader
# (presumably the tokenizer data that word_tokenize relies on — confirm for the installed NLTK version).
nltk.download('punkt')



In [None]:
# Download the 'punkt_tab' resource as well.
# NOTE(review): newer NLTK releases appear to look up this resource name for tokenization — confirm against the installed version.
nltk.download('punkt_tab')

In [None]:
# Toy training corpus: three short sentences about NLP.
# Every n-gram count in this notebook is derived from this text.
text = """
Natural language processing is a subfield of artificial intelligence.
It deals with the interaction between computers and humans using language.
NLP helps machines understand, interpret, and generate human language.
"""

# Lowercase first so that counts are case-insensitive, then tokenize.
# The whole text is tokenized in a single call, so `tokens` covers all three sentences together.
tokens = word_tokenize(text.lower())


In [None]:
# Build the n-gram sequences of order 1, 2 and 3 over the full token stream
unigrams, bigrams, trigrams = (list(ngrams(tokens, n)) for n in (1, 2, 3))

# Tally how often each n-gram tuple occurs
unigram_freq, bigram_freq, trigram_freq = map(Counter, (unigrams, bigrams, trigrams))

# Vocabulary = distinct tokens; V is its size (passed to the smoothed probability functions)
vocab = set(tokens)
V = len(vocab)

# Report the five most frequent n-grams of each order
print("Top 5 Unigrams:", unigram_freq.most_common(5))
print("Top 5 Bigrams:", bigram_freq.most_common(5))
print("Top 5 Trigrams:", trigram_freq.most_common(5))


In [None]:
def unigram_prob(word, unigram_freq, V):
    """Return the add-one (Laplace) smoothed unigram probability of `word`.

    Args:
        word: Token to score.
        unigram_freq: Mapping from 1-tuples ``(token,)`` to occurrence counts.
        V: Vocabulary size, added to the denominator.

    Returns:
        ``(count(word) + 1) / (sum of all counts + V)``.
    """
    smoothed_count = unigram_freq[(word,)] + 1
    total_tokens = sum(unigram_freq.values())
    return smoothed_count / (total_tokens + V)

def bigram_prob(w1, w2, bigram_freq, unigram_freq, V):
    """Return the add-one (Laplace) smoothed probability of `w2` following `w1`.

    Args:
        w1: Context (preceding) token.
        w2: Token being predicted.
        bigram_freq: Mapping from 2-tuples ``(w1, w2)`` to occurrence counts.
        unigram_freq: Mapping from 1-tuples ``(token,)`` to occurrence counts.
        V: Vocabulary size, added to the denominator.

    Returns:
        ``(count(w1, w2) + 1) / (count(w1) + V)``.
    """
    pair_count = bigram_freq[(w1, w2)]
    context_count = unigram_freq[(w1,)]
    return (pair_count + 1) / (context_count + V)

def trigram_prob(w1, w2, w3, trigram_freq, bigram_freq, V):
    """Return the add-one (Laplace) smoothed probability of `w3` following `w1 w2`.

    Args:
        w1: First context token.
        w2: Second context token.
        w3: Token being predicted.
        trigram_freq: Mapping from 3-tuples ``(w1, w2, w3)`` to occurrence counts.
        bigram_freq: Mapping from 2-tuples ``(w1, w2)`` to occurrence counts.
        V: Vocabulary size, added to the denominator.

    Returns:
        ``(count(w1, w2, w3) + 1) / (count(w1, w2) + V)``.
    """
    triple_count = trigram_freq[(w1, w2, w3)]
    context_count = bigram_freq[(w1, w2)]
    return (triple_count + 1) / (context_count + V)


In [None]:
def compute_bigram_sentence_prob(sentence):
    """Print the smoothed bigram probability of a sentence, step by step.

    The sentence is lowercased and tokenized; each adjacent token pair is then
    scored with `bigram_prob` using the notebook-level `bigram_freq`,
    `unigram_freq` and `V`. Each conditional probability is printed, followed
    by the product of all of them. Nothing is returned.

    Args:
        sentence: Raw sentence string to score.
    """
    words = word_tokenize(sentence.lower())
    running_product = 1.0
    for prev_word, next_word in ngrams(words, 2):
        cond_prob = bigram_prob(prev_word, next_word, bigram_freq, unigram_freq, V)
        print(f"P({next_word}|{prev_word}) = {cond_prob:.4f}")
        running_product *= cond_prob
    print(f"\nTotal Sentence Probability: {running_product:.10f}")

# Demo: score a phrase that occurs verbatim in the first sentence of `text`.
compute_bigram_sentence_prob("language processing is a subfield")


In [None]:
def log_prob_sentence_trigram(sentence):
    """Print the smoothed trigram log-probability of a sentence, step by step.

    The sentence is lowercased and tokenized; each consecutive token triple is
    scored with `trigram_prob` using the notebook-level `trigram_freq`,
    `bigram_freq` and `V`. The natural log of each conditional probability is
    printed, followed by the sum of those logs.

    Args:
        sentence: Raw sentence string to score.
    """
    words = word_tokenize(sentence.lower())
    log_prob = 0.0
    for first, second, third in ngrams(words, 3):
        # Take the log once and reuse it for both the running total and the trace line.
        step_log = math.log(trigram_prob(first, second, third, trigram_freq, bigram_freq, V))
        log_prob += step_log
        print(f"log P({third}|{first} {second}) = {step_log:.4f}")
    print(f"\nTotal Log Probability: {log_prob:.4f}")
