In [None]:
import re

In [None]:
from collections import Counter,defaultdict

In [None]:
def tokenize(text):
  """
  Splits the input text into lowercase word tokens.

  The text is lowercased and every maximal run of word characters
  (letters, digits, underscore) becomes one token; punctuation and
  whitespace only act as separators.

  Args:
    text: Input string.

  Returns:
    List of lowercase tokens in order of appearance; empty when the text
    contains no word characters.
  """
  # Extract the word runs directly. Deleting the non-word characters instead
  # would also delete the spaces and fuse the whole text into a single token.
  return re.findall(r'\w+',text.lower())

In [None]:
def generate_ngrams(tokens,n):
  """
  Generates n-grams from the input tokens.

  Args:
    tokens: Sequence of string tokens (must support slicing).
    n: Number of consecutive tokens per n-gram.

  Returns:
    List of n-grams in order of appearance, each one a single string with its
    tokens separated by one space (e.g. 'python is'). Empty when there are
    fewer than n tokens or when n < 1.
  """
  # zip over the token list shifted by 0..n-1 positions; zip stops at the
  # shortest shift, which yields every full window of n consecutive tokens.
  windows=zip(*(tokens[i:] for i in range(n)))
  # Join with a space so word boundaries survive: without a separator,
  # ('ab','c') and ('a','bc') would both collapse to 'abc'.
  return [' '.join(window) for window in windows]

In [None]:
def laplace_smoothing(training_text,n=2):
  """
  Builds an add-one (Laplace) smoothed probability table over n-grams.

  Each n-gram is treated as one outcome of a single (joint) distribution:
  P(ngram) = (count(ngram) + 1) / (N + V**n), where N is the number of
  n-grams in the training text and V is the number of distinct tokens.
  These are not conditional probabilities P(w_n | w_1..w_{n-1}).

  Args:
    training_text: Raw text to estimate the counts from.
    n: Order of the n-grams (2 = bigrams). Must be at least 1.

  Returns:
    defaultdict mapping each n-gram seen in training to its smoothed
    probability; any other key gets the unseen probability 1 / (N + V**n).
    Note that looking up a missing key stores it in the table. If the
    training text yields no tokens the denominator is 0, so looking up a
    missing key raises ZeroDivisionError.

  Raises:
    ValueError: If n is smaller than 1.
  """
  if n<1:
    raise ValueError("n must be at least 1")

  tokens=tokenize(training_text)
  ngram_counts=Counter(generate_ngrams(tokens,n))
  vocab_size=len(set(tokens))
  total_ngrams=sum(ngram_counts.values())

  # Add-one smoothing adds one pseudo-count per possible outcome. The outcomes
  # here are n-grams, and a vocabulary of V tokens allows V**n of them; adding
  # only V would make the probabilities sum to more than 1 for n >= 2.
  denominator=total_ngrams+vocab_size**n

  # The division stays inside the factory so an empty training text does not
  # fail until an unseen n-gram is actually looked up.
  smoothed_probs=defaultdict(lambda:1/denominator)
  for ngram,count in ngram_counts.items():
    smoothed_probs[ngram]=(count+1)/denominator
  return smoothed_probs

In [None]:
def get_smoothed_probability(ngram,smoothed_probs):
  """
  Looks up one n-gram in a table of smoothed probabilities.

  Args:
    ngram: Key to look up.
    smoothed_probs: Mapping from n-gram to probability.

  Returns:
    The value stored under ngram. For a missing key the mapping's own
    behaviour applies: a defaultdict supplies (and stores) its default,
    a plain dict raises KeyError.
  """
  probability=smoothed_probs[ngram]
  return probability

In [None]:
# Toy training corpus for the demo.
text="I love programming in Python. Python is awesome for data science."

# Build the bigram table once; the lookup cells below read `smoothed_probs`.
smoothed_probs=laplace_smoothing(training_text=text,n=2)

In [None]:
# Word pair that appears in the training sentence ("Python is").
test_bigram='python is'
prob=get_smoothed_probability(test_bigram,smoothed_probs)
# `.6f` fixes the output at six decimal places (a bare `6f` would only set a
# minimum field width).
print(f"Probability of '{test_bigram}': {prob:.6f}")

Smoothed probability of 'python is': 1.000000


In [None]:
# Word pair that never occurs side by side in the training sentence.
test_bigram_unseen='data awesome'
prob_unseen=get_smoothed_probability(test_bigram_unseen,smoothed_probs)
# `.6f` fixes the output at six decimal places (a bare `6f` would only set a
# minimum field width).
print(f"Probability of '{test_bigram_unseen}': {prob_unseen:.6f}")

Probability of 'data awesome': 1.000000
