In [1]:
import nltk
from nltk.corpus import gutenberg
from nltk.util import ngrams
from nltk.probability import FreqDist, ConditionalFreqDist

In [3]:
# Make sure the Gutenberg corpus data is available locally before it is read
nltk.download("gutenberg")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\salmank\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


True

In [5]:
# Load the Gutenberg corpus as a sequence of tokenised sentences
gutenberg_corpus = gutenberg.sents()

# Flatten the sentences into a single list of lowercased tokens
gutenberg_words = [word.lower() for sentence in gutenberg_corpus for word in sentence]

# Preview only the first ten tokens: enough to sanity-check the flattening
# without dumping a long list into the cell output
gutenberg_words[:10]

['[', 'emma', 'by', 'jane', 'austen', '1816', ']', 'volume', 'i', 'chapter']

In [8]:

# N-gram order: 3 builds trigrams (use 2 for bigrams, 4 for 4-grams, ...)
n = 3

# Laplace (add-one) smoothing constant
smoothing_constant = 1

# Materialise every window of n consecutive tokens from the corpus
n_grams = [*ngrams(gutenberg_words, n)]
n_grams[:20]

[('[', 'emma', 'by'),
 ('emma', 'by', 'jane'),
 ('by', 'jane', 'austen'),
 ('jane', 'austen', '1816'),
 ('austen', '1816', ']'),
 ('1816', ']', 'volume'),
 (']', 'volume', 'i'),
 ('volume', 'i', 'chapter'),
 ('i', 'chapter', 'i'),
 ('chapter', 'i', 'emma'),
 ('i', 'emma', 'woodhouse'),
 ('emma', 'woodhouse', ','),
 ('woodhouse', ',', 'handsome'),
 (',', 'handsome', ','),
 ('handsome', ',', 'clever'),
 (',', 'clever', ','),
 ('clever', ',', 'and'),
 (',', 'and', 'rich'),
 ('and', 'rich', ','),
 ('rich', ',', 'with')]

In [10]:
# Count how often each distinct N-gram tuple occurs in the corpus
freq_dist = FreqDist(n_grams)
# NOTE(review): this is a key lookup for the value -1, not positional indexing;
# the keys are N-gram tuples, so the recorded result is 0. Presumably a look at
# the last / most common N-gram was intended (e.g. freq_dist.most_common(10)) — confirm.
freq_dist[-1]

0

In [None]:

# Conditional frequency distribution for next-word prediction: the first n-1
# tokens of each N-gram (the context) map to counts of the token that follows.
# These are raw counts; no smoothing is applied at this step.
cfd = ConditionalFreqDist((gram[:-1], gram[-1]) for gram in n_grams)

# Function to pick the most likely next word for a given context
def generate_next_word_with_smoothing(prefix):
    """Return the most frequent word observed after ``prefix``, or ``None``.

    Args:
        prefix: Tuple holding the preceding ``n - 1`` tokens, i.e. a condition
            key of the module-level ``cfd``.

    Returns:
        The token with the highest count in ``cfd[prefix]``, or ``None`` when
        ``prefix`` was never observed or has no recorded followers.

    Note:
        No smoothed probabilities are computed. Add-k smoothing maps a count
        ``c`` to ``(c + k) / (N + k * V)``, which is increasing in ``c``, so
        the raw-count argmax returned here is also the smoothed argmax.
    """
    # Test membership BEFORE indexing: ConditionalFreqDist is a defaultdict, so
    # ``cfd[prefix]`` on an unseen context would insert an empty FreqDist, make
    # ``prefix in cfd`` true, and let ``.max()`` raise ValueError.
    if prefix not in cfd:
        return None
    followers = cfd[prefix]
    # An empty distribution may already exist from an earlier stray lookup.
    return followers.max() if followers else None

# Greedily extend a seed phrase one word at a time.
# Hard cap on length: always taking the most frequent follower is deterministic,
# so once a context repeats the loop would otherwise never terminate.
MAX_SENTENCE_LENGTH = 50
sentence = ["the", "project", "gutenberg"]
while len(sentence) < MAX_SENTENCE_LENGTH:
    # Slice from an explicit start index: sentence[-(n-1):] would return the
    # whole list when n == 1, because -0 == 0.
    context_start = max(len(sentence) - (n - 1), 0)
    next_word = generate_next_word_with_smoothing(tuple(sentence[context_start:]))
    if next_word is None:
        # No known continuation for this context: stop generating
        break
    sentence.append(next_word)

# Print the generated sentence
print("Generated Sentence with Laplace Smoothing:")
print(" ".join(sentence))
