In [2]:
# Import necessary libraries
import nltk
import random
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import pandas as pd

nltk.download("punkt")
nltk.download('gutenberg')

from nltk.corpus import gutenberg
corpus = gutenberg.sents("austen-emma.txt")[:1000]  # Limit to 1,000 sentences

flat_corpus = [word.lower() for sentence in corpus for word in sentence if word.isalpha()]

# Unigram Model
unigram_counts = Counter(flat_corpus)
total_unigrams = sum(unigram_counts.values())

# Bigram Model
bigram_counts = Counter(bigrams(flat_corpus))
total_bigrams = sum(bigram_counts.values())

# Trigram Model
trigram_counts = Counter(trigrams(flat_corpus))
total_trigrams = sum(trigram_counts.values())

def predict_unigram():
    # Predict next word based on unigram probabilities (random choice weighted by frequency)
    next_word = random.choices(list(unigram_counts.keys()), weights=unigram_counts.values())[0]
    return next_word

def predict_bigram(sequence):
    # Predict next word based on the last word's bigram probabilities
    last_word = sequence[-1]
    possible_bigrams = {bigram[1]: count for bigram, count in bigram_counts.items() if bigram[0] == last_word}
    if possible_bigrams:
        next_word = random.choices(list(possible_bigrams.keys()), weights=possible_bigrams.values())[0]
    else:
        next_word = predict_unigram()  # fallback to unigram if no bigram available
    return next_word

def predict_trigram(sequence):
    # Predict next word based on the last two words' trigram probabilities
    last_bigram = tuple(sequence[-2:])
    possible_trigrams = {trigram[2]: count for trigram, count in trigram_counts.items() if trigram[:2] == last_bigram}
    if possible_trigrams:
        next_word = random.choices(list(possible_trigrams.keys()), weights=possible_trigrams.values())[0]
    else:
        next_word = predict_bigram(sequence)  # fallback to bigram if no trigram available
    return next_word



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Unnamed: 0,Sequence,Unigram,Bigram,Trigram
0,she was,hope,really,so
1,he had,usual,been,not
2,they were,nearer,in,overcome
3,it is,had,not,not
4,i think,and,of,mrs
5,you should,has,not,ever
6,we are,she,henry,privileged
7,this was,harriet,the,not
8,do you,meet,must,think
9,how can,small,the,emma


In [3]:
sample_sequences = [
    ["she", "was", "so"],
    ["he", "had", "not"],
    ["they", "were", "about"],
    ["it", "is", "a"],
    ["i", "think", "i"],
    ["you", "should", "have"],
    ["we", "are", "going"],
    ["this", "was", "tremendously"],
    ["do", "you", "love"],
    ["how", "can", "model"]
]

results = []

for sequence in sample_sequences:
    unigram_prediction = predict_unigram()
    bigram_prediction = predict_bigram(sequence)
    trigram_prediction = predict_trigram(sequence)
    results.append({
        "Sequence": " ".join(sequence),
        "Unigram": unigram_prediction,
        "Bigram": bigram_prediction,
        "Trigram": trigram_prediction
    })

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Sequence,Unigram,Bigram,Trigram
0,she was so,good,as,just
1,he had not,her,in,the
2,they were about,him,her,hartfield
3,it is a,not,sort,flatterer
4,i think i,were,said,ought
5,you should have,success,come,been
6,we are going,account,only,to
7,this was tremendously,not,little,with
8,do you love,harriet,with,he
9,how can model,miss,mr,mr


## A Simple Sentence Generator

In [4]:
import nltk.corpus

In [7]:
from nltk.corpus import gutenberg
nltk.download('gutenberg')
corpus = gutenberg.sents("austen-emma.txt")

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [30]:
coupus[2]

NameError: name 'coupus' is not defined

In [11]:
n_gram = {}

In [12]:
for sent in corpus:
    # Now lets ignore commas
    words = [word for word in sent if word[0].isalpha()]

    # lets create combinations
    for ix in range(len(words) - 1):
        try:
            n_gram[words[ix]].append(words[ix + 1])
        except KeyError as _:
            n_gram[words[ix]] = []
            n_gram[words[ix]].append(words[ix + 1])

In [16]:
n_gram['the']

['best',
 'world',
 'youngest',
 'two',
 'intimacy',
 'nominal',
 'mildness',
 'shadow',
 'power',
 'disadvantages',
 'shape',
 'wedding',
 'bride',
 'match',
 'kindness',
 'affection',
 'various',
 'intercourse',
 'last',
 'equal',
 'ways',
 'family',
 'change',
 'difference',
 'house',
 'actual',
 'friendliness',
 'next',
 'house',
 'large',
 'place',
 'origin',
 'rest',
 'advantage',
 'carriage',
 'horses',
 'poor',
 'lock',
 'door',
 'right',
 'help',
 'evening',
 'family',
 'elder',
 'wedding',
 'bye',
 'question',
 'few',
 'only',
 'wedding',
 'marriage',
 'match',
 'match',
 'right',
 'greatest',
 'world',
 'year',
 'son',
 'uncle',
 'subject',
 'day',
 'subject',
 'match',
 'last',
 'match',
 'pleasure',
 'do',
 'do',
 'same',
 'only',
 'best',
 'fish',
 'chicken',
 'last',
 'more',
 'militia',
 'chances',
 'connexion',
 'full',
 'family',
 'marriage',
 'infinite',
 'great',
 'best',
 'luxuries',
 'wife',
 'Churchills',
 'worst',
 'bargain',
 'expense',
 'child',
 'additional',

In [17]:
# now lets write a fucntion that generates a sentence from ngrams
import random

In [28]:
def generate_sentence(num_words=15):
    words = []

    # Taking the first word randomly
    next_word = random.choice(list(n_gram.keys()))
    # appending it
    words.append(next_word)

    # now loop to take next words
    while len(words) < num_words:
        # new next_word given from the list of possibilities against last word
        next_word = random.choice(n_gram[next_word])
        words.append(next_word)

    return " ".join(words)

In [29]:
generate_sentence()

'After a motto to heat themselves off and soon allowed to another said she does'