In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from collections import defaultdict
import nltk
# Fetch the Punkt tokenizer data that word_tokenize depends on. NLTK 3.9+
# loads the 'punkt_tab' resource instead of the pickled 'punkt' models, so
# request both to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus
text = "This is a sample text corpus used to generate ngrams. ngrams are sequences of n-consecutive words."

# Tokenize the text into words. The vectorizer below tokenizes the text on its
# own, so this list is only kept for inspection.
tokens = word_tokenize(text)

# Initialize CountVectorizer for 4-grams (ngram_range=(4, 4)), using NLTK's
# word_tokenize as the tokenizer. token_pattern=None tells scikit-learn that the
# default regex pattern is intentionally unused, which avoids the
# "token_pattern will not be used since tokenizer is not None" warning.
vectorizer = CountVectorizer(
    ngram_range=(4, 4),
    tokenizer=word_tokenize,
    token_pattern=None,
)
ngram_matrix = vectorizer.fit_transform([text])

# Get the 4-gram names (features) from the vectorizer; each name is the four
# tokens joined by single spaces.
ngram_names = vectorizer.get_feature_names_out()

# Convert the count matrix into a dictionary of 4-gram frequencies. Only one
# document was fitted, so row 0 holds every count.
ngram_freq = dict(zip(ngram_names, ngram_matrix.toarray()[0]))
for ngram, freq in ngram_freq.items():
    print(f"ngram: {ngram}, Frequency: {freq}")

def predict_next_word(mgram, ngram_freq):
    """
    Predict the word that completes an n-gram beginning with the given context.

    Args:
        mgram: Sequence of context words (e.g. the first three words of a
            4-gram). An empty sequence matches every n-gram.
        ngram_freq: Mapping from space-separated n-gram strings to their
            frequencies.

    Returns:
        A ``(predicted_word, probability)`` tuple. ``predicted_word`` is the
        last word of the most frequent n-gram whose leading words equal the
        context, and ``probability`` is that n-gram's frequency divided by the
        sum of all frequencies in ``ngram_freq``. When nothing matches (or the
        frequencies sum to zero) the result is ``(None, 0)``. Ties keep the
        first matching n-gram in iteration order.
    """
    # Re-split the joined context so that an element containing spaces
    # (e.g. "sample text") is still compared word by word.
    context = ' '.join(mgram).split()
    context_len = len(context)

    max_prob = 0
    predicted_word = None

    # The normalizer does not depend on the loop variable; compute it once.
    total = sum(ngram_freq.values())
    if not total:
        # No counts at all: nothing to predict, and dividing would fail.
        return predicted_word, max_prob

    # Find the most probable n-gram that starts with the given context
    for ngram, freq in ngram_freq.items():
        words = ngram.split()
        # Compare whole tokens rather than a raw string prefix, so the context
        # "a" does not match "are ...". An n-gram no longer than the context
        # has no following word to offer and is skipped.
        if len(words) <= context_len or words[:context_len] != context:
            continue
        probability = freq / total
        if probability > max_prob:
            max_prob = probability
            predicted_word = words[-1]  # Last word of the matching n-gram

    return predicted_word, max_prob

# Example: predict the word that follows the three-word context
# "sample text corpus", using the frequency table built above.
mgram = "sample text corpus".split()
predicted_word, probability = predict_next_word(mgram=mgram, ngram_freq=ngram_freq)

# Report the context, the predicted word and its probability (two decimals).
print(f"Next word after '{' '.join(mgram)}': '{predicted_word}', Probability: {probability:.2f}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


ngram: . ngrams are sequences, Frequency: 1
ngram: a sample text corpus, Frequency: 1
ngram: are sequences of n-consecutive, Frequency: 1
ngram: corpus used to generate, Frequency: 1
ngram: generate ngrams . ngrams, Frequency: 1
ngram: is a sample text, Frequency: 1
ngram: ngrams . ngrams are, Frequency: 1
ngram: ngrams are sequences of, Frequency: 1
ngram: of n-consecutive words ., Frequency: 1
ngram: sample text corpus used, Frequency: 1
ngram: sequences of n-consecutive words, Frequency: 1
ngram: text corpus used to, Frequency: 1
ngram: this is a sample, Frequency: 1
ngram: to generate ngrams ., Frequency: 1
ngram: used to generate ngrams, Frequency: 1
Next word after 'sample text corpus': 'used', Probability: 0.07


