In [2]:
!pip install nltk python-docx

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [3]:
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import defaultdict, Counter
import re
from docx import Document
import random
import numpy as np

In [4]:
# Download the NLTK tokenizer resources into the local nltk_data directory.
# Presumably these back nltk.word_tokenize, which is used further down.
# NOTE(review): both 'punkt' and 'punkt_tab' are fetched — confirm whether the
# installed NLTK version still needs both.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Load the corpus: every paragraph of data.docx whose text is not blank
# becomes one entry, kept exactly as stored (no stripping).
doc = Document("data.docx")
sentences = [para.text for para in doc.paragraphs if para.text.strip()]

sentences


['text',
 'I absolutely loved the clean design of the application.',
 'This machine learning course explains complex ideas very clearly.',
 'Waiting for the update ruined my overall experience.',
 'The product quality exceeded my expectations by a lot.',
 'I regret buying this service due to frequent errors.',
 'The new feature improved the performance significantly.',
 'The interface feels intuitive and user friendly.',
 'Technical glitches caused repeated failures during checkout.',
 'Poor network connectivity makes the app frustrating to use.',
 'The customer support response was slow and disappointing.']

In [7]:
def preprocess(text):
    """Normalise a sentence before it is tokenised.

    The text is lower-cased, then every character that is neither a
    letter ``a``-``z`` nor whitespace is deleted (not replaced), so digits
    and punctuation disappear and ``"user-friendly"`` becomes
    ``"userfriendly"``.

    Args:
        text: Raw sentence.

    Returns:
        The cleaned, lower-case string.
    """
    lowered = text.lower()
    return re.sub(r'[^a-z\s]', '', lowered)

# Run the same normalisation over every sentence, preserving order.
processed_sentences = list(map(preprocess, sentences))
processed_sentences


['text',
 'i absolutely loved the clean design of the application',
 'this machine learning course explains complex ideas very clearly',
 'waiting for the update ruined my overall experience',
 'the product quality exceeded my expectations by a lot',
 'i regret buying this service due to frequent errors',
 'the new feature improved the performance significantly',
 'the interface feels intuitive and user friendly',
 'technical glitches caused repeated failures during checkout',
 'poor network connectivity makes the app frustrating to use',
 'the customer support response was slow and disappointing']

In [8]:

# bigram_counts[word][next_word] -> number of times next_word directly
# followed word inside a sentence (pairs never span two sentences).
bigram_counts = defaultdict(Counter)

for sentence in processed_sentences:
    for first_word, following_word in ngrams(nltk.word_tokenize(sentence), 2):
        bigram_counts[first_word][following_word] += 1


In [9]:
def predict_next_word(current_word, n_predictions=3, counts=None):
    """Return the most likely next words after ``current_word``.

    Args:
        current_word: Word to look up. Letter case and surrounding
            whitespace are ignored.
        n_predictions: Maximum number of candidates to return. Values
            below 1 yield an empty list for a known word.
        counts: Optional mapping ``word -> {next_word: count}`` to query.
            When omitted, the notebook-level ``bigram_counts`` is used.

    Returns:
        A list of ``(next_word, probability)`` tuples ordered from most to
        least frequent; equally frequent words keep the order in which
        they were first counted. If the word is unknown, or has no
        recorded followers, the single sentinel entry
        ``[("No prediction found", 0)]`` is returned instead.
    """
    if counts is None:
        counts = bigram_counts

    lookup = current_word.strip().lower()

    # .get() never inserts a key, even when `counts` is a defaultdict.
    followers = counts.get(lookup)
    total_count = sum(followers.values()) if followers else 0

    # An entry without followers (e.g. an empty Counter that was auto-created
    # by indexing the defaultdict elsewhere) is treated like an unknown word,
    # so callers that read preds[0] always have something to read.
    if total_count <= 0:
        return [("No prediction found", 0)]

    # A negative slice bound would silently drop the *last* items instead of
    # returning nothing, so handle non-positive requests explicitly.
    if n_predictions < 1:
        return []

    # Ranking by raw count gives the same order as ranking by probability,
    # because every count is divided by the same total.
    ranked = sorted(followers.items(), key=lambda item: item[1], reverse=True)

    return [(word, count / total_count) for word, count in ranked[:n_predictions]]


In [10]:
# Spot-check the model on a few seed words and print the ranked candidates.
test_words = ["i", "the", "this", "waiting", "product"]
divider = "-"*50

for word in test_words:
    print(f"Input Word: '{word}'")
    preds = predict_next_word(word)
    for next_word, prob in preds:
        print(f"→ Predicted Next Word: '{next_word}' | Probability: {prob:.2f}")
    print(divider)


Input Word: 'i'
→ Predicted Next Word: 'absolutely' | Probability: 0.50
→ Predicted Next Word: 'regret' | Probability: 0.50
--------------------------------------------------
Input Word: 'the'
→ Predicted Next Word: 'clean' | Probability: 0.11
→ Predicted Next Word: 'application' | Probability: 0.11
→ Predicted Next Word: 'update' | Probability: 0.11
--------------------------------------------------
Input Word: 'this'
→ Predicted Next Word: 'machine' | Probability: 0.50
→ Predicted Next Word: 'service' | Probability: 0.50
--------------------------------------------------
Input Word: 'waiting'
→ Predicted Next Word: 'for' | Probability: 1.00
--------------------------------------------------
Input Word: 'product'
→ Predicted Next Word: 'quality' | Probability: 1.00
--------------------------------------------------


In [11]:
def generate_sequence(start_word, length=5):
    """Greedily extend ``start_word`` with the most probable next word.

    Args:
        start_word: First word of the sequence; it is emitted as given.
        length: Maximum number of words to append after ``start_word``.

    Returns:
        The generated words joined by single spaces. Generation stops
        early as soon as no prediction is available for the current word.
    """
    sequence = [start_word]
    current_word = start_word

    for _ in range(length):
        # Only the best candidate is used, so ask for just one.
        preds = predict_next_word(current_word, n_predictions=1)

        # An empty result and the sentinel entry both mean there is nothing
        # to append; checking for emptiness first avoids an IndexError.
        if not preds or preds[0][0] == "No prediction found":
            break

        current_word = preds[0][0]
        sequence.append(current_word)

    return " ".join(sequence)

# Demo: greedy continuation starting from "i", appending at most five words.
generate_sequence(start_word="i", length=5)


'i absolutely loved the clean design'

In [12]:
def predict_next_word_single(word, n_predictions=3):
    """Predict the next word for a single input word.

    Kept as a thin wrapper for existing callers: the lookup itself is
    delegated to ``predict_next_word`` so the ranking logic lives in one
    place instead of being duplicated here.

    Args:
        word: Word to look up. Surrounding whitespace is removed here;
            lower-casing is done by ``predict_next_word``.
        n_predictions: Maximum number of candidates to return.

    Returns:
        Whatever ``predict_next_word`` returns: a list of
        ``(next_word, probability)`` tuples, or
        ``[("No prediction found", 0)]`` when the word is unknown.
    """
    return predict_next_word(word.strip(), n_predictions)


In [13]:
# Interactive demo. NOTE: input() blocks "Run All" until a word is typed.
# The raw input is stripped so stray spaces do not turn a known word into
# "No prediction found".
user_word = input("Enter a word: ").strip()
preds = predict_next_word_single(user_word)

print(f"Predictions for '{user_word}':")
for w, prob in preds:
    print(f"→ {w} (Probability: {prob:.2f})")


Enter a word: my
Predictions for 'my':
→ overall (Probability: 0.50)
→ expectations (Probability: 0.50)


In [None]:
# One flat list of (word, next_word) pairs across the whole corpus,
# in sentence order.
all_bigrams = [
    pair
    for sentence in processed_sentences
    for pair in ngrams(nltk.word_tokenize(sentence), 2)
]

all_bigrams[:10]  # preview of the first ten pairs


[('i', 'absolutely'),
 ('absolutely', 'loved'),
 ('loved', 'the'),
 ('the', 'clean'),
 ('clean', 'design'),
 ('design', 'of'),
 ('of', 'the'),
 ('the', 'application'),
 ('the', 'customer'),
 ('customer', 'support')]

In [None]:
# Top-1 accuracy, measured on the same bigrams the model was built from:
# a hit is counted when the single best candidate equals the true next word.
total = len(all_bigrams)
correct = sum(
    predict_next_word_single(w1, n_predictions=1)[0][0] == actual_next
    for w1, actual_next in all_bigrams
)

accuracy = correct / total
print(f"Next Word Prediction Accuracy: {accuracy*100:.2f}%")


Next Word Prediction Accuracy: 81.94%


In [None]:
# Top-3 accuracy: a hit is counted when the true next word appears anywhere
# among the three best candidates. `total` comes from the top-1 cell.
correct_top3 = sum(
    any(candidate == actual_next
        for candidate, _ in predict_next_word_single(w1, n_predictions=3))
    for w1, actual_next in all_bigrams
)

top3_accuracy = correct_top3 / total
print(f"Top-3 Accuracy: {top3_accuracy*100:.2f}%")


Top-3 Accuracy: 91.67%


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the flat list of (word, next_word) pairs so this cell does not
# depend on the earlier evaluation cells.
all_bigrams = [
    pair
    for sentence in processed_sentences
    for pair in ngrams(nltk.word_tokenize(sentence), 2)
]

# One record per bigram: what was predicted, how confident, and whether the
# true next word was the best guess or among the three best guesses.
eval_data = []

for w1, actual_next in all_bigrams:
    preds = predict_next_word(w1, n_predictions=3)
    top_word, top_prob = preds[0]
    candidates = [word for word, _ in preds]

    eval_data.append({
        "Input Word": w1,
        "Actual Next": actual_next,
        "Top-1 Prediction": top_word,
        "Top-1 Probability": top_prob,
        "Top-3 Predictions": candidates,
        # Confidence is defined as the probability of the top-1 word.
        "Confidence": top_prob,
        "Top-1 Correct": int(top_word == actual_next),
        "Top-3 Correct": int(actual_next in candidates),
    })

eval_df = pd.DataFrame(eval_data)
eval_df


Unnamed: 0,Input Word,Actual Next,Top-1 Prediction,Top-1 Probability,Top-3 Predictions,Confidence,Top-1 Correct,Top-3 Correct
0,i,absolutely,absolutely,0.500000,"[absolutely, regret]",0.500000,1,1
1,absolutely,loved,loved,1.000000,[loved],1.000000,1,1
2,loved,the,the,1.000000,[the],1.000000,1,1
3,the,clean,clean,0.111111,"[clean, application, customer]",0.111111,1,1
4,clean,design,design,1.000000,[design],1.000000,1,1
...,...,...,...,...,...,...,...,...
67,glitches,caused,caused,1.000000,[caused],1.000000,1,1
68,caused,repeated,repeated,1.000000,[repeated],1.000000,1,1
69,repeated,failures,failures,1.000000,[failures],1.000000,1,1
70,failures,during,during,1.000000,[during],1.000000,1,1


In [None]:
# Aggregate the per-bigram flags into headline numbers (column means).
top1_accuracy = eval_df["Top-1 Correct"].mean()
top3_accuracy = eval_df["Top-3 Correct"].mean()
avg_confidence = eval_df["Confidence"].mean()

for label, value in (
    ("Top-1 Accuracy", top1_accuracy),
    ("Top-3 Accuracy", top3_accuracy),
    ("Average Confidence", avg_confidence),
):
    print(f"{label}: {value*100:.2f}%")


Top-1 Accuracy: 81.94%
Top-3 Accuracy: 91.67%
Average Confidence: 81.94%
