<a href="https://colab.research.google.com/github/NipunRaj96/DeepLearning/blob/main/bigram_nextWordPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.util import ngrams
from collections import defaultdict
import random
# Fetch the Punkt tokenizer tables (the recorded output shows them unpacked under
# tokenizers/); presumably required by nltk.word_tokenize used further down.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Tiny three-line toy corpus used for the step-by-step walkthrough below.
text= """Hello, how are you? I am doing well.
How are you today? I am good.
How are you? I am unwell."""
print(text)

Hello, how are you? I am doing well.
How are you today? I am good.
How are you? I am unwell.


In [None]:
# Normalise case so that "How" and "how" end up as the same token.
# Note: this rebinds `text`; the original mixed-case string is no longer available.
text= text.lower()
text

'hello, how are you? i am doing well.\nhow are you today? i am good.\nhow are you? i am unwell.'

In [None]:
# Split the text into tokens. The lowercase result comes from the previous
# cell's text.lower(), not from the tokenizer itself. As the output shows,
# punctuation such as ',' '?' and '.' becomes its own token.
tokens = nltk.word_tokenize(text)
print(tokens)

['hello', ',', 'how', 'are', 'you', '?', 'i', 'am', 'doing', 'well', '.', 'how', 'are', 'you', 'today', '?', 'i', 'am', 'good', '.', 'how', 'are', 'you', '?', 'i', 'am', 'unwell', '.']


In [None]:
n=2  # n-gram order; 2 gives bigrams (pairs of adjacent tokens)
ngrams_list = list(ngrams(tokens, n))  # Materialise every run of n consecutive tokens as a tuple.
print(ngrams_list)

[('hello', ','), (',', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'doing'), ('doing', 'well'), ('well', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', 'today'), ('today', '?'), ('?', 'i'), ('i', 'am'), ('am', 'good'), ('good', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'unwell'), ('unwell', '.')]


In [None]:
# Nested dictionary initialization using defaultdict.
# The outer defaultdict builds a fresh defaultdict(int) the first time a key is
# accessed; the inner one makes every missing entry start at 0.
model = defaultdict(lambda: defaultdict(int))

In [None]:
# Word-frequency demo: defaultdict(int) returns 0 for a missing key, so the
# increment needs no membership check first. Keys are case-sensitive, which is
# why 'hello' and 'Hello' are counted separately.
l= ['hello', 'Hello', 'hi', 'hello', 'bye']
d= defaultdict(int)
for word in l:
  d[word]+=1
d

defaultdict(int, {'hello': 2, 'Hello': 1, 'hi': 1, 'bye': 1})

In [None]:
# Same counting pattern on a second list: each word maps to the number of
# times it occurs, with first-seen order preserved in the resulting dict.
a= ['hello', 'hello', 'hi', 'are', 'you', 'you']
b= defaultdict(int)
for word in a:
  b[word]+=1
b

defaultdict(int, {'hello': 2, 'hi': 1, 'are': 1, 'you': 2})

In [None]:
# Re-display the bigram tuples for reference next to the defaultdict explanation.
print(ngrams_list)

[('hello', ','), (',', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'doing'), ('doing', 'well'), ('well', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', 'today'), ('today', '?'), ('?', 'i'), ('i', 'am'), ('am', 'good'), ('good', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'unwell'), ('unwell', '.')]



```
from collections import defaultdict
bigram_model = defaultdict(lambda: defaultdict(int))

## Add a bigram ('hello', 'world')
bigram_model['hello']['world'] += 1

What Happens Internally?
bigram_model['hello'] does not exist yet → so the outer defaultdict calls its factory and creates a new defaultdict(int) for that key.

bigram_model = {'hello': defaultdict(int)}

bigram_model['hello']['world'] does not exist either →
so the inner defaultdict(int) initializes it to 0, and `+= 1` then increments it to 1

bigram_model['hello']['world'] = 1

--- without using defaultdict---

bigram_model = {}
The line below would raise a KeyError, because the key 'hello' has not been initialized in the plain dict:

bigram_model['hello']['world'] += 1

Instead, we’d need to manually check and initialize:

if 'hello' not in bigram_model:
    bigram_model['hello'] = {}

if 'world' not in bigram_model['hello']:
    bigram_model['hello']['world'] = 0

bigram_model['hello']['world'] += 1

Using defaultdict eliminates this manual initialization, making the code cleaner.
```




In [None]:
# Show the bigram tuples once more; they are the input to the counting loop in the next cell.
print(ngrams_list)

[('hello', ','), (',', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'doing'), ('doing', 'well'), ('well', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', 'today'), ('today', '?'), ('?', 'i'), ('i', 'am'), ('am', 'good'), ('good', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'unwell'), ('unwell', '.')]


In [None]:
# Count, for every history (all tokens of the n-gram except the last, kept as a
# tuple so it can be a dict key), how often each following token was observed.
model = defaultdict(lambda: defaultdict(int))
for pair in ngrams_list:
    history, pred = pair[:-1], pair[-1]
    model[history][pred] += 1

model


defaultdict(<function __main__.<lambda>()>,
            {('hello',): defaultdict(int, {',': 1}),
             (',',): defaultdict(int, {'how': 1}),
             ('how',): defaultdict(int, {'are': 3}),
             ('are',): defaultdict(int, {'you': 3}),
             ('you',): defaultdict(int, {'?': 2, 'today': 1}),
             ('?',): defaultdict(int, {'i': 3}),
             ('i',): defaultdict(int, {'am': 3}),
             ('am',): defaultdict(int, {'doing': 1, 'good': 1, 'unwell': 1}),
             ('doing',): defaultdict(int, {'well': 1}),
             ('well',): defaultdict(int, {'.': 1}),
             ('.',): defaultdict(int, {'how': 2}),
             ('today',): defaultdict(int, {'?': 1}),
             ('good',): defaultdict(int, {'.': 1}),
             ('unwell',): defaultdict(int, {'.': 1})})

In [None]:
# Rebuild the count table from scratch (same construction as the previous cell).
# The lambda is the outer default factory: it creates a fresh defaultdict(int)
# the first time a new history key is accessed.
model= defaultdict(lambda: defaultdict(int))
for pair in ngrams_list:
  history= pair[:-1]  # every token but the last, as a tuple (usable as a dict key)
  predictions= pair[-1]  # the token that followed this history
  model[history][predictions]+= 1

In [None]:
# Print the model with standard dictionaries.
# Note: the loop rebinds the notebook-level names `history` and `predictions`.
for history, predictions in model.items():
    print(history, dict(predictions))  # dict(...) drops the defaultdict wrapper from the printed repr

('hello',) {',': 1}
(',',) {'how': 1}
('how',) {'are': 3}
('are',) {'you': 3}
('you',) {'?': 2, 'today': 1}
('?',) {'i': 3}
('i',) {'am': 3}
('am',) {'doing': 1, 'good': 1, 'unwell': 1}
('doing',) {'well': 1}
('well',) {'.': 1}
('.',) {'how': 2}
('today',) {'?': 1}
('good',) {'.': 1}
('unwell',) {'.': 1}


In [None]:
# Convert raw counts into relative frequencies: each count is divided by the
# total count for its history, so the values under one history sum to 1.
# This overwrites `model` in place; the raw counts are gone after this cell.
for k, v in model.items():
  count= float(sum(v.values()))  # total number of observations for this history
  for ke, va in v.items():
    model[k][ke] = va / count  # reassigns an existing key, so the dict size does not change mid-iteration

# Print the model with standard dictionaries
for history, predictions in model.items():
    print(history, dict(predictions)) #Convert the defaultdict to a regular dict.

In [None]:
# Inspect the distribution over tokens that follow "you" (keys are 1-tuples).
# Caution: `model` is a defaultdict, so indexing a history that was never seen
# would insert an empty entry rather than raise KeyError.
model[('you',)]

defaultdict(int, {'?': 0.6666666666666666, 'today': 0.3333333333333333})

In [None]:
# Sample one next word for a given history, weighted by the learned probabilities.
# (`random` is already imported in the notebook's first cell.)
history = ['you']
history = tuple(history)  # Model keys are tuples, so convert the list before the lookup.
if history in model:  # `in` does not trigger the defaultdict factory, so an unseen history is not inserted.
    probabilities = model[history]  # Mapping of next word -> probability for this history.
    words = list(probabilities.keys())  # Candidate next words.
    probs = list(probabilities.values())  # Their probabilities, in the same order as `words`.
    # An expression nested inside an `if` block is not echoed by the notebook,
    # so the sampled word has to be printed explicitly to be visible.
    print(random.choices(words, weights=probs)[0])
else:
    print(" ")



In [None]:
def train_bigram_model(text, n=2):
    """
    Build an n-gram next-word model from raw text.

    The text is lower-cased, tokenized with NLTK and cut into n-grams. Within
    each n-gram the leading n-1 tokens act as the history and the final token
    is the word being predicted. The token list and the n-gram list are
    printed as a side effect.

    Parameters:
        text (str): Input text.
        n (int): N-gram size (default=2 for bigrams).

    Returns:
        model (defaultdict): Maps each history tuple to a defaultdict of
            {next_word: probability}, where the probability is the relative
            frequency of that word among all words seen after the history.
    """
    # Lower-case first, then tokenize
    tokens = nltk.word_tokenize(text.lower())
    print(f"Tokens: {tokens}")

    # Sliding window of n consecutive tokens
    ngrams_list = list(ngrams(tokens, n))
    print(f"N-Grams: {ngrams_list}")

    # Pass 1: tally how often each word follows each history
    model = defaultdict(lambda: defaultdict(int))
    for gram in ngrams_list:
        context, next_word = gram[:-1], gram[-1]
        model[context][next_word] += 1

    # Pass 2: scale every tally by its history's total to get probabilities.
    # Only existing keys are reassigned, so iterating the dict here is safe.
    for followers in model.values():
        total = float(sum(followers.values()))
        for word in followers:
            followers[word] /= total

    return model

def predict_next_word(model, history):
    """
    Sample the next word for the given history.

    Parameters:
        model (dict): Mapping of history tuple -> {next_word: weight}, for
            example the model returned by train_bigram_model.
        history (list | tuple | str): Words representing the context. A bare
            string is treated as a single word rather than split into characters.

    Returns:
        str: A next word drawn at random in proportion to its weight, or an
            empty string if the history is unseen or has no recorded followers.
    """
    # tuple("you") would give ('y', 'o', 'u'), which can never match a word-level key.
    if isinstance(history, str):
        history = (history,)
    else:
        history = tuple(history)  # Model keys are tuples (hashable).

    # Membership test does not trigger a defaultdict factory, so probing an
    # unseen history leaves the model untouched.
    if history not in model:
        return ""

    probabilities = model[history]
    # An empty entry can exist if someone indexed a defaultdict model with an
    # unseen key earlier; random.choices would raise on an empty population.
    if not probabilities:
        return ""

    words = list(probabilities.keys())  # Possible next words
    probs = list(probabilities.values())  # Weights, in the same order as `words`
    return random.choices(words, weights=probs)[0]

# Sample text input
text = """Hello, how are you? I am doing well.
How are you today? I am good.
How are you? I am unwell."""

# Train the bigram model (this also prints the tokens and the n-gram list)
bigram_model = train_bigram_model(text)

# Predict the next word.
# The word is sampled at random and no seed is set in this cell, so the
# printed prediction can differ from one run to the next.
history = ["you"]
predicted_word = predict_next_word(bigram_model, history)

print(f"\nGiven history: {history}, Predicted next word: {predicted_word}")


Tokens: ['hello', ',', 'how', 'are', 'you', '?', 'i', 'am', 'doing', 'well', '.', 'how', 'are', 'you', 'today', '?', 'i', 'am', 'good', '.', 'how', 'are', 'you', '?', 'i', 'am', 'unwell', '.']
N-Grams: [('hello', ','), (',', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'doing'), ('doing', 'well'), ('well', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', 'today'), ('today', '?'), ('?', 'i'), ('i', 'am'), ('am', 'good'), ('good', '.'), ('.', 'how'), ('how', 'are'), ('are', 'you'), ('you', '?'), ('?', 'i'), ('i', 'am'), ('am', 'unwell'), ('unwell', '.')]

Given history: ['you'], Predicted next word: ?
