# **N-Gram Model**

## Import Libraries

In [1]:
import matplotlib.pyplot as plt

## Load and Tokenize Dataset

In [2]:
import re

# Read the entire corpus file into memory as a single string.
with open('text.txt', mode='r', encoding='utf-8') as file:
    text = file.read()

# Every maximal run of word characters (letters, digits, underscore)
# becomes one token; punctuation and whitespace are dropped.
corpus = re.compile(r'\b\w+\b').findall(text)

## Lowercase Tokens and Build Vocabulary

In [2]:
# Fold case so that e.g. "The" and "the" are counted as the same token.
corpus_lower = list(map(str.lower, corpus))
# Distinct tokens only. The list is built from a set, so its order is arbitrary.
corpus_vocab = list(set(corpus_lower))

## Turning Tokens into N-Gram tuples

In [3]:
# N is the number of context words the model conditions on.
# Changing it switches the model between bigram, trigram, etc.
N = 2

# N_gram:        N-word context tuple            -> occurrence count
# N_plus_1_gram: (context + following word) tuple -> occurrence count
N_gram = {}
N_plus_1_gram = {}

# Visit every position that has N context words AND one word after them,
# so each counted context is paired with exactly one counted continuation.
for start in range(len(corpus_lower) - N):
    context = tuple(corpus_lower[start:start + N])
    context_and_next = tuple(corpus_lower[start:start + N + 1])

    # dict.get supplies 0 the first time a tuple is seen.
    N_gram[context] = N_gram.get(context, 0) + 1
    N_plus_1_gram[context_and_next] = N_plus_1_gram.get(context_and_next, 0) + 1

## Model to Predict Next Word

In [4]:
def generate_text(input_text, N, N_gram, N_plus_1_gram, corpus_vocab, num_tokens):
    """Extend ``input_text`` by greedily appending the most likely next word.

    At every step the last ``N`` words form the context, and the vocabulary
    word ``w`` maximising ``N_plus_1_gram[context + (w,)] / N_gram[context]``
    is appended. Ties are broken in favour of the greatest word in string
    comparison order.

    Lookups are exact tuple matches: no case folding or other normalisation
    is applied to ``input_text`` here, so it must be written the same way as
    the tokens the count tables were built from.

    Args:
        input_text: Seed text; split on whitespace into words.
        N: Number of trailing words used as the context (0 means no context).
        N_gram: Mapping from N-word context tuples to their counts.
        N_plus_1_gram: Mapping from (N+1)-word tuples to their counts.
        corpus_vocab: Iterable of candidate next words.
        num_tokens: Maximum number of words to append.

    Returns:
        The seed words followed by the generated words, joined by single
        spaces. Generation stops early as soon as no continuation exists
        (fewer than ``N`` words available, context not in ``N_gram``, or no
        vocabulary word continues the context), so fewer than ``num_tokens``
        words may be appended.
    """
    words = input_text.split()

    for _ in range(num_tokens):
        # Once we are stuck the state can never change, so stop instead of
        # spinning through the remaining iterations.
        if len(words) < N:
            break

        # Slice from an explicit start index: words[-N:] would return the
        # whole list when N == 0 instead of the empty context.
        context = tuple(words[len(words) - N:])
        if context not in N_gram:
            break

        candidates = [
            (N_plus_1_gram[context + (word,)], word)
            for word in corpus_vocab
            if context + (word,) in N_plus_1_gram
        ]
        # Never append a placeholder: a None in `words` would make the final
        # join raise TypeError.
        if not candidates:
            break

        # All candidates share the denominator N_gram[context], so the
        # highest count is also the highest conditional probability.
        _, next_word = max(candidates)
        words.append(next_word)

    return ' '.join(words)


## Main Testing

In [10]:
# Context length: must match the N the count tables were built with.
N = 2
# How many words to generate after the seed text.
num_tokens = 6

# Seed text the model continues from.
input_text = "since when has the president to establish a variety"

result = generate_text(
    input_text=input_text,
    N=N,
    N_gram=N_gram,
    N_plus_1_gram=N_plus_1_gram,
    corpus_vocab=corpus_vocab,
    num_tokens=num_tokens,
)
print(result)

since when has the president to establish a variety of articles in stoneware and the
