In [34]:
from nltk import sent_tokenize, word_tokenize, ngrams
import nltk
import re
import string
from nltk.probability import FreqDist
import json

## Model Building

In [25]:
class FourGramProbabilityEstimator:
    def __init__(self):
        self.unigram_counts = {}
        self.bigram_counts = {}
        self.trigram_counts = {}
        self.fourgram_counts = {}
        
    def train(self, corpus):
        unigram_counts = FreqDist()
        bigram_counts = FreqDist()
        trigram_counts = FreqDist()
        fourgram_counts = FreqDist()

        for sentence in corpus['uni_grams']:
            for word in sentence:
                unigram_counts[word] += 1
        

        for sentence in corpus['bi_grams']:
            for bigram in sentence:
                bigram_counts[bigram] += 1

        for sentence in corpus['tri_grams']:
            for trigram in sentence:
                trigram_counts[trigram] += 1

        for sentence in corpus['four_grams']:
            for fourgram in sentence:
                fourgram_counts[fourgram] += 1

    def probability_uni(self, word):
        return self.unigram_counts.get(word, 0) / sum(self.unigram_counts.values())

    def probability_bi(self, prev_word, word):
        prev_word_count = self.unigram_counts.get(prev_word, 0)
        if prev_word_count > 0:
            bigram_key = (prev_word, word)
            return self.bigram_counts.get(bigram_key, 0) / prev_word_count
        else:
            return self.probability_uni(word)

    def probability_tri(self, prev_word2, prev_word1, word):
        prev_bigram_count = self.bigram_counts.get((prev_word2, prev_word1), 0)
        if prev_bigram_count > 0:
            trigram_key = (prev_word2, prev_word1, word)
            return self.trigram_counts.get(trigram_key, 0) / prev_bigram_count
        else:
            return self.probability_bi(prev_word1, word)

    def probability_four(self, prev_word3, prev_word2, prev_word1, word):
        prev_trigram_count = self.trigram_counts.get((prev_word3, prev_word2, prev_word1), 0)
        if prev_trigram_count > 0:
            fourgram_key = (prev_word3, prev_word2, prev_word1, word)
            return self.fourgram_counts.get(fourgram_key, 0) / prev_trigram_count
        else:
            return self.probability_tri(prev_word2, prev_word1, word)

    def probability_backoff(self, prev_word3, prev_word2, prev_word1, word):
        fourgram = (prev_word3, prev_word2, prev_word1, word)
        trigram = (prev_word2, prev_word1, word)
        bigram = (prev_word1, word)

        if self.fourgram_counts.get(fourgram, 0) > 0:
            return self.probability_four(prev_word3, prev_word2, prev_word1, word)
        elif self.trigram_counts.get(trigram, 0) > 0:
            return self.probability_tri(prev_word2, prev_word1, word)
        elif self.bigram_counts.get(bigram, 0) > 0:
            return self.probability_bi(prev_word1, word)
        else:
            return self.probability_uni(word)

    def predict_next_word(self, prev_word3, prev_word2, prev_word1):
        # Given the context of the previous three words, predict the next word
        possible_next_words = set(self.unigram_counts.keys())  # Consider all possible words
        probabilities = []

        for word in possible_next_words:
            prob = self.probability_backoff(prev_word3, prev_word2, prev_word1, word)
            probabilities.append((word, prob))

        # Sort the predictions by probability in descending order
        predictions = sorted(probabilities, key=lambda x: x[1], reverse=True)

        return predictions
    
    def continue_predicting(self, prev_word3, prev_word2, prev_word1, num_predictions=5):
        # Continue predicting the next words based on the previous context
        predictions = []

        for _ in range(num_predictions):
            predicted_words = self.predict_next_word(prev_word3, prev_word2, prev_word1)
            if predicted_words:
                next_word = predicted_words[0][0]  # Take the most probable next word
                predictions.append(next_word)
                prev_word3, prev_word2, prev_word1 = prev_word2, prev_word1, next_word

        return predictions


In [27]:
fourgram_estimator = FourGramProbabilityEstimator()
fourgram_estimator.train(data)

prev_word3 = "Computer"
prev_word2 = "Science"
prev_word1 = "is"
word = "a"

# Predict the next word given the context
predictions = fourgram_estimator.probability_backoff(prev_word3, prev_word2, prev_word1, word)
print("Predictions for the next word:", predictions)

ZeroDivisionError: division by zero

## Model Evaluation

In [58]:
def perplexity_back_off():
    return

## Text Generation