# In [2]:
#SarinaKasaiyan
from collections import defaultdict, Counter

class BackoffLanguageModel:
    """N-gram model that scores n-grams by relative frequency with backoff.

    An n-gram seen in training is scored as its count divided by the total
    count of all n-grams of the same order. An unseen n-gram backs off to
    progressively shorter suffixes of itself (dropping the leftmost word each
    step); every backoff step multiplies the score by ``backoff_weight``.
    """

    def __init__(self, n, backoff_weight=0.4):
        """Create an untrained model.

        Args:
            n: Maximum n-gram order to count (1 = unigram, 2 = bigram, ...).
            backoff_weight: Multiplicative discount applied once per backoff
                step when an n-gram is unseen. Defaults to 0.4.
        """
        self.n = n
        self.backoff_weight = backoff_weight
        # Maps order -> Counter of n-gram tuples of that order.
        self.ngrams = defaultdict(Counter)

    def train(self, sentences):
        """Count every n-gram of order 1..n in the given sentences.

        Args:
            sentences: Iterable of strings; each is tokenized on whitespace.
                Counts accumulate across calls.
        """
        for sentence in sentences:
            tokens = sentence.split()
            for order in range(1, self.n + 1):
                counts = self.ngrams[order]
                # The range is empty when the sentence is shorter than `order`.
                for start in range(len(tokens) - order + 1):
                    counts[tuple(tokens[start:start + order])] += 1

    def backoff_probability(self, ngram):
        """Return the backoff score of an n-gram.

        Args:
            ngram: Sequence of tokens (tuple or list) of length at most ``n``.

        Returns:
            The relative frequency of the longest suffix of ``ngram`` that was
            seen in training, multiplied by ``backoff_weight`` once for every
            word dropped. Returns 0.0 if ``ngram`` is longer than ``n``, is
            empty, or no suffix of it was seen.
        """
        ngram = tuple(ngram)  # Accept lists too; Counter keys are tuples
        order = len(ngram)

        if order > self.n:
            return 0.0

        discount = 1.0
        for current_order in range(order, 0, -1):
            # Back off by dropping words from the left, keeping the suffix
            # whose length matches the table being consulted.
            suffix = ngram[-current_order:]
            # .get avoids inserting empty Counters for orders never trained.
            table = self.ngrams.get(current_order)
            count = table[suffix] if table else 0

            if count > 0:
                return discount * count / sum(table.values())

            discount *= self.backoff_weight

        return 0.0

# Demo: fit a trigram model on a tiny corpus and query it at each order
sentences = [
    "I want to go",
    "I want to eat",
    "I want to sleep"
]

# n=3 means unigrams, bigrams and trigrams are all counted
model = BackoffLanguageModel(n=3)
model.train(sentences)

# One score per line: trigram, then bigram, then unigram
print(
    *map(
        model.backoff_probability,
        [("want", "to", "go"), ("to", "go"), ("go",)],
    ),
    sep="\n",
)

# Output:
# 0.16666666666666666
# 0.1111111111111111
# 0.08333333333333333
