In [17]:
import nltk
from nltk.util import ngrams
from collections import Counter
import pandas as pd

# Function to read text from a file
def read_file(file_path):
    """Return the lower-cased contents of a text file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The whole file as a single lower-cased string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the platform's locale
    # codec is used, so the same corpus could be read differently (or fail)
    # from one machine to the next.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read().lower()

# Function to calculate bigram probabilities with add-one smoothing
def add_one_smoothing(bigrams, unigram_counts, vocabulary_size):
    """Estimate add-one smoothed probabilities for the observed bigrams.

    Each distinct bigram ``(w1, w2)`` found in ``bigrams`` is mapped to
    ``(count(w1, w2) + 1) / (unigram_counts[w1] + vocabulary_size)``.
    Bigrams that never occur in ``bigrams`` get no entry.

    Args:
        bigrams: Iterable of hashable bigrams; the first element of each is
            looked up in ``unigram_counts``.
        unigram_counts: Mapping from word to its count.
        vocabulary_size: Value added to the denominator.

    Returns:
        Dict mapping each observed bigram to its smoothed probability.
    """
    return {
        pair: (seen + 1) / (unigram_counts[pair[0]] + vocabulary_size)
        for pair, seen in Counter(bigrams).items()
    }

# Generate a smoothed count matrix
def generate_bigram_count_matrix(tokens, bigrams, vocabulary_size):
    """Build the add-one smoothed bigram count matrix.

    Rows are the first word and columns the second word of a bigram, both
    labelled with the sorted distinct ``tokens``. Each cell holds the number
    of times that bigram occurs in ``bigrams`` plus one.

    Args:
        tokens: Sequence of tokens; its distinct values form the vocabulary.
        bigrams: Iterable of ``(w1, w2)`` tuples to count. Pairs containing a
            word that is not in ``tokens`` are ignored.
        vocabulary_size: Unused; kept so existing callers keep working.

    Returns:
        A square integer ``pandas.DataFrame`` of smoothed counts.
    """
    vocab = sorted(set(tokens))
    position = {word: i for i, word in enumerate(vocab)}

    # Start every cell at 1 (the add-one pseudo-count) and fill plain lists;
    # writing V*V individual cells through ``.loc`` is needlessly slow.
    rows = [[1] * len(vocab) for _ in vocab]
    for (w1, w2), count in Counter(bigrams).items():
        if w1 in position and w2 in position:
            rows[position[w1]][position[w2]] += count

    return pd.DataFrame(rows, index=vocab, columns=vocab, dtype=int)

# Generate a bigram probability matrix
def generate_bigram_probability_matrix(tokens, smoothed_probabilities):
    vocab = sorted(set(tokens))
    bigram_matrix = pd.DataFrame(0, index=vocab, columns=vocab, dtype=float)

    for (w1, w2), prob in smoothed_probabilities.items():
        bigram_matrix.loc[w1, w2] = prob

    return bigram_matrix

# Main function
def main():
    """Run the add-one smoothing demo on ``corpus.txt``.

    Reads and tokenizes the corpus, computes add-one smoothed bigram
    probabilities, prints the probability and count matrices, and prints the
    probability of one example bigram.
    """
    nltk.download('punkt')

    # Read text from file
    file_path = 'corpus.txt'  # Replace with your file path
    text = read_file(file_path)

    # Tokenize the text and derive the bigrams
    tokens = nltk.word_tokenize(text)
    bigrams = list(ngrams(tokens, 2))

    # Unigram counts; the vocabulary is the set of distinct tokens
    unigram_counts = Counter(tokens)
    vocabulary_size = len(unigram_counts)

    # Apply Add-One Smoothing
    add_one_probs = add_one_smoothing(bigrams, unigram_counts, vocabulary_size)

    # Generate bigram count matrix
    bigram_count_matrix = generate_bigram_count_matrix(tokens, bigrams, vocabulary_size)

    # Generate bigram probability matrix
    bigram_probability_matrix = generate_bigram_probability_matrix(tokens, add_one_probs)

    # Print Bigram Probability Matrix
    print("\nBigram Probability Matrix with Add-One Smoothing:")
    print(bigram_probability_matrix.round(4))

    # Print Bigram Count Matrix
    print("Bigram Count Matrix with Add-One Smoothing:")
    print(bigram_count_matrix)

    def get_bigram_probability(smoothed_probabilities, bigram):
        """Return the smoothed probability of ``bigram``.

        Observed bigrams are looked up directly. An unseen bigram whose two
        words are both in the vocabulary gets the add-one probability
        ``1 / (count(w1) + V)``. Anything else yields "Bigram not found".
        """
        if bigram in smoothed_probabilities:
            return smoothed_probabilities[bigram]
        first, second = bigram
        if first in unigram_counts and second in unigram_counts:
            # Unseen but in-vocabulary: count 0 + 1 under add-one smoothing.
            return 1 / (unigram_counts[first] + vocabulary_size)
        return "Bigram not found"

    # Example usage for querying a bigram
    random_bigram = ('spend', 'money')  # example input bigram
    probability = get_bigram_probability(add_one_probs, random_bigram)
    print(f"Probability of the bigram {random_bigram}: {probability}")

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()



Bigram Probability Matrix with Add-One Smoothing:
           chinese  delicious     eat  favorite    food     for       i  \
chinese     0.0000     0.0000  0.0000    0.0000  0.2963  0.0000  0.0000   
delicious   0.1053     0.0000  0.0000    0.0000  0.1053  0.0000  0.0000   
eat         0.2308     0.0000  0.0000    0.0000  0.0769  0.0000  0.0000   
favorite    0.0000     0.0000  0.0000    0.0000  0.0000  0.0000  0.0000   
food        0.0000     0.0000  0.1071    0.0000  0.0000  0.1071  0.0714   
for         0.0000     0.0000  0.0000    0.0000  0.0000  0.0000  0.0000   
i           0.0000     0.0000  0.0000    0.0000  0.0000  0.0000  0.0000   
is          0.0000     0.1429  0.0000    0.0000  0.0000  0.0000  0.0000   
lunch       0.1111     0.0000  0.0741    0.0000  0.0000  0.0000  0.1111   
meal        0.0000     0.0000  0.0000    0.0000  0.0000  0.0000  0.0000   
money       0.0000     0.0000  0.0000    0.0000  0.0000  0.0000  0.0909   
my          0.0000     0.0000  0.0000    0.1111  

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
