In [3]:
import numpy as np
from collections import defaultdict, Counter
import pandas as pd

# Step 1: Read text file
def read_file(file_path):
    """Load a UTF-8 text file and return its lower-cased, whitespace-split tokens.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of tokens in file order. Punctuation is not stripped, so
        "mat." and "mat" are different tokens.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        tokens = handle.read().lower().split()
    return tokens

# Step 2: Unigram LM
def unigram_lm(words):
    """Count how many times each token occurs.

    Args:
        words: Iterable of tokens.

    Returns:
        A ``Counter`` mapping token -> occurrence count, keyed in order of
        first appearance.
    """
    return Counter(words)

# Step 3: Bigram LM
def bigram_lm(words):
    """Count every pair of adjacent tokens.

    Args:
        words: List of tokens in corpus order.

    Returns:
        A ``defaultdict(int)`` mapping ``(previous, next)`` token pairs to
        the number of times they occur consecutively.
    """
    pair_counts = defaultdict(int)
    # Pair each token with its successor; the last token has none.
    for left, right in zip(words, words[1:]):
        pair_counts[(left, right)] += 1
    return pair_counts

# Step 4: Bigram Matrix
def bigram_matrix(unigram_counts, bigram_counts):
    """Lay the bigram counts out as a dense vocabulary-by-vocabulary matrix.

    Args:
        unigram_counts: Mapping token -> count; its key order defines the
            row/column order of the matrix.
        bigram_counts: Mapping ``(previous, next)`` -> count. Every token in
            a pair must be a key of ``unigram_counts``.

    Returns:
        A tuple ``(matrix, word_to_index)`` where ``matrix[i, j]`` is the
        count of the bigram (token i, token j) and ``word_to_index`` maps
        each token to its row/column position.
    """
    word_to_index = {
        word: position for position, word in enumerate(unigram_counts.keys())
    }
    size = len(word_to_index)
    counts = np.zeros((size, size), dtype=int)

    for (first, second), occurrences in bigram_counts.items():
        row = word_to_index[first]
        col = word_to_index[second]
        counts[row, col] = occurrences

    return counts, word_to_index

# Step 5: Add-1 Smoothing
def add_one_smoothing(matrix):
    """Return the count matrix with 1 added to every cell (Laplace smoothing).

    Args:
        matrix: NumPy array of raw bigram counts.

    Returns:
        A new array of the same shape; the input is not modified.
    """
    return matrix + 1

# Step 6: Calculate P* using the formula
def calculate_p_star(matrix, unigram_counts, vocab_size):
    """Compute add-one smoothed bigram probabilities P*(w2 | w1).

    Each cell is ``(C(w1 w2) + 1) / (C(w1) + V)``: the denominator is the
    count of the *history* word of that row plus the vocabulary size, not the
    corpus-wide token total. Dividing by the corpus total would not give
    rows that are conditional probability distributions.

    Args:
        matrix: Square array of raw bigram counts, rows indexed by w1.
        unigram_counts: Mapping token -> count whose iteration order matches
            the row order of ``matrix`` (as produced by ``bigram_matrix``).
        vocab_size: Vocabulary size V.

    Returns:
        Float array with the same shape as ``matrix``.

    Raises:
        ValueError: If ``matrix`` does not have one row per entry of
            ``unigram_counts``.
    """
    smoothed_matrix = add_one_smoothing(np.asarray(matrix))
    history_counts = np.fromiter(
        unigram_counts.values(), dtype=float, count=len(unigram_counts)
    )
    if smoothed_matrix.ndim != 2 or smoothed_matrix.shape[0] != history_counts.size:
        raise ValueError(
            "matrix must have one row per entry of unigram_counts "
            f"(got shape {smoothed_matrix.shape} for {history_counts.size} words)"
        )
    # Column vector so that row i is divided by C(w_i) + V.
    row_denominators = history_counts[:, np.newaxis] + vocab_size
    return smoothed_matrix / row_denominators

# Step 7: Calculate Reconstituted Count
def reconstituted_count(p_star, total_unigrams):
    """Turn smoothed probabilities back into (fractional) counts.

    Args:
        p_star: Array of smoothed probabilities.
        total_unigrams: Factor each probability is multiplied by. May be a
            scalar or any array that broadcasts against ``p_star``; for
            bigram reconstituted counts pass a column vector holding the
            history count C(w1) of each row.

    Returns:
        ``p_star * total_unigrams``.
    """
    return p_star * total_unigrams

# Main Function
def main(file_path):
    """Run the full pipeline on a text file and print every intermediate table.

    Args:
        file_path: Path of the corpus text file.
    """
    # Step 1: Read text file
    words = read_file(file_path)

    # Step 2: Unigram LM
    unigram_counts = unigram_lm(words)

    # Step 3: Bigram LM
    bigram_counts = bigram_lm(words)

    # Step 4: Bigram Matrix
    matrix, word_to_index = bigram_matrix(unigram_counts, bigram_counts)
    labels = list(word_to_index)
    print("Bigram Matrix:")
    print(pd.DataFrame(matrix, index=labels, columns=labels))

    # Step 5: Add-1 Smoothing
    smoothed_matrix = add_one_smoothing(matrix)
    print("\nBigram Matrix after Add-1 Smoothing:")
    print(pd.DataFrame(smoothed_matrix, index=labels, columns=labels))

    # Step 6: Calculate P* using the formula
    p_star = calculate_p_star(matrix, unigram_counts, len(unigram_counts))
    print("\nP* Matrix:")
    print(pd.DataFrame(p_star, index=labels, columns=labels))

    # Step 7: Calculate Reconstituted Count
    # c*(w1 w2) = P*(w2 | w1) * C(w1), so scale each row by its own history
    # count rather than by the corpus-wide token total.
    history_counts = np.array(
        [unigram_counts[word] for word in labels], dtype=float
    ).reshape(-1, 1)
    reconstituted_counts = reconstituted_count(p_star, history_counts)
    print("\nReconstituted Count Matrix:")
    print(pd.DataFrame(reconstituted_counts, index=labels, columns=labels))

# Example Usage
# Runs the whole pipeline at cell/module execution time. The corpus path is
# relative, so it is resolved against the current working directory.
file_path = 'THE BERKELEY RESTAURANT PROJECT.txt'
main(file_path)


Bigram Matrix:
                in  icslp-94,  2139-2142  the  berkeley  restaurant  project  \
in               0          2          0   15         0           0        0   
icslp-94,        0          0          1    0         0           0        0   
2139-2142        0          0          0    1         0           0        0   
the              0          0          0    0         2           0        0   
berkeley         0          0          0    0         0           2        0   
...             ..        ...        ...  ...       ...         ...      ...   
27.7             0          0          0    0         0           0        0   
numbers          0          0          0    0         0           0        0   
do               0          0          0    0         0           0        0   
tight-coupling   0          0          0    0         0           0        0   
augmentations.   0          0          0    0         0           0        0   

                daniel  