In [2]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import nltk
from nltk.util import bigrams
from nltk.tokenize import word_tokenize
# word_tokenize (used below) needs the Punkt tokenizer models, so fetch them
# up front. The recorded output shows the call is a no-op when the package is
# already installed ("Package punkt is already up-to-date!").
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Sample text from William Shakespeare's "Hamlet"
text = """
To be, or not to be, that is the question—
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
And, by opposing, end them. To die, to sleep—
No more—and by a sleep to say we end
The heart-ache and the thousand natural shocks
That flesh is heir to—'tis a consummation
Devoutly to be wished.
"""

# Tokenize the text into words. Lower-casing first makes "To" and "to" count
# as the same word type.
# NOTE(review): the recorded output contains tokens such as "question—", i.e.
# the em-dash stays attached to the preceding word and inflates the
# vocabulary. Consider padding "—" with spaces before tokenizing if that is
# not intended.
tokens = word_tokenize(text.lower())

# Generate bigrams (adjacent token pairs, in order of appearance)
bigrams_list = list(bigrams(tokens))

# Count bigrams and unigrams
bigram_counts = Counter(bigrams_list)
unigram_counts = Counter(tokens)

# Vocabulary size V: number of distinct tokens in the sample
vocab_size = len(unigram_counts)


def laplace_probability(first_word, second_word):
    """Return the add-one (Laplace) estimate of P(second_word | first_word).

    Computed as (count(first_word, second_word) + 1) / (count(first_word) + V),
    where V is ``vocab_size``. Both counters return 0 for keys they have not
    seen (without inserting them), so unseen bigrams and unseen first words
    still receive a small non-zero probability.

    Returns 0.0 when the corpus is empty (V == 0 and no counts), because the
    estimate is undefined in that case.
    """
    denominator = unigram_counts[first_word] + vocab_size
    if denominator == 0:
        return 0.0
    return (bigram_counts[(first_word, second_word)] + 1) / denominator


class _LaplaceBigramTable(dict):
    """Mapping from bigram -> smoothed probability.

    Observed bigrams are stored explicitly. Looking up an unobserved bigram
    returns its smoothed probability instead of 0.0, and does not insert the
    key, so iterating the table still yields only observed bigrams.
    """

    def __missing__(self, bigram):
        first_word, second_word = bigram
        return laplace_probability(first_word, second_word)


# Apply Laplace smoothing to bigram counts
laplace_bigrams = _LaplaceBigramTable(
    (bigram, laplace_probability(bigram[0], bigram[1])) for bigram in bigram_counts
)

# Convert counts to a DataFrame for better visualization: one row per
# observed bigram, in first-occurrence order (Counter preserves insertion order).
bigram_table = pd.DataFrame([(bigram[0], bigram[1], count, laplace_bigrams[bigram])
                             for bigram, count in bigram_counts.items()],
                            columns=['First Word', 'Second Word', 'Count', 'Laplace Probability'])

# Display the bigram counts and Laplace probabilities in a tabular form
print(bigram_table)

# Alternatively, display the table in Jupyter Notebook. This must stay the last
# expression of the cell so the styled table is rendered as the cell output.
bigram_table.style.set_caption("Bigram Counts and Laplace Probabilities")

      First Word   Second Word  Count  Laplace Probability
0             to            be      3             0.070175
1             be             ,      2             0.057692
2              ,            or      2             0.054545
3             or           not      1             0.039216
4            not            to      1             0.040000
..           ...           ...    ...                  ...
69             a  consummation      1             0.038462
70  consummation      devoutly      1             0.040000
71      devoutly            to      1             0.040000
72            be        wished      1             0.038462
73        wished             .      1             0.040000

[74 rows x 4 columns]


Unnamed: 0,First Word,Second Word,Count,Laplace Probability
0,to,be,3,0.070175
1,be,",",2,0.057692
2,",",or,2,0.054545
3,or,not,1,0.039216
4,not,to,1,0.04
5,",",that,1,0.036364
6,that,is,1,0.039216
7,is,the,1,0.038462
8,the,question—,1,0.037037
9,question—,whether,1,0.04
