In [1]:
import nltk
from nltk.corpus import gutenberg
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

# Fetch the NLTK resources used below: the Gutenberg corpus and the
# 'punkt' tokenizer data.
for resource in ('gutenberg', 'punkt'):
    nltk.download(resource)

# Read the raw text of Jane Austen's "Emma" as a single string
corpus = gutenberg.raw('austen-emma.txt')

# Split the raw text into word-level tokens
tokens = nltk.word_tokenize(corpus)

# Keep only purely alphabetic tokens (this drops punctuation), lowercased
filtered_tokens = list(map(str.lower, filter(str.isalpha, tokens)))

# Bigrams: build the finder and discard pairs seen fewer than 5 times
bigram_finder = BigramCollocationFinder.from_words(filtered_tokens)
bigram_finder.apply_freq_filter(5)

# Rank the remaining bigrams by likelihood ratio and keep the best 10
bigram_score = BigramAssocMeasures.likelihood_ratio
top_bigrams = bigram_finder.nbest(bigram_score, 10)

# Trigrams: build the finder and discard triples seen fewer than 3 times
trigram_finder = TrigramCollocationFinder.from_words(filtered_tokens)
trigram_finder.apply_freq_filter(3)

# Rank the remaining trigrams by likelihood ratio and keep the best 10
trigram_score = TrigramAssocMeasures.likelihood_ratio
top_trigrams = trigram_finder.nbest(trigram_score, 10)

# Report the results: a heading followed by one n-gram tuple per line
print("Top 10 Bigrams:", *top_bigrams, sep="\n")
print("\nTop 10 Trigrams:", *top_trigrams, sep="\n")


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Praveena\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Praveena\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 10 Bigrams:
('i', 'am')
('had', 'been')
('to', 'be')
('frank', 'churchill')
('it', 'was')
('miss', 'woodhouse')
('have', 'been')
('could', 'not')
('any', 'thing')
('my', 'dear')

Top 10 Trigrams:
('i', 'am', 'not')
('i', 'am', 'sure')
('the', 'sort', 'of')
('the', 'whole', 'of')
('the', 'subject', 'of')
('the', 'rest', 'of')
('the', 'idea', 'of')
('the', 'part', 'of')
('the', 'evening', 'of')
('the', 'degree', 'of')
