In [7]:
import nltk
import json
from collections import Counter
from nltk.util import ngrams

# Fetch the NLTK data packages at import time. Of these, only the tokenizer
# data is exercised by the code visible in this cell (via nltk.word_tokenize);
# 'stopwords' and 'wordnet' are not referenced here.
# NOTE(review): presumably they are needed by other cells — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def get_top_ngrams(input_file_path, n_values=(2, 3, 4), top_k=10):
    """Print the most frequent word n-grams found in a preprocessed corpus.

    The JSON file is expected to contain a collection of document strings
    (they are joined with spaces for the preview and tokenized one by one).
    N-grams are counted inside each document separately, so no n-gram is
    formed from the tail of one document and the head of the next.

    Args:
        input_file_path: Path to the UTF-8 encoded JSON file to read.
        n_values: Iterable of n-gram sizes to report. Defaults to (2, 3, 4).
        top_k: How many of the most frequent n-grams to print per size.
            Defaults to 10.

    Returns:
        None. The text preview and the rankings are printed to stdout.
    """
    # Load the preprocessed news corpus
    with open(input_file_path, 'r', encoding='utf-8') as file:
        preprocessed_data = json.load(file)

    # Debug preview: first 500 characters of all documents joined by spaces
    print("Combined Text Preview:")
    print(' '.join(preprocessed_data)[:500])

    # Tokenize every document exactly once; the token lists are reused for
    # each requested n instead of re-tokenizing the whole corpus per n.
    tokenized_docs = [nltk.word_tokenize(doc) for doc in preprocessed_data]

    def extract_ngrams(token_lists, n, top_k):
        """Return the top_k (ngram, count) pairs over all token lists."""
        counts = Counter()
        for tokens in token_lists:
            # Counting per document keeps n-grams from spanning two documents
            counts.update(ngrams(tokens, n))
        return counts.most_common(top_k)

    # Find and print the top n-grams for each n
    for n in n_values:
        top_ngrams = extract_ngrams(tokenized_docs, n, top_k)
        print(f"\nTop {top_k} {n}-grams:")
        for ngram, freq in top_ngrams:
            print(f"{' '.join(ngram)}: {freq}")

# Example usage: print the top n-grams for the corpus stored in
# 'tokenized_300.json' (path is relative to the current working directory)
get_top_ngrams('tokenized_300.json')


[nltk_data] Downloading package punkt to /Users/QuangAP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/QuangAP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/QuangAP/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Combined Text Preview:
amazon offer fulfilment infrastructure seller outside platform economic time september friday copyright bennett coleman right reserved section tech internet length word byline annapurna roy body amazon thursday announced launch new product multichannel fulfilment enable seller run online business accessing fulfilment infrastructure even platform excited announce product called multichannel fulfilment allows small business brand take advantage fulfilment infrastructure worry inventory fulfilment 

Top 10 2-grams:
new york: 561
artificial intelligence: 514
york time: 461
length word: 300
word byline: 279
right reserved: 252
reserved section: 225
generative ai: 210
copyright new: 158
time company: 154

Top 10 3-grams:
new york time: 461
length word byline: 279
right reserved section: 225
copyright new york: 158
york time company: 146
article appeared print: 100
appeared print page: 100
generative artificial intelligence: 93
company right reserved: 90
pg length word: