In [1]:
import nltk
import json
from collections import Counter
from nltk.util import ngrams

# Fetch each NLTK data package in turn; nltk.download reports its own progress
for _package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_package)

def get_top_ngrams(input_file_path, n_values=(2, 3, 4), top_k=30):
    """Print the most frequent word n-grams in a preprocessed corpus.

    The JSON file is loaded, its entries are joined with single spaces into
    one string, and a 500-character preview of that string is printed. The
    string is then tokenized once with ``nltk.word_tokenize`` and, for every
    ``n`` in ``n_values``, the ``top_k`` most common n-grams are printed
    together with their counts.

    Args:
        input_file_path: Path to a UTF-8 JSON file. Its decoded content is
            passed straight to ``str.join``, so it is expected to be an
            iterable of strings (e.g. a list with one string per document).
        n_values: Iterable of n-gram sizes to report. Defaults to
            ``(2, 3, 4)``.
        top_k: How many of the most frequent n-grams to print for each size.
            Defaults to ``30``.

    Returns:
        None. All results are written to standard output.
    """
    # Load the preprocessed news corpus
    with open(input_file_path, 'r', encoding='utf-8') as file:
        preprocessed_data = json.load(file)

    # Combine all preprocessed text into a single string.
    # NOTE(review): because entries are concatenated before n-grams are
    # built, an n-gram can straddle the end of one entry and the start of
    # the next. Presumably negligible for top-k counts -- confirm if exact
    # per-document counts matter.
    all_text = ' '.join(preprocessed_data)

    # Debug statement to check the combined text
    print("Combined Text Preview:")
    print(all_text[:500])  # Print the first 500 characters of the combined text

    # Tokenize once up front: the token list does not depend on n, so
    # re-tokenizing the whole corpus for every n-gram size is wasted work.
    words = nltk.word_tokenize(all_text)

    # Find and print the top n-grams for each n
    for n in n_values:
        # Counter consumes the lazy n-gram generator and tallies each tuple
        top_ngrams = Counter(ngrams(words, n)).most_common(top_k)
        print(f"\nTop {top_k} {n}-grams:")
        for ngram, freq in top_ngrams:
            print(f"{' '.join(ngram)}: {freq}")

# Run the n-gram report on the tokenized corpus file
get_top_ngrams(input_file_path='tokenizedFine_300.json')


[nltk_data] Downloading package punkt to /Users/QuangAP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/QuangAP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/QuangAP/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Combined Text Preview:
amazon thursday announced launch new product multichannel fulfilment enable seller run online business accessing fulfilment infrastructure even platform excited announce product called multichannel fulfilment allows small business brand take advantage fulfilment infrastructure worry inventory fulfilment look irrespective whether selling amazon amazon matter digital seller run operation using amazon fulfilment infrastructure amit agarwal senior vice president emerging market amazon speaking amazo

Top 30 2-grams:
artificial intelligence: 470
new york: 241
generative ai: 203
york time: 168
chief executive: 149
united state: 115
last year: 112
language model: 105
tech company: 101
article appeared: 100
appeared print: 100
print page: 100
social medium: 91
generative artificial: 88
large language: 79
last month: 73
high school: 71
tech giant: 66
san francisco: 66
search engine: 63
silicon valley: 62
new technology: 57
year ago: 56
sam altman: 51
last week: 48
next ye