In [1]:
import nltk
import json
from collections import Counter
from nltk.util import ngrams

# Make sure each NLTK data package requested by this notebook is present
# locally; the packages are requested one after another in this order.
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

def get_top_ngrams(input_file_path, n_values=(2, 3, 4), top_k=30):
    """Print the most frequent word n-grams found in a preprocessed corpus.

    The JSON file is read, a short preview of the combined text is printed
    for debugging, and then, for every ``n`` in ``n_values``, the ``top_k``
    most frequent n-grams are printed as ``"<words>: <count>"`` lines.

    N-grams are counted per document, so no n-gram straddles the boundary
    between two consecutive documents (joining everything into one string
    before counting would invent such n-grams).

    Args:
        input_file_path: Path to a UTF-8 encoded JSON file.  The content is
            iterated and every item is treated as one text string
            (presumably a list with one preprocessed document per entry --
            confirm against the preprocessing step that writes the file).
        n_values: Iterable of n-gram sizes to report.  Defaults to
            ``(2, 3, 4)``.
        top_k: Number of n-grams to print for each size.  Defaults to 30.

    Returns:
        None.  All results are written to standard output.
    """
    # Load the preprocessed news corpus
    with open(input_file_path, 'r', encoding='utf-8') as file:
        preprocessed_data = json.load(file)

    # Debug preview only: the joined string is used for display, never for
    # counting, so document boundaries stay intact in the statistics below.
    all_text = ' '.join(preprocessed_data)
    print("Combined Text Preview:")
    print(all_text[:500])  # Print the first 500 characters of the combined text

    # Tokenize every document exactly once; the token lists are reused for
    # each n instead of re-tokenizing the whole corpus per n-gram size.
    tokenized_docs = [nltk.word_tokenize(doc) for doc in preprocessed_data]

    def extract_ngrams(token_lists, n, top_k):
        """Return the ``top_k`` most common n-grams across ``token_lists``."""
        n_gram_freq = Counter()
        for tokens in token_lists:
            # A document shorter than n tokens simply contributes nothing.
            n_gram_freq.update(ngrams(tokens, n))
        return n_gram_freq.most_common(top_k)

    # Find and print the top n-grams for each n
    for n in n_values:
        top_ngrams = extract_ngrams(tokenized_docs, n, top_k)
        print(f"\nTop {top_k} {n}-grams:")
        for ngram, freq in top_ngrams:
            print(f"{' '.join(ngram)}: {freq}")

# Example usage: run the n-gram report on 'tokenizedBERT_500.json'. The
# function prints the text preview and the n-gram tables; it returns nothing.
get_top_ngrams('tokenizedBERT_500.json')


[nltk_data] Downloading package punkt to /Users/QuangAP/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/QuangAP/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/QuangAP/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Combined Text Preview:
info ink large deal new economic time paper edition june thursday mumbai edition copyright bennett coleman co ltd right reserved section startup tech length word bureau highlight nile say firm solid foundation grow build re month come body bengal info solid foundation grow back large deal bag fiscal year ended march none chairman nan nile large deal valued billion year net new deal promise solid foundation grow build re month come nile wednesday annual general meeting india second largest compan

Top 30 2-grams:
new york: 878
artificial intelligence: 847
york time: 726
length word: 500
load date: 500
genus ai: 446
right reserved: 399
reserved section: 354
last year: 263
copyright new: 247
time company: 241
chief executive: 238
ezra klein: 238
language model: 187
article appeared: 173
appeared print: 173
print page: 173
social medium: 167
tech company: 164
economic time: 154
page load: 152
copyright bennett: 146
bennett coleman: 146
coleman co: 146
co ltd: 146
ltd