In [171]:
import pandas as pd
import json

def read_jsonl(file_path, number_of_lines = 100):
    """
    Reads a specified number of lines from a JSON Lines file and splits the data into train and test sets.

    :param file_path: Path to the JSON Lines file.
    :param number_of_lines: Number of lines to read from the file.
    :return: DataFrame 
    """
    
    # Read specified number of lines from file
    with open(file_path) as file:
        lines = [json.loads(next(file)) for _ in range(number_of_lines)]
        
    # Convert list of JSON objects to Pandas DF
    data = pd.DataFrame(lines)
    
    return data

data = read_jsonl("dataset/grocery_fixed.jsonl", 500000)

In [172]:
import re
# Preprocessing 

def pre_process(text):
    # lowercase
    text = text.lower()
    
    # remove tags
    text = re.sub("&lt;/?.*&gt;"," &lt;&gt; ", text)
    
    # remove special characters 
    text = re.sub("(\\d|\\W)+"," ", text)
    
    return text

data['text'] = data['title'] + " " + data['text']
data['text'] = data['text'].apply(pre_process)

In [173]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key = lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """ Get feature names and tf-idf scores of top n items"""
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    
    return results

def get_stop_words(stop_file_path):
    """Load stop words"""
    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        stop_set = [m.strip() for m in stopwords]
        return stop_set  


In [174]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
from nltk.tokenize import word_tokenize

def extract_keywords_for_product(data, asin, topn=10):
    """
    Extract top n keywords for a specific product based on its reviews.
    
    :param data: The dataset containing reviews.
    :param asin: The product identifier (e.g., ASIN) to extract keywords for.
    :param topn: The number of top keywords to extract.
    :return: A dictionary of top n keywords and their TF-IDF scores for the specified product.
    """
    # Load set of stop words
    stopwords = get_stop_words("dataset/stopwords.txt")

    # Initialize TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_df=0.85, stop_words=stopwords, max_features=10000)

    # Filter reviews for the specified product
    product_data = data.loc[data['parent_asin'] == asin]
    docs = product_data['text'].tolist()

    # Compute TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Sort items by TF-IDF score
    sorted_items = sort_coo(tfidf_matrix.tocoo())

    # Extract top n keywords
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn=topn)

    return keywords
def get_top_ngrams_for_product(reviews, topn=10, min_freq=3):
    """
    Get the top bigrams and trigrams for a specific product's reviews.
    
    :param reviews: List of reviews for a specific product.
    :param topn: Number of top n-grams to return.
    :param min_freq: Minimum frequency for n-grams to be considered.
    :return: A tuple containing lists of the top bigrams and trigrams.
    """
    
    # Preprocess and tokenize reviews
    tokenized_reviews = [word_tokenize(review) for review in reviews]
    
    # Flatten list of tokenized reviews into one list of words
    all_words = [word for tokens in tokenized_reviews for word in tokens]
    
    # Instantiate collocation finders
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    trigram_finder = TrigramCollocationFinder.from_words(all_words)
    
    # Apply frequency filter 
    bigram_finder.apply_freq_filter(min_freq)
    trigram_finder.apply_freq_filter(min_freq)
    
    # Measures for calculating PMI
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()
    
    # Extract top n-grams with highest PMI 
    top_bigrams = bigram_finder.nbest(bigram_measures.pmi, topn)
    top_trigrams = trigram_finder.nbest(trigram_measures.pmi, topn)
    
    return top_bigrams, top_trigrams



In [175]:
data['parent_asin'].value_counts()

parent_asin
B00ESE0DC4    3443
B01NAYX4S3    2269
B0BZZWHKHQ    2256
B0C396H8QB    1934
B07VV7T465    1659
              ... 
B07871X7S6       1
B00E1ZMXFA       1
B00IG8GP20       1
B01LY8QGD1       1
B091NC2LBC       1
Name: count, Length: 135818, dtype: int64

In [176]:

asin = "B0BZZWHKHQ"
topn = 50

product_keywords = extract_keywords_for_product(data, asin, topn=topn)

product_reviews = data.loc[data['parent_asin'] == asin]['text'].tolist()
top_bigrams, top_trigrams = get_top_ngrams_for_product(product_reviews, topn=topn, min_freq=3)

print(f"Product keywords: {product_keywords}\n")
print(f"Top bigrams: {top_bigrams}\n")
print(f"Top trigrams: {top_trigrams}\n")


Product keywords: {'yummy': 0.919, 'works': 0.896, 'test': 1.0, 'price': 1.0, 'perfect': 1.0, 'love': 0.874, 'great': 0.882, 'good': 0.893, 'excellent': 0.917, 'bad': 1.0, 'salud': 0.961, 'nutrition': 0.954, 'disgusting': 0.946, 'flav': 0.941, 'terrible': 0.936, 'liked': 0.929, 'tasteful': 0.928, 'review': 0.913, 'nice': 0.894, 'light': 0.903, 'smoothie': 0.902, 'item': 0.9, 'awesome': 0.9, 'pricey': 0.895, 'supplement': 0.894, 'delicious': 0.888, 'loved': 0.877, 'vegan': 0.866, 'easy': 0.865, 'worst': 0.856, 'advertised': 0.853, 'everyday': 0.853, 'stevia': 0.85, 'matallic': 0.848, 'tasty': 0.845, 'mixed': 0.844, 'supergreenfood': 0.842}

Top bigrams: [('sams', 'club'), ('college', 'student'), ('static', 'electricity'), ('trader', 'joe'), ('psyllium', 'husk'), ('game', 'changer'), ('flax', 'seed'), ('bowel', 'movements'), ('heavy', 'metals'), ('room', 'temperature'), ('crystal', 'light'), ('cider', 'vinegar'), ('fairly', 'priced'), ('chia', 'seeds'), ('citric', 'acid'), ('repeat', 'cu