In [1]:
import pandas as pd
import json

def read_jsonl(file_path, number_of_lines = 100):
    """
    Reads a specified number of lines from a JSON Lines file and splits the data into train and test sets.

    :param file_path: Path to the JSON Lines file.
    :param number_of_lines: Number of lines to read from the file.
    :return: DataFrame 
    """
    
    # Read specified number of lines from file
    with open(file_path) as file:
        lines = [json.loads(next(file)) for _ in range(number_of_lines)]
        
    # Convert list of JSON objects to Pandas DF
    data = pd.DataFrame(lines)
    
    return data

data = read_jsonl("dataset/grocery_fixed.jsonl", 1000000)

In [2]:
import re
# Preprocessing 

def pre_process(text):
    # lowercase
    text = text.lower()
    
    # remove tags
    text = re.sub("&lt;/?.*&gt;"," &lt;&gt; ", text)
    
    # remove special characters 
    text = re.sub("(\\d|\\W)+"," ", text)
    
    return text

data['text'] = data['title'] + " " + data['text']
data['text'] = data['text'].apply(pre_process)

In [3]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key = lambda x: (x[1], x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """ Get feature names and tf-idf scores of top n items"""
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
    
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    
    return results

def get_stop_words(stop_file_path):
    """Load stop words"""
    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        stop_set = [m.strip() for m in stopwords]
        return stop_set  


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder, TrigramAssocMeasures
from nltk.tokenize import word_tokenize

def extract_keywords_for_product(data, asin, topn=10):
    """
    Extract top n keywords for a specific product based on its reviews.
    
    :param data: The dataset containing reviews.
    :param asin: The product identifier (e.g., ASIN) to extract keywords for.
    :param topn: The number of top keywords to extract.
    :return: A dictionary of top n keywords and their TF-IDF scores for the specified product.
    """
    # Load set of stop words
    stopwords = get_stop_words("dataset/stopwords.txt")

    # Initialize TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(max_df=0.85, stop_words=stopwords, max_features=10000)

    # Filter reviews for the specified product
    product_data = data.loc[data['parent_asin'] == asin]
    docs = product_data['text'].tolist()

    # Compute TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(docs)
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Sort items by TF-IDF score
    sorted_items = sort_coo(tfidf_matrix.tocoo())

    # Extract top n keywords
    keywords = extract_topn_from_vector(feature_names, sorted_items, topn=topn)

    return keywords
def get_top_ngrams_for_product(reviews, topn=10, min_freq=3):
    """
    Get the top bigrams and trigrams for a specific product's reviews.
    
    :param reviews: List of reviews for a specific product.
    :param topn: Number of top n-grams to return.
    :param min_freq: Minimum frequency for n-grams to be considered.
    :return: A tuple containing lists of the top bigrams and trigrams.
    """
    
    # Preprocess and tokenize reviews
    tokenized_reviews = [word_tokenize(review) for review in reviews]
    
    # Flatten list of tokenized reviews into one list of words
    all_words = [word for tokens in tokenized_reviews for word in tokens]
    
    # Instantiate collocation finders
    bigram_finder = BigramCollocationFinder.from_words(all_words)
    trigram_finder = TrigramCollocationFinder.from_words(all_words)
    
    # Apply frequency filter 
    bigram_finder.apply_freq_filter(min_freq)
    trigram_finder.apply_freq_filter(min_freq)
    
    # Measures for calculating PMI
    bigram_measures = BigramAssocMeasures()
    trigram_measures = TrigramAssocMeasures()
    
    # Extract top n-grams with highest PMI 
    top_bigrams = bigram_finder.nbest(bigram_measures.pmi, topn)
    top_trigrams = trigram_finder.nbest(trigram_measures.pmi, topn)
    
    return top_bigrams, top_trigrams



In [5]:
data['parent_asin'].value_counts()

parent_asin
B07LFJF6TR    3973
B00ESE0DC4    3443
B07MDTNZ66    2535
B01NAYX4S3    2341
B0BZZWHKHQ    2330
              ... 
B01HOER10Q       1
B00HRG4ORU       1
B00I2ZWD9Q       1
B08HMV5WSW       1
B07C3QCFB3       1
Name: count, Length: 196617, dtype: int64

In [6]:

asin = "B07LFJF6TR"
topn = 20

product_keywords = extract_keywords_for_product(data, asin, topn=topn)

product_reviews = data.loc[data['parent_asin'] == asin]['text'].tolist()
top_bigrams, top_trigrams = get_top_ngrams_for_product(product_reviews, topn=topn, min_freq=3)

print(f"Product keywords: {product_keywords}\n")
print(f"Top bigrams: {top_bigrams}\n")
print(f"Top trigrams: {top_trigrams}\n")


Product keywords: {'yuk': 1.0, 'thank': 1.0, 'nice': 1.0, 'great': 1.0, 'good': 1.0, 'flavor': 1.0, 'excellent': 1.0, 'delicious': 1.0, 'awesome': 0.992, 'love': 0.975, 'weak': 0.963}

Top bigrams: [('tierra', 'intenso'), ('koffee', 'kult'), ('tim', 'hortons'), ('gon', 'na'), ('west', 'coast'), ('costa', 'rican'), ('timely', 'manner'), ('asin', 'b'), ('san', 'antonio'), ('puerto', 'rico'), ('san', 'francisco'), ('caf', 'eacute'), ('ash', 'tray'), ('kicking', 'horse'), ('gas', 'station'), ('bullet', 'proof'), ('jamaican', 'blue'), ('blue', 'mountain'), ('central', 'america'), ('trader', 'joes')]

Top trigrams: [('eight', 'o', 'clock'), ('jamaican', 'blue', 'mountain'), ('guatemalan', 'san', 'antonio'), ('peaberry', 'guatemalan', 'san'), ('honduran', 'peaberry', 'guatemalan'), ('burke', 'brands', 'llc'), ('caf', 'eacute', 'don'), ('law', 'said', 'she'), ('sam', 's', 'club'), ('of', 'koffee', 'kult'), ('blown', 'away', 'by'), ('an', 'air', 'tight'), ('worth', 'every', 'penny'), ('anyone',

In [7]:
# Zero shot classification model

from transformers import pipeline 
model_name = "facebook/bart-large-mnli"
classifier = pipeline('zero-shot-classification', model=model_name)

  from .autonotebook import tqdm as notebook_tqdm
2024-04-07 10:30:41.753738: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-07 10:30:42.361967: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [17]:
import numpy as np
from tqdm import tqdm

# Filter reviews for the specified product
product_data = data.loc[data['parent_asin'] == asin][:100].copy()

# Init two new columns for the classification and score
product_data['label'] = np.nan
product_data['score'] = np.nan

# Define candidate labels and text to classify
candidate_labels = ["positive", "negative"]

# Classify each review and store the results
for index, row in tqdm(product_data.iterrows(), total=product_data.shape[0]):
    # Classify the current review text
    output = classifier(row['text'], candidate_labels)

    # Update the predicted label and score in the DataFrame
    product_data.at[index, 'predicted_label'] = output['labels'][0]  
    product_data.at[index, 'predicted_score'] = output['scores'][0]  




100%|██████████| 100/100 [01:06<00:00,  1.49it/s]


In [18]:
product_data['predicted_label'].value_counts()

predicted_label
positive    86
negative    14
Name: count, dtype: int64

In [19]:
product_data['predicted_score'].describe()

count    100.000000
mean       0.974206
std        0.062934
min        0.633873
25%        0.985333
50%        0.995517
75%        0.996761
max        0.998831
Name: predicted_score, dtype: float64

In [11]:
# BART summarization on aggregated reviews

AGGREGATED_REVIEW_TEXT = " ".join(product_data['text'].tolist())

from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

summary = summarizer(AGGREGATED_REVIEW_TEXT, max_length=len(AGGREGATED_REVIEW_TEXT), min_length=30, do_sample=False)

print(summary)

Your max_length is set to 4035, but your input_length is only 852. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=426)


[{'summary_text': '"This coffee is really good try it br long version the chocolate notes really shine and this is a really bold low acidity brew" "I can\'t believe how good this coffee is plus water process decaf and certified organic win win in my book  not much flavor sorry but it tastes like water  very smokey flavor like it s roasted too hot or too long  great deal great tasting coffee  delicious coffee good packaging smells fresh"'}]


In [15]:
summary

[{'summary_text': '"This coffee is really good try it br long version the chocolate notes really shine and this is a really bold low acidity brew" "I can\'t believe how good this coffee is plus water process decaf and certified organic win win in my book  not much flavor sorry but it tastes like water  very smokey flavor like it s roasted too hot or too long  great deal great tasting coffee  delicious coffee good packaging smells fresh"'}]