In [37]:
import pandas as pd
import json

# Maximum number of lines to read from the start of the file
number_of_lines = 100000

# Path to the JSON Lines file (one JSON object per line)
file_path = 'dataset/grocery_fixed.jsonl'

# Read at most `number_of_lines` lines. zip() stops as soon as either the
# counter or the file is exhausted, so a file shorter than the limit is read
# in full instead of raising StopIteration (which a bare `next(file)` does).
# The encoding is explicit so decoding does not depend on the platform
# default, and blank lines are skipped because json.loads() rejects them.
with open(file_path, encoding='utf-8') as file:
    lines = [
        json.loads(line)
        for _, line in zip(range(number_of_lines), file)
        if line.strip()
    ]

# Convert the list of parsed JSON objects to a DataFrame
data = pd.DataFrame(lines)


In [38]:
# Show the column labels of the loaded DataFrame
data.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')

In [55]:
# Preprocessing 
import re
def pre_process(text):
    """Normalise a piece of raw text before vectorisation.

    The text is lowercased and every run of digits and/or non-word
    characters (punctuation, whitespace, symbols) is replaced by a single
    space. Underscores are word characters for the regex and are kept.
    A run at the very start or end of the text also becomes one space, so
    the result may begin or end with a space.

    Args:
        text: The string to clean. ``None`` and float NaN (how pandas
            represents a missing value) are treated as empty text; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``""`` for missing input.
    """
    # Missing value: None, or NaN (the only float that is not equal to itself)
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # lowercase
    text = text.lower()

    # collapse each run of digits / non-word characters into one space
    return re.sub(r"[\d\W]+", " ", text)

# Apply to DF
# Keep an untouched copy of the original 'text' column the first time this
# cell runs. Rebuilding from that copy makes the cell idempotent: re-running
# it no longer prepends the title to text that already contains it.
if 'text_raw' not in data.columns:
    data['text_raw'] = data['text']

# Join title and body with a space (plain `+` glues the last word of the
# title to the first word of the body) and treat missing values as empty
# strings so pre_process always receives a str.
data['text'] = (
    data['title'].fillna('') + ' ' + data['text_raw'].fillna('')
).apply(pre_process)


In [56]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def get_stop_words(stop_file_path):
    """Load stop words from a UTF-8 text file containing one word per line.

    Surrounding whitespace is stripped from every line, blank lines are
    skipped, and repeated words are kept only once (first occurrence wins,
    file order is preserved).

    Args:
        stop_file_path: Path to the stop-word file.

    Returns:
        A list of unique, non-empty stop words. A list (not a set) is
        returned on purpose: it is the type the caller passes straight to
        ``CountVectorizer(stop_words=...)``.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys() de-duplicates while keeping insertion order
        return list(dict.fromkeys(word for word in stripped if word))

# Stop words to leave out of the vocabulary
stopwords = get_stop_words("dataset/stopwords.txt")

# One pre-processed string per row of the 'text' column
docs = data['text'].to_list()

# Build the vocabulary: drop stop words and any term that occurs in more
# than 85% of the documents, keeping at most 10000 features
cv = CountVectorizer(
    stop_words=stopwords,
    max_df=0.85,
    max_features=10000,
)
word_count_vector = cv.fit_transform(docs)

# Learn the inverse document frequencies from the count matrix
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_transformer.fit(word_count_vector)


In [57]:
def sort_coo(coo_matrix):
    """Return the matrix's (column, value) pairs, largest value first.

    Pairs are ordered by value in descending order; pairs with equal values
    are ordered by column index, also descending.

    Args:
        coo_matrix: Object exposing parallel ``col`` and ``data`` sequences.

    Returns:
        A list of ``(col, value)`` tuples.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` entries of ``sorted_items`` to their scores.

    Args:
        feature_names: Indexable sequence giving the feature name for each
            feature index.
        sorted_items: Sequence of ``(feature_index, score)`` pairs, already
            in the desired order.
        topn: Number of leading pairs to keep.

    Returns:
        A dict of feature name to score rounded to 3 decimals, in the same
        order as the pairs taken from ``sorted_items``.
    """
    return {
        feature_names[feature_idx]: round(weight, 3)
        for feature_idx, weight in sorted_items[:topn]
    }

In [58]:
# Vocabulary terms, indexed by feature column
feature_names = cv.get_feature_names_out()

# Document to extract keywords from
doc = docs[50]

# Count the document's terms, then weight the counts with the fitted IDF
doc_counts = cv.transform([doc])
tf_idf_vector = tfidf_transformer.transform(doc_counts)

# (column, score) pairs ordered from highest to lowest score
sorted_items = sort_coo(tf_idf_vector.tocoo())

# Keep the 10 best-scoring terms
keywords = extract_topn_from_vector(feature_names, sorted_items, 10)

print("\n=====Doc=====")
print(doc)
print("\n=====Keywords=====")
for k, score in keywords.items():
    print(k, score)


=====Doc=====
great coffee at a great price great coffee at a great price great coffee at a great price great coffee at a great price great coffee at a great price shipping was fast as well 

=====Keywords=====
great 0.687
coffee 0.503
price 0.479
shipping 0.153
fast 0.147
