## Exploring the content of a text corpus

In [1]:
# Q1
# load the data file
import json
from pprint import pprint

# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# decoding does not depend on the platform's default locale encoding.
with open('data/yelp.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# count rows in this dataset
print('Number of reviews:', len(data))

Number of reviews: 10000


In [2]:
# Q2
# gather the user id attached to every review, one entry per review
user_ids = []
for entry in data:
    user_ids.append(entry['user_id'])

# a set keeps each distinct id exactly once
unique_user_ids = set(user_ids)

print('Number of unique user ids:', len(unique_user_ids))

Number of unique user ids: 6403


In [27]:
# Q3

from nltk.tokenize import word_tokenize

reviews = [entry['text'] for entry in data]

# tokenize every review exactly once and record its length in tokens
# (word_tokenize emits punctuation marks as tokens too, so they are counted)
review_lengths = [len(word_tokenize(review)) for review in reviews]

# calculate the average number of tokens per review
average_words = sum(review_lengths) / len(reviews)

# find the shortest and longest review lengths
shortest_review_words = min(review_lengths)
longest_review_words = max(review_lengths)

# get the text of every review tied for the shortest length; the lengths
# computed above are reused so the corpus is not tokenized a second time
shortest_review_content = [
    review
    for review, length in zip(reviews, review_lengths)
    if length == shortest_review_words
]

# print the results
print('Average review length:', average_words, 'words')
print('Shortest review length:', shortest_review_words, 'words')
print('Longest review length:', longest_review_words, 'words')
print('Content of the shortest review:', shortest_review_content)

Average review length: 152.062 words
Shortest review length: 1 words
Longest review length: 1137 words
Content of the shortest review: ['Excellent', 'X', 'Go']


In [4]:
# Q4

from nltk.tokenize import sent_tokenize
reviews = [entry['text'] for entry in data]

# number of sentences found in each review, in corpus order
review_sentence_lengths = [len(sentences) for sentences in map(sent_tokenize, reviews)]

# summary statistics over the per-review sentence counts
average_sentence_length = sum(review_sentence_lengths) / len(reviews)
shortest_review_sentences = min(review_sentence_lengths)
longest_review_sentences = max(review_sentence_lengths)

# text of the first review that reaches the maximum sentence count
longest_sentence_review_index = review_sentence_lengths.index(longest_review_sentences)
longest_sentence_review_content = reviews[longest_sentence_review_index]

# text of the first review that reaches the maximum word count
# (review_lengths and longest_review_words must already exist from the Q3 cell)
longest_review_index = review_lengths.index(longest_review_words)
longest_review_content = reviews[longest_review_index]

# report the sentence-based statistics
print('Average review length:', average_sentence_length, 'sentences')
print('Shortest review length:', shortest_review_sentences, 'sentences')
print('Longest review length:', longest_review_sentences, 'sentences')

# is the longest review by sentences also the longest review by words?
same_longest_review = longest_sentence_review_content == longest_review_content
print(
    "The longest review by sentences is the same as the longest review by words."
    if same_longest_review
    else "The longest review by sentences is not the same as the longest review by words."
)

Average review length: 9.2126 sentences
Shortest review length: 1 sentences
Longest review length: 92 sentences
The longest review by sentences is not the same as the longest review by words.


In [5]:
# Q5

# count tokens in review
from collections import Counter

reviews = [entry['text'] for entry in data]

# feed the raw tokens of every review (no lowercasing, no filtering)
# into a single Counter
token_counts = Counter(
    token
    for review in reviews
    for token in word_tokenize(review)
)

# head of the frequency ranking
print('Ten most common tokens and counts:')
pprint(token_counts.most_common(10))
print()

# tail of the frequency ranking
print('Ten least common tokens and counts:')
pprint(token_counts.most_common()[-10:])

Ten most common tokens and counts:
[('.', 76320),
 ('the', 55130),
 (',', 54520),
 ('and', 42581),
 ('I', 40331),
 ('a', 35110),
 ('to', 29935),
 ('was', 20753),
 ('of', 20729),
 ('is', 17623)]

Ten least common tokens and counts:
[('filled/stuffed', 1),
 ('waznt', 1),
 ('pretensiousness', 1),
 ('seemlessly', 1),
 ('prunes', 1),
 ('obtainable', 1),
 ('5minutes', 1),
 ('WORK', 1),
 ('Spinatos', 1),
 ('altering', 1)]


Counting word tokens without further processing can lead to inflated token counts: different forms of the same word (e.g. 'WORK' vs. 'work') are counted separately, which overestimates vocabulary diversity. It can also make matching harder, since text retrieval and matching tasks may be less effective when words are not normalized. In addition, typos and misspellings (e.g. 'waznt', 'seemlessly') introduce noise into the analysis.

In [11]:
# Q6
# stem tokens in review
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
# reference: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
reviews = [entry['text'] for entry in data]

# frequency table of stemmed tokens across the whole corpus
stem_token_counts = Counter()

for review in reviews:
    # lowercase the review before tokenizing it
    for token in word_tokenize(review.lower()):
        # drop stop words and any token that is not purely alphanumeric
        # (this is what removes punctuation tokens)
        if token in stop_words or not token.isalnum():
            continue
        stem_token_counts[ps.stem(token)] += 1

# head of the frequency ranking
print('Ten most common tokens and counts:')
pprint(stem_token_counts.most_common(10))
print()

# tail of the frequency ranking
print('Ten least common tokens and counts:')
pprint(stem_token_counts.most_common()[-10:])

Ten most common tokens and counts:
[('place', 7474),
 ('good', 6921),
 ('food', 6285),
 ('like', 5626),
 ('great', 5115),
 ('go', 4882),
 ('get', 4656),
 ('time', 4603),
 ('one', 4290),
 ('order', 3659)]

Ten least common tokens and counts:
[('602', 1),
 ('en', 1),
 ('augment', 1),
 ('crisscross', 1),
 ('dugout', 1),
 ('unintend', 1),
 ('waznt', 1),
 ('pretensi', 1),
 ('seemlessli', 1),
 ('5minut', 1)]


Compared to Q5, the preprocessing steps in Q6 cleaned up the text data and removed noise, leading to more meaningful and representative results for both common and rare tokens. For example, the most common tokens in Q6 include words like "place," "good," "food," "like," "great," and "go." These are likely to be more semantically meaningful words in the context of restaurant reviews.

In [12]:
# Q7
from nltk.util import ngrams

# one frequency table per n-gram order
unigram_counts = Counter()
bigram_counts = Counter()
trigram_counts = Counter()

for entry in data:
    # lowercase, tokenize, and keep only purely alphanumeric tokens
    words = [tok for tok in word_tokenize(entry['text'].lower()) if tok.isalnum()]

    # n-grams are built per review, so they never span two reviews
    unigram_counts.update(words)
    bigram_counts.update(ngrams(words, 2))
    trigram_counts.update(ngrams(words, 3))

# report the single most frequent n-gram of each order
print('Most prevalent unigrams:')
pprint(unigram_counts.most_common(1))
print()
print('Most prevalent bigrams:')
pprint(bigram_counts.most_common(1))
print()
print('Most prevalent trigrams:')
pprint(trigram_counts.most_common(1))

Most prevalent unigrams:
[('the', 66739)]

Most prevalent bigrams:
[(('of', 'the'), 4488)]

Most prevalent trigrams:
[(('this', 'place', 'is'), 708)]


'the' is a common stop word, and 'of the' consists entirely of stop words. Although 'this place is' cannot be considered a stop word, it is a common phrase in restaurant reviews and may not carry much specific meaning.

In [13]:
# Q7 Bonus
stop_words = set(stopwords.words('english'))

# fresh frequency tables; these replace the counters built in the Q7 cell
unigram_counts, bigram_counts, trigram_counts = Counter(), Counter(), Counter()

for entry in data:
    # keep lowercase alphanumeric tokens that are not stop words
    kept_tokens = []
    for token in word_tokenize(entry['text'].lower()):
        if token.isalnum() and token not in stop_words:
            kept_tokens.append(token)

    # n-grams are formed over the filtered sequence, so words that were
    # separated only by removed tokens become adjacent
    unigram_counts.update(kept_tokens)
    bigram_counts.update(ngrams(kept_tokens, 2))
    trigram_counts.update(ngrams(kept_tokens, 3))

# report the single most frequent n-gram of each order
print('Most prevalent unigrams (after stop word removal):')
pprint(unigram_counts.most_common(1))
print()
print('Most prevalent bigrams (after stop word removal):')
pprint(bigram_counts.most_common(1))
print()
print('Most prevalent trigrams (after stop word removal):')
pprint(trigram_counts.most_common(1))

Most prevalent unigrams (after stop word removal):
[('good', 6762)]

Most prevalent bigrams (after stop word removal):
[(('happy', 'hour'), 600)]

Most prevalent trigrams (after stop word removal):
[(('sweet', 'potato', 'fries'), 99)]


Yes, the most prevalent n-grams are generally more meaningful after removing stop words. These words are more informative and could help identify positive or negative reviews.

In [29]:
# Q8
# reference: https://saturncloud.io/blog/how-to-calculate-tfidf-using-sklearn-for-ngrams-in-python/
# reference: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# reference: https://stackoverflow.com/questions/47898326/how-vectorizer-fit-transform-work-in-sklearn
# reference: https://www.geeksforgeeks.org/numpy-argsort-in-python/

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# read the query text; the explicit encoding keeps decoding independent of
# the platform's default locale encoding
with open('data/query.txt', 'r', encoding='utf-8') as f:
    query = f.read()

reviews = [entry["text"] for entry in data]

# learn the TF-IDF vocabulary from the reviews: unigrams through trigrams,
# with min_df=10 as the document-frequency cut-off
vectorizer = TfidfVectorizer(ngram_range=(1,3),min_df=10)
reviews_tfidf = vectorizer.fit_transform(reviews)
# transform (not fit) the query so it shares the reviews' feature space
query_tfidf = vectorizer.transform([query])

# single query row against all reviews -> take row 0 as the score vector
cosine_sim = cosine_similarity(query_tfidf, reviews_tfidf)
similarity_scores = cosine_sim[0]
# argsort is ascending: the last three indices are the best matches,
# reversed so the most similar review comes first
top_3_indices = np.argsort(similarity_scores)[-3:][::-1]

print("Top-3 most similar reviews:")
for idx in top_3_indices:
    print(data[idx]['text'])

Top-3 most similar reviews:
I have been going here for years. I love the "Hong Kong" style pan fried crispy noodles!
Shaved noodles, shaved noodles, shaved noodles!  Am I too butt lazy if I only go for shaved noodles?  The dish ordered with the handpulled meant I had to hold chopsticks high into the air trying to separate a portion of noodles out onto our plates.  Not bad but wasn't special... but the shaved noodles, loved the chewy texture.  I unashamedly used my baby godson as an excuse to get a mini noodle pulling show at the window.

I'm going to work my way thru the fresh fruit juice offerings...  I used to drink that daily while working as an expat in HK.  Speaking of which, the bathrooms are sooooo HK grand (Amy L, you and I think alike!)...

Overall the stir fried dishes were a bit greasy and the flavors decent, but the draw here is the shaved noodles (can I plug this anymore?!) and the fresh fruit juice.
I've been going here for the past 9+ years and it's pretty much the basic

Strengths:
Easy to implement and understand.
Captures n-gram significance, considering both individual words and phrases.
Feature extraction through TF-IDF allows for informative comparisons.

Weaknesses:
Overemphasis on frequency.
Does not consider context or relationships between n-grams.