In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fetch every NLTK resource the preprocessing step relies on in one place.
# 'punkt_tab' is listed alongside the legacy 'punkt' package because recent
# NLTK releases load the tab-separated model inside word_tokenize; without it
# tokenization raises LookupError on a fresh runtime.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV in this notebook is read from /content/Reviews.csv,
# which is not under the mount point — confirm whether this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the raw reviews table (pandas is already imported in the first cell,
# so it is not re-imported here).
# on_bad_lines='skip' makes the parser drop malformed rows without raising or
# warning, so the resulting row count may be lower than the file's line count.
df = pd.read_csv('/content/Reviews.csv', on_bad_lines='skip')

In [None]:
# Upper bound on how many reviews are indexed; keeps tokenization and the
# TF-IDF matrix small enough for interactive use.
MAX_REVIEWS = 10000

# Review text only: drop missing entries, then keep the first MAX_REVIEWS rows
# by position. The original index labels are kept (not reset); downstream
# cells select rows positionally with .iloc.
reviews = df['Text'].dropna()[:MAX_REVIEWS]

In [None]:
# English stop-word list from the NLTK corpus, stored as a set so membership
# tests are constant-time.
stop_words = set(stopwords.words('english'))

In [None]:
def preprocess_text(text):
    """Normalize a review or query string for TF-IDF vectorization.

    Steps, in order: lowercase the text, delete every character that is not
    a-z or whitespace, tokenize with NLTK's ``word_tokenize``, drop English
    stop words, and re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Space-separated cleaned tokens (may be empty if nothing survives).

    Notes
    -----
    Non-letter characters are deleted rather than replaced by a space, so
    words separated only by punctuation are fused (``"fast.shipping"`` becomes
    ``"fastshipping"``).
    """
    # Register the download directory at most once. Appending unconditionally
    # on every call grew the global nltk.data.path by one duplicate entry per
    # processed review.
    if '/root/nltk_data' not in nltk.data.path:
        nltk.data.path.append('/root/nltk_data')

    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    # Reuse the notebook-level `stop_words` set instead of re-reading the
    # stop-word corpus and rebuilding the set for every single document.
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [None]:
# Download the remaining NLTK resources (nltk is already imported in the
# first cell). Packages that are already present are reported as up-to-date
# and skipped by the downloader, so re-running this cell is cheap.
for resource in (
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'omw-1.4',
    'punkt_tab',
):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Clean every review with preprocess_text; the result keeps the same index and
# order as `reviews`, so position i refers to the same review in both Series.
cleaned_reviews = reviews.apply(preprocess_text)

In [None]:
# Learn the TF-IDF vocabulary from the cleaned reviews and encode them in a
# single pass. `vectorizer` is reused later to project queries into the same
# feature space; `tfidf_matrix` holds one row per cleaned review.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(cleaned_reviews)

In [None]:
def retrieve_similar_reviews(query, top_k=5):
    """Return the ``top_k`` reviews most similar to a free-text query.

    The query is cleaned with ``preprocess_text``, projected with the fitted
    ``vectorizer``, and scored against every row of ``tfidf_matrix`` by cosine
    similarity.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int, default 5
        Number of best-scoring reviews to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Original Review"``, ``"Cleaned Review"`` and
        ``"Similarity Score"``, ordered from highest to lowest score.
    """
    query_matrix = vectorizer.transform([preprocess_text(query)])
    scores = cosine_similarity(query_matrix, tfidf_matrix).flatten()

    # argsort is ascending, so reverse it to rank best-first before cutting.
    ranking = scores.argsort()[::-1]
    best = ranking[:top_k]

    columns = {
        "Original Review": reviews.iloc[best].values,
        "Cleaned Review": cleaned_reviews.iloc[best].values,
        "Similarity Score": scores[best],
    }
    return pd.DataFrame(columns)

In [None]:
# Two sample queries: a multi-word one and a single-word one.
query1 = "great product with fast shipping"
query2 = "disappointed"

# Show the ranked matches for each query in turn.
for _query_number, _query_text in enumerate((query1, query2), start=1):
    print(f"Top reviews for Query {_query_number}:")
    display(retrieve_similar_reviews(_query_text))

Top reviews for Query 1:


Unnamed: 0,Original Review,Cleaned Review,Similarity Score
0,Enjoyed the product and they also provided ver...,enjoyed product also provided fast shipping im...,0.502383
1,The tea is good and fresh. We enjoy it. The sh...,tea good fresh enjoy shipping fast cost reason...,0.443917
2,My daughter lives in Hawaii and sent me some g...,daughter lives hawaii sent great coffee keurig...,0.405738
3,The energy drink is a great product. The shipp...,energy drink great product shipping price craz...,0.403072
4,"Fast shipping, items were packaged nicely and ...",fast shipping items packaged nicely described ...,0.390348


Top reviews for Query 2:


Unnamed: 0,Original Review,Cleaned Review,Similarity Score
0,I am a bit disappointed. The flavor was not w...,bit disappointed flavor wanted expected,0.485029
1,The product is very good. Way too expensive an...,product good way expensive almost box get panc...,0.356803
2,Disappointed. The big boxes had a very differ...,disappointed big boxes different flavor smalle...,0.320826
3,Just plain nasty!!! This item tasted like card...,plain nasty item tasted like cardboard watered...,0.311353
4,"this stuff really works, i love it and cant ge...",stuff really works love cant get enough tastes...,0.308053


In [None]:
# Ad-hoc retrieval for a single query, step by step.
# (numpy and cosine_similarity are already imported in the first cell.)
query = "great camera quality and battery life"

# Clean the query with the same pipeline that produced the corpus so its
# tokens line up with the fitted vocabulary (punctuation and stop words are
# otherwise handled differently by the vectorizer's own tokenizer).
query_vec = vectorizer.transform([preprocess_text(query)])
cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

# Ascending argsort reversed gives best-first; keep the five best positions.
top_indices = cosine_sim.argsort()[::-1][:5]
most_relevant_reviews = reviews.iloc[top_indices]
print(most_relevant_reviews)

3681    It is great! I like it alot. Great price too. ...
239     Fresh,a great way to get a little chocolate in...
5971    You can taste the quality and care that goes i...
9783    This is the best candy cane cocoa, and it has ...
5851    The toasted flavor is so much better than regu...
Name: Text, dtype: object


In [None]:
def observe_query_results(query, top_k=5):
    """Print a banner for ``query`` followed by each retrieved review.

    Parameters
    ----------
    query : str
        Search query forwarded to ``search_reviews``.
    top_k : int, default 5
        Number of results requested (passed as ``k``).

    Returns
    -------
    None
        Output is written to stdout only.
    """
    banner = "=" * 60
    print(banner)
    print(f"🔍 Query: {query}")
    print(banner)
    # NOTE(review): `search_reviews` is not defined in the part of the notebook
    # visible here; it is expected to return a DataFrame with 'Text' and
    # 'Cleaned_Text' columns. `retrieve_similar_reviews` uses different column
    # names, so it is not a drop-in replacement — confirm the intended helper.
    results = search_reviews(query, k=top_k)

    # Number reviews by their position in the result set. iterrows() yields
    # the index *label*, which only equals the rank when the frame happens to
    # carry a fresh 0..k-1 index.
    for rank, (_, row) in enumerate(results.iterrows(), start=1):
        print(f"\n📄 Review #{rank}")
        print(f"➡️  Original: {row['Text']}\n")
        print(f"🧹 Cleaned: {row['Cleaned_Text']}\n")
        print("-" * 60)