In [1]:
!pip install -q kaggle
!pip install -q nltk
!pip install -q scikit-learn

In [13]:
import pandas as pd
import numpy as np
import re
import nltk

# Fetch the NLTK resources needed for tokenisation, stop-word filtering and
# lemmatisation; the order matches the original sequence of download calls.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [24]:
csv_path = 'Reviews.csv'

# Read the raw reviews with the Python parser; rows that cannot be parsed are
# skipped (on_bad_lines='skip') instead of aborting the load.
df = pd.read_csv(
    csv_path,
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip',
)


In [16]:
text_col = 'Text'

# Keep only non-null review texts, renumber them from 0, and work on an
# independent copy of the first 10000 entries.
reviews_series = (
    df[text_col]
    .dropna()
    .reset_index(drop=True)
    .iloc[:10000]
    .copy()
)

# Single-column frame holding the untouched review text.
reviews_df = pd.DataFrame({'original': reviews_series})

In [18]:
# Module-level NLP helpers used by `preprocess_text`: a lemmatizer instance
# and the English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}


In [25]:
def preprocess_text(text):
    """Normalise raw text into a space-joined string of lemmatised tokens.

    The text is lower-cased, every character other than a-z, 0-9 and
    whitespace is replaced by a space, whitespace runs are collapsed, and the
    result is tokenised. Tokens that are not purely alphabetic or that are
    English stop words are dropped; the survivors are lemmatised.

    Parameters
    ----------
    text : object
        Raw input. Non-string values are converted with ``str``. ``None`` and
        float NaN are treated as missing text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing or contains nothing usable.
    """
    # Without this guard, str(None) / str(nan) would turn a missing value into
    # the real tokens "none" / "nan" and make it match unrelated reviews.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    normalized = re.sub(r'[^a-z0-9\s]', ' ', str(text).lower())
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    if not normalized:
        # Nothing left to tokenise (empty or punctuation-only input).
        return ''

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(normalized)
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)

In [20]:
from tqdm import tqdm

# Register the `progress_*` methods on pandas objects so the clean-up pass
# over the reviews reports its progress.
tqdm.pandas()

# Store the normalised form of every review next to its original text.
reviews_df.loc[:, 'cleaned'] = reviews_df['original'].progress_apply(preprocess_text)

100%|██████████| 10000/10000 [00:05<00:00, 1914.26it/s]


In [26]:
# TF-IDF over unigrams and bigrams of the cleaned reviews, capped at 20000
# features; X has one row per review.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=20000,
)
X = vectorizer.fit_transform(reviews_df['cleaned'])
print(X.shape)

(10000, 20000)


In [22]:
def retrieve_reviews(query, top_k=5):
    """Return the reviews most similar to ``query`` by TF-IDF cosine similarity.

    The query is cleaned with ``preprocess_text``, projected with the fitted
    ``vectorizer`` and compared against every row of ``X``.

    Parameters
    ----------
    query : str
        Free-text search query.
    top_k : int or None, default 5
        Maximum number of results. Negative values are treated as 0 (no
        results); ``None`` returns every review.

    Returns
    -------
    list of dict
        One dict per hit, best match first, with keys ``'score'`` (float
        cosine similarity), ``'original'`` and ``'cleaned'``. The top-ranked
        rows are returned regardless of score, so entries with a score of 0.0
        can appear when few reviews share vocabulary with the query.
    """
    q_clean = preprocess_text(query)
    q_vec = vectorizer.transform([q_clean])
    sims = cosine_similarity(q_vec, X).flatten()

    # A negative slice bound would count from the end and return almost the
    # whole corpus, so clamp it to zero.
    limit = None if top_k is None else max(top_k, 0)

    # A stable sort keeps equal scores in corpus order, making results
    # reproducible between runs.
    ranked_positions = np.argsort(-sims, kind='stable')[:limit]

    # argsort yields row *positions* in X, so look rows up positionally
    # instead of by index label.
    originals = reviews_df['original']
    cleaned = reviews_df['cleaned']
    return [
        {
            'score': float(sims[pos]),
            'original': originals.iloc[pos],
            'cleaned': cleaned.iloc[pos],
        }
        for pos in ranked_positions
    ]

In [23]:
# Smoke-test the retriever with two sample queries and print, for each hit,
# its score plus truncated previews of the original and cleaned text.
for q in ("great product with fast shipping", "disappointed"):
    print(f"\n=== {q} ===")
    for r in retrieve_reviews(q):
        print("\nScore: {:.4f}".format(r['score']))
        print(f"Original: {r['original'][:300]}")
        print(f"Cleaned: {r['cleaned'][:150]}")


=== great product with fast shipping ===

Score: 0.5160
Original: Use frequently as we like to do Asian dishes at least once a week.  Love this product.  Fast shipping, as usual.  Would buy again.
Cleaned: use frequently like asian dish least week love product fast shipping usual would buy

Score: 0.3938
Original: good products and fast shipping equals a happy me. a little pricey but you can hand pick a few good flavors...a few i cant find anywhere else so def worth the price
Cleaned: good product fast shipping equal happy little pricey hand pick good flavor cant find anywhere else def worth price

Score: 0.3927
Original: My 3 cats can't get enough of these treats. A great product at a great price AND fast shipping. What more can 3 cats ask for?
Cleaned: cat get enough treat great product great price fast shipping cat ask

Score: 0.3662
Original: Great Product, fast shipment,the food product tasted great. none of the fortune cookies where opened.  i would buy again from this seller!
C